From f5bb48a5f661466612c3c3207f416b0de5d9f611 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Wed, 25 Mar 2020 09:12:43 +0200
Subject: [PATCH] Implements #331: adaptive SGM penalties

---
 .../include/ftl/operators/cuda/disparity.hpp  |  10 ++
 .../include/ftl/operators/disparity.hpp       |  16 ++-
 components/operators/src/depth.cpp            |   8 +-
 .../operators/src/disparity/disp2depth.cu     | 104 ++++++++++++++++++
 .../operators/src/disparity/fixstars_sgm.cpp  |  60 +++++++++-
 lib/libsgm/include/libsgm.h                   |   5 +-
 lib/libsgm/src/check_consistency.cu           |  12 +-
 lib/libsgm/src/horizontal_path_aggregation.cu |  38 ++++---
 .../src/horizontal_path_aggregation.hpp       |   6 +-
 lib/libsgm/src/internal.h                     |   8 +-
 lib/libsgm/src/median_filter.cu               |  12 +-
 lib/libsgm/src/oblique_path_aggregation.cu    |  65 +++++++----
 lib/libsgm/src/oblique_path_aggregation.hpp   |  12 +-
 lib/libsgm/src/path_aggregation.cu            |  19 ++--
 lib/libsgm/src/path_aggregation.hpp           |   3 +-
 lib/libsgm/src/sgm.cu                         |  15 ++-
 lib/libsgm/src/sgm.hpp                        |   7 +-
 lib/libsgm/src/stereo_sgm.cpp                 |  20 ++--
 lib/libsgm/src/vertical_path_aggregation.cu   |  37 ++++---
 lib/libsgm/src/vertical_path_aggregation.hpp  |   6 +-
 20 files changed, 343 insertions(+), 120 deletions(-)
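Overview of the change: libSGM's scalar P2 smoothness penalty becomes a per-pixel uint8 map (plus a pitch), and a cudaStream_t is threaded through execute(), path aggregation, the median filter and the consistency check so the whole pipeline runs on the caller's stream. The FixstarsSGM operator builds the map from Canny edges on the left greyscale image. A minimal standalone sketch of the map construction, assuming only OpenCV's CUDA modules (makeP2Map is an illustrative helper, not part of this patch; the real logic is FixstarsSGM::computeP2 below):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>

    // Non-edge pixels receive the alternative penalty P3, Canny edge pixels the base P2.
    cv::cuda::GpuMat makeP2Map(const cv::cuda::GpuMat &grey,   // CV_8UC1 left image
                               int P2, int P3,
                               double cannyLow, double cannyHigh,
                               cv::cuda::Stream &stream) {
        cv::cuda::GpuMat edges, p2map(grey.size(), CV_8UC1);
        p2map.setTo(P3, stream);                                // default penalty everywhere
        auto canny = cv::cuda::createCannyEdgeDetector(cannyLow, cannyHigh);
        canny->detect(grey, edges, stream);                     // 0/255 edge mask
        p2map.setTo(P2, edges, stream);                         // base P2 only on edges
        return p2map;                                           // one byte per pixel
    }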

diff --git a/components/operators/include/ftl/operators/cuda/disparity.hpp b/components/operators/include/ftl/operators/cuda/disparity.hpp
index de4648f6f..75034d9e1 100644
--- a/components/operators/include/ftl/operators/cuda/disparity.hpp
+++ b/components/operators/include/ftl/operators/cuda/disparity.hpp
@@ -19,6 +19,16 @@ void mask_occlusions(const cv::cuda::GpuMat &depth,
 		cv::cuda::GpuMat &mask,
 		const ftl::rgbd::Camera &c, cudaStream_t stream);
 
+void check_reprojection(const cv::cuda::GpuMat &disp,
+			const ftl::cuda::TextureObject<uchar4> &left, const ftl::cuda::TextureObject<uchar4> &right,
+			cudaStream_t stream);
+
+void show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, const cv::cuda::GpuMat &right,
+			float scale, cudaStream_t stream);
+
+void show_disp_density(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left,
+			float scale, cudaStream_t stream);
+
 
 void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
 					cv::cuda::GpuMat &history, cv::cuda::GpuMat &support, int n_max, float threshold, bool fill,
diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp
index 2f9a807cd..eea830cc4 100644
--- a/components/operators/include/ftl/operators/disparity.hpp
+++ b/components/operators/include/ftl/operators/disparity.hpp
@@ -11,6 +11,7 @@
 
 #ifdef HAVE_LIBSGM
 #include <libsgm.h>
+#include <opencv2/cudaimgproc.hpp>
 #endif
 
 namespace ftl {
@@ -19,7 +20,7 @@ namespace operators {
 #ifdef HAVE_LIBSGM
 /*
  * FixstarsSGM https://github.com/fixstars/libSGM
- * 
+ *
  * Requires modified version https://gitlab.utu.fi/joseha/libsgm
  */
 class FixstarsSGM : public ftl::operators::Operator {
@@ -31,21 +32,28 @@ class FixstarsSGM : public ftl::operators::Operator {
 	bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override;
 
 	bool isMemoryHeavy() const override { return true; }
-	
+
 	private:
 	bool init();
 	bool updateParameters();
-	
+	bool updateP2Parameters();
+	void computeP2(cudaStream_t &stream);
+
 	sgm::StereoSGM *ssgm_;
 	cv::Size size_;
 	cv::cuda::GpuMat lbw_;
 	cv::cuda::GpuMat rbw_;
 	cv::cuda::GpuMat disp_int_;
 
+	cv::cuda::GpuMat P2_map_;
+	cv::cuda::GpuMat edges_;
+	cv::Ptr<cv::cuda::CannyEdgeDetector> canny_;
+
 	int P1_;
 	int P2_;
 	int max_disp_;
 	float uniqueness_;
+	bool use_P2_map_;
 };
 #endif
 
@@ -101,7 +109,7 @@ class DepthChannel : public ftl::operators::Operator {
 	std::vector<cv::cuda::GpuMat> rbuf_;
 	cv::Size depth_size_;
 
-	void _createPipeline();
+	void _createPipeline(size_t);
 };
 
 /*
diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp
index 59b50a1fe..5ffb6bbaf 100644
--- a/components/operators/src/depth.cpp
+++ b/components/operators/src/depth.cpp
@@ -131,7 +131,7 @@ DepthChannel::~DepthChannel() {
 
 }
 
-void DepthChannel::_createPipeline() {
+void DepthChannel::_createPipeline(size_t size) {
 	if (pipe_ != nullptr) return;
 
 	pipe_ = ftl::config::create<ftl::operators::Graph>(config(), "depth");
@@ -145,7 +145,7 @@ void DepthChannel::_createPipeline() {
 	#endif
 	#ifdef HAVE_OPTFLOW
 	pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow);
-	pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
+	if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
 	#endif
 	pipe_->append<ftl::operators::DisparityBilateralFilter>("bilateral_filter");
 	//pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
@@ -167,7 +167,7 @@ bool DepthChannel::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda
 		if (!in.hasFrame(i)) continue;
 		auto &f = in.frames[i];
 		if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) {
-			_createPipeline();
+			_createPipeline(in.frames.size());
 
 			cv::cuda::GpuMat& left = f.get<cv::cuda::GpuMat>(Channel::Left);
 			cv::cuda::GpuMat& right = f.get<cv::cuda::GpuMat>(Channel::Right);
@@ -202,7 +202,7 @@ bool DepthChannel::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream
 
 	auto &f = in;
 	if (!f.hasChannel(Channel::Depth) && f.hasChannel(Channel::Right)) {
-		_createPipeline();
+		_createPipeline(1);
 
 		cv::cuda::GpuMat& left = f.get<cv::cuda::GpuMat>(Channel::Left);
 		cv::cuda::GpuMat& right = f.get<cv::cuda::GpuMat>(Channel::Right);
diff --git a/components/operators/src/disparity/disp2depth.cu b/components/operators/src/disparity/disp2depth.cu
index c0011adf4..bed3bc3e8 100644
--- a/components/operators/src/disparity/disp2depth.cu
+++ b/components/operators/src/disparity/disp2depth.cu
@@ -137,3 +137,107 @@ void ftl::cuda::mask_occlusions(const cv::cuda::GpuMat &depth, const cv::cuda::G
 		depth, depthR, mask, c);
 	cudaSafeCall( cudaGetLastError() );
 }
+
+
+// =============================================================================
+
+__global__ void check_reprojection_kernel(cv::cuda::PtrStepSz<short> disp,
+	ftl::cuda::TextureObject<uchar4> left,
+	ftl::cuda::TextureObject<uchar4> right)
+{
+	for (STRIDE_Y(v,disp.rows)) {
+	for (STRIDE_X(u,disp.cols)) {
+		const float d = float(disp(v,u)) / 16.0f;
+		const float4 l = left.tex2D(float(u)+0.5f, float(v)+0.5f);
+
+		if (d > 0) {
+			const float4 r = right.tex2D(float(u-d)+0.5f, float(v)+0.5f);
+			const float diff = max(fabsf(l.x-r.x),max(fabsf(l.y-r.y), fabsf(l.z-r.z)));
+			if (diff > 10.0f) disp(v,u) = 0;
+		}
+	}
+	}
+}
+
+void ftl::cuda::check_reprojection(const cv::cuda::GpuMat &disp, const ftl::cuda::TextureObject<uchar4> &left, const ftl::cuda::TextureObject<uchar4> &right, cudaStream_t stream) {
+	dim3 grid(1,1,1);
+	dim3 threads(128, 4, 1);
+	grid.x = cv::cuda::device::divUp(disp.cols, 128);
+	grid.y = cv::cuda::device::divUp(disp.rows, 4);
+
+	check_reprojection_kernel<<<grid, threads, 0, stream>>>(disp, left, right);
+
+	cudaSafeCall( cudaGetLastError() );
+}
+
+
+// =============================================================================
+
+
+__global__ void show_rpe_kernel(cv::cuda::PtrStepSz<short> disp,
+	cv::cuda::PtrStepSz<uchar4> left,
+	cv::cuda::PtrStepSz<uchar4> right,
+	float scale)
+{
+	for (STRIDE_Y(v,left.rows)) {
+	for (STRIDE_X(u,left.cols)) {
+		short d = disp(v,u) / 16;
+
+		if (d > 0 && u-d >= 0) {
+			uchar4 l = left(v,u);
+			uchar4 r = right(v,u-d);
+			float err = max(abs(int(l.x)-int(r.x)),max(abs(int(l.y)-int(r.y)), abs(int(l.z)-int(r.z))));
+
+			left(v,u) = make_uchar4(0,0,min(255.0f, (err/scale) * 255.0f),255);
+		}
+	}
+	}
+}
+
+void ftl::cuda::show_rpe(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left, const cv::cuda::GpuMat &right,
+			float scale, cudaStream_t stream) {
+	dim3 grid(1,1,1);
+	dim3 threads(128, 4, 1);
+	grid.x = cv::cuda::device::divUp(disp.cols, 128);
+	grid.y = cv::cuda::device::divUp(disp.rows, 4);
+	show_rpe_kernel<<<grid, threads, 0, stream>>>(
+		disp, left, right, scale);
+	cudaSafeCall( cudaGetLastError() );
+}
+
+// =============================================================================
+
+
+template <int MAX_DISP>
+__global__ void show_disp_density_kernel(cv::cuda::PtrStepSz<short> disp,
+	cv::cuda::PtrStepSz<uchar4> left,
+	float scale)
+{
+	for (STRIDE_Y(v,disp.rows)) {
+	for (STRIDE_X(u,disp.cols)) {
+		short d = disp(v,u) / 16;
+		int count = 0;
+
+		for (int i=1; i<MAX_DISP; ++i) {
+			if (u+i-d < disp.cols && u+i-d >= 0) {
+				short dd = disp(v,u+i-d) / 16;
+				if (d > 0 && dd == i) ++count;
+			}
+		}
+
+		count = max(0,count-1);
+		left(v,u) = make_uchar4(0,0,min(255.0f, (float(count)/4.0f) * 255.0f),255);
+	}
+	}
+}
+
+void ftl::cuda::show_disp_density(const cv::cuda::GpuMat &disp, cv::cuda::GpuMat &left,
+			float scale, cudaStream_t stream) {
+	dim3 grid(1,1,1);
+	dim3 threads(128, 4, 1);
+	grid.x = cv::cuda::device::divUp(disp.cols, 128);
+	grid.y = cv::cuda::device::divUp(disp.rows, 4);
+	show_disp_density_kernel<256><<<grid, threads, 0, stream>>>(
+		disp, left, scale);
+	cudaSafeCall( cudaGetLastError() );
+}
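The three kernels above iterate with the codebase's STRIDE_X/STRIDE_Y macros; assuming these expand to the usual CUDA grid-stride loop, a self-contained equivalent of the iteration pattern looks like this (the kernel name and the small clean-up it performs are illustrative only):

    // Grid-stride loop over a pitched short-disparity buffer.
    __global__ void zero_small_disp(short *disp, int pitch, int width, int height, short min_d) {
        for (int v = blockIdx.y * blockDim.y + threadIdx.y; v < height; v += gridDim.y * blockDim.y) {
            for (int u = blockIdx.x * blockDim.x + threadIdx.x; u < width; u += gridDim.x * blockDim.x) {
                if (disp[v * pitch + u] < min_d) disp[v * pitch + u] = 0;   // pitch in elements
            }
        }
    }

    // Launched with the same geometry the wrappers above use:
    //   dim3 threads(128, 4);
    //   dim3 grid((width + 127) / 128, (height + 3) / 4);
    //   zero_small_disp<<<grid, threads, 0, stream>>>(ptr, pitch, width, height, 16);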
diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index d5bfe2d6a..b55a7af6d 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -1,6 +1,7 @@
 #include <loguru.hpp>
 
 #include "ftl/operators/disparity.hpp"
+#include <ftl/operators/cuda/disparity.hpp>
 
 #include <opencv2/cudaimgproc.hpp>
 #include <opencv2/cudaarithm.hpp>
@@ -14,9 +15,22 @@ using ftl::rgbd::Frame;
 using ftl::rgbd::Source;
 using ftl::operators::FixstarsSGM;
 
+void FixstarsSGM::computeP2(cudaStream_t &stream) {
+	const int P3 = config()->value("P3", P2_);
+	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
+	P2_map_.setTo(P3, cvstream);
+
+	if (config()->value("use_P2_map", false)) {
+		edges_.create(size_, CV_8UC1);
+		auto ptr = canny_;
+		// mark intensity edges; edge pixels get the base P2 penalty, all others keep P3
+		canny_->detect(lbw_, edges_, cvstream);
+	}
+}
+
 FixstarsSGM::FixstarsSGM(ftl::Configurable* cfg) :
 		ftl::operators::Operator(cfg) {
-	
+
 	ssgm_ = nullptr;
 	size_ = Size(0, 0);
 
@@ -77,6 +91,16 @@ FixstarsSGM::FixstarsSGM(ftl::Configurable* cfg) :
 			updateParameters();
 		}
 	});
+
+	updateP2Parameters();
+
+	cfg->on("canny_low", [this, cfg](const ftl::config::Event&) {
+		updateP2Parameters();
+	});
+
+	cfg->on("canny_high", [this, cfg](const ftl::config::Event&) {
+		updateP2Parameters();
+	});
 }
 
 FixstarsSGM::~FixstarsSGM() {
@@ -100,6 +124,7 @@ bool FixstarsSGM::init() {
 		sgm::StereoSGM::Parameters(P1_, P2_, uniqueness_, true)
 	);
 
+	P2_map_.create(size_, CV_8UC1);
 	return true;
 }
 
@@ -109,13 +134,20 @@ bool FixstarsSGM::updateParameters() {
 		sgm::StereoSGM::Parameters(P1_, P2_, uniqueness_, true));
 }
 
+bool FixstarsSGM::updateP2Parameters() {
+	canny_ = cv::cuda::createCannyEdgeDetector(
+		config()->value("canny_low", 30.0),
+		config()->value("canny_high", 120.0));
+	return true;
+}
+
 bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 	if (!in.hasChannel(Channel::Left) || !in.hasChannel(Channel::Right)) {
 		LOG(ERROR) << "Fixstars is missing Left or Right channel";
 		return false;
 	}
 
-	const auto &l = in.get<GpuMat>(Channel::Left);
+	auto &l = in.get<GpuMat>(Channel::Left);
 	const auto &r = in.get<GpuMat>(Channel::Right);
 
 	if (l.size() != size_) {
@@ -129,14 +161,32 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 	cv::cuda::cvtColor(l, lbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
 	cv::cuda::cvtColor(r, rbw_, cv::COLOR_BGRA2GRAY, 0, cvstream);
 
-	cvstream.waitForCompletion();
-	ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data);
+	//cvstream.waitForCompletion();
+	computeP2(stream);
+	//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
+	ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, stream);
 
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
 	// left_pixels.setTo(0);
 
 	cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
-	
+
+	if (config()->value("check_reprojection", false)) {
+		ftl::cuda::check_reprojection(disp, in.getTexture<uchar4>(Channel::Colour),
+			in.createTexture<uchar4>(Channel::Colour2, true),
+			stream);
+	}
+
+	if (config()->value("show_P2_map", false)) {
+		cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA, 0, cvstream);
+	}
+	if (config()->value("show_rpe", false)) {
+		ftl::cuda::show_rpe(disp, l, r, 100.0f, stream);
+	}
+	if (config()->value("show_disp_density", false)) {
+		ftl::cuda::show_disp_density(disp, l, 100.0f, stream);
+	}
+
 	//disp_int_.convertTo(disp, CV_32F, 1.0f / 16.0f, cvstream);
 	return true;
 }
diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h
index 7e69ed8ae..ae6365675 100644
--- a/lib/libsgm/include/libsgm.h
+++ b/lib/libsgm/include/libsgm.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include <stdint.h>
 #include "libsgm_config.h"
+#include <cuda_runtime.h>
 
 #if defined(LIBSGM_SHARED)
 	#if defined(WIN32) || defined(_WIN32)
@@ -113,13 +114,13 @@ namespace sgm {
 		* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
 		* Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
 		*/
-		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst);
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream);
 
 		/**
 		 * Same as execute(left_pixels, right_pixels, dst) with image size parameters.
 		 * Dimensions must be smaller or equal to dimensions provided in constructor.
 		 */
-		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch);
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream);
 
 		/**
 		 * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
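Both public execute() overloads now take a device pointer to the per-pixel P2 buffer and the stream to enqueue on. A caller-side sketch of the new interface, mirroring the call made from fixstars_sgm.cpp (the helper below and its buffer types are illustrative; the StereoSGM object itself is constructed as before):

    #include <libsgm.h>
    #include <cuda_runtime.h>
    #include <opencv2/core/cuda.hpp>

    void runSGM(sgm::StereoSGM &ssgm,
                const cv::cuda::GpuMat &lbw,    // CV_8UC1 left, as fed to libsgm today
                const cv::cuda::GpuMat &rbw,    // CV_8UC1 right
                cv::cuda::GpuMat &disp16,       // CV_16SC1 output
                const cv::cuda::GpuMat &p2map,  // CV_8UC1, one penalty byte per pixel
                cudaStream_t stream) {
        // The P2 buffer is indexed with the source pitch inside libsgm, so it must
        // share the input image layout (the commented-out step check in
        // fixstars_sgm.cpp guards exactly this).
        ssgm.execute(lbw.data, rbw.data, disp16.data, p2map.data, stream);
        // execute() no longer synchronizes internally (see sgm.cu below); either keep
        // all later work on the same stream, or synchronize explicitly as done here.
        cudaStreamSynchronize(stream);
    }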
diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu
index f0a58247b..fb7282250 100644
--- a/lib/libsgm/src/check_consistency.cu
+++ b/lib/libsgm/src/check_consistency.cu
@@ -45,29 +45,29 @@ namespace {
 namespace sgm {
 	namespace details {
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) {
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
 			}
 
 			CudaKernelCheck();
 		}
 
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) {
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
 			}
 			
 			CudaKernelCheck();	
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index 2a9f1a87f..b3772b6f6 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -31,12 +31,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;
 template <int DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_horizontal_path_kernel(
 	uint8_t *dest,
-	const feature_type *left,
-	const feature_type *right,
+	const feature_type* __restrict__ left,
+	const feature_type* __restrict__ right,
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2)
+	const uint8_t* __restrict__ p2,
+	int p2_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -69,6 +70,7 @@ __global__ void aggregate_horizontal_path_kernel(
 	left  += y0 * width;
 	right += y0 * width;
 	dest  += y0 * MAX_DISPARITY * width;
+	p2    += y0 * p2_pitch;
 
 	if(y0 >= height){
 		return;
@@ -144,7 +146,7 @@ __global__ void aggregate_horizontal_path_kernel(
 				for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){
 					local_costs[k] = __popc(left_value ^ right_buffer[j][k]);
 				}
-				dp[j].update(local_costs, p1, p2, shfl_mask);
+				dp[j].update(local_costs, p1, p2[x], shfl_mask);
 				store_uint8_vector<DP_BLOCK_SIZE>(
 					&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
 					dp[j].dp);
@@ -163,7 +165,8 @@ void enqueue_aggregate_left2right_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -173,7 +176,7 @@ void enqueue_aggregate_left2right_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -184,7 +187,8 @@ void enqueue_aggregate_right2left_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -194,7 +198,7 @@ void enqueue_aggregate_right2left_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 
@@ -205,7 +209,8 @@ template void enqueue_aggregate_left2right_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<128u>(
@@ -215,7 +220,8 @@ template void enqueue_aggregate_left2right_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_left2right_path<256u>(
@@ -225,7 +231,8 @@ template void enqueue_aggregate_left2right_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<64u>(
@@ -235,7 +242,8 @@ template void enqueue_aggregate_right2left_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<128u>(
@@ -245,7 +253,8 @@ template void enqueue_aggregate_right2left_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_right2left_path<256u>(
@@ -255,7 +264,8 @@ template void enqueue_aggregate_right2left_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
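For reference, the per-disparity update that dp.update() performs in these aggregation kernels is the standard SGM recurrence; the only change in this patch is that the large penalty is read per pixel (p2[x] after offsetting the pointer by the row, or p2[x + y*p2_pitch] in the vertical and oblique kernels) instead of being a single scalar. A scalar sketch of that update, with illustrative names and types:

    #include <stdint.h>
    #include <algorithm>

    // L(p,d) = C(p,d) + min( L(q,d), L(q,d-1)+P1, L(q,d+1)+P1, min_k L(q,k)+P2(p) ) - min_k L(q,k)
    // where q is the previous pixel along the aggregation path.
    uint32_t aggregate_cost(uint32_t matching_cost,                   // C(p,d), census Hamming distance
                            uint32_t prev_same,                       // L(q,d)
                            uint32_t prev_minus, uint32_t prev_plus,  // L(q,d-1), L(q,d+1)
                            uint32_t prev_min,                        // min over k of L(q,k)
                            uint32_t P1, uint8_t P2_at_pixel) {
        const uint32_t best = std::min(std::min(prev_same, prev_min + P2_at_pixel),
                                       std::min(prev_minus + P1, prev_plus + P1));
        return matching_cost + best - prev_min;   // subtract prev_min to keep costs bounded
    }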
diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp
index ce825e5fa..950c52bca 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.hpp
+++ b/lib/libsgm/src/horizontal_path_aggregation.hpp
@@ -30,7 +30,8 @@ void enqueue_aggregate_left2right_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -41,7 +42,8 @@ void enqueue_aggregate_right2left_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h
index 577101728..7d836f5d8 100644
--- a/lib/libsgm/src/internal.h
+++ b/lib/libsgm/src/internal.h
@@ -33,11 +33,11 @@ limitations under the License.
 namespace sgm {
 	namespace details {
 
-		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch);
-		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch);
+		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
+		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel);
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel);
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
 
 		void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements);
 		void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements);
diff --git a/lib/libsgm/src/median_filter.cu b/lib/libsgm/src/median_filter.cu
index fffe24f04..7e9311bb6 100644
--- a/lib/libsgm/src/median_filter.cu
+++ b/lib/libsgm/src/median_filter.cu
@@ -256,33 +256,33 @@ namespace {
 namespace sgm {
 	namespace details {
 
-		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch) {
+		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream) {
 
 			if (pitch % 4 == 0) {
 				const dim3 block(BLOCK_X, BLOCK_Y);
 				const dim3 grid(divup(width / 4, block.x), divup(height, block.y));
-				median_kernel_3x3_8u_v4<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+				median_kernel_3x3_8u_v4<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
 			}
 			else {
 				const dim3 block(BLOCK_X, BLOCK_Y);
 				const dim3 grid(divup(width, block.x), divup(height, block.y));
-				median_kernel_3x3_8u<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+				median_kernel_3x3_8u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
 			}
 
 			CudaSafeCall(cudaGetLastError());
 		}
 
-		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch) {
+		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream) {
 			
 			if (pitch % 2 == 0) {
 				const dim3 block(BLOCK_X, BLOCK_Y);
 				const dim3 grid(divup(width / 2, block.x), divup(height, block.y));
-				median_kernel_3x3_16u_v2<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+				median_kernel_3x3_16u_v2<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
 			}
 			else {
 				const dim3 block(BLOCK_X, BLOCK_Y);
 				const dim3 grid(divup(width, block.x), divup(height, block.y));
-				median_kernel_3x3_16u<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+				median_kernel_3x3_16u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
 			}
 
 			CudaSafeCall(cudaGetLastError());
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
index 21785c947..65934fb1b 100644
--- a/lib/libsgm/src/oblique_path_aggregation.cu
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -27,12 +27,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
 template <int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_oblique_path_kernel(
 	uint8_t *dest,
-	const feature_type *left,
-	const feature_type *right,
+	const feature_type* __restrict__ left,
+	const feature_type* __restrict__ right,
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2)
+	const uint8_t* __restrict__ p2,
+	int p2_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -104,7 +105,7 @@ __global__ void aggregate_oblique_path_kernel(
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
 				local_costs[j] = __popc(left_value ^ right_values[j]);
 			}
-			dp.update(local_costs, p1, p2, shfl_mask);
+			dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
@@ -122,7 +123,8 @@ void enqueue_aggregate_upleft2downright_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -131,7 +133,7 @@ void enqueue_aggregate_upleft2downright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -142,7 +144,8 @@ void enqueue_aggregate_upright2downleft_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -151,7 +154,7 @@ void enqueue_aggregate_upright2downleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -162,7 +165,8 @@ void enqueue_aggregate_downright2upleft_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -171,7 +175,7 @@ void enqueue_aggregate_downright2upleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -182,7 +186,8 @@ void enqueue_aggregate_downleft2upright_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -191,7 +196,7 @@ void enqueue_aggregate_downleft2upright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 
@@ -202,7 +207,8 @@ template void enqueue_aggregate_upleft2downright_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<128u>(
@@ -212,7 +218,8 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_upleft2downright_path<256u>(
@@ -222,7 +229,8 @@ template void enqueue_aggregate_upleft2downright_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<64u>(
@@ -232,7 +240,8 @@ template void enqueue_aggregate_upright2downleft_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<128u>(
@@ -242,7 +251,8 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_upright2downleft_path<256u>(
@@ -252,7 +262,8 @@ template void enqueue_aggregate_upright2downleft_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<64u>(
@@ -262,7 +273,8 @@ template void enqueue_aggregate_downright2upleft_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<128u>(
@@ -272,7 +284,8 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_downright2upleft_path<256u>(
@@ -282,7 +295,8 @@ template void enqueue_aggregate_downright2upleft_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<64u>(
@@ -292,7 +306,8 @@ template void enqueue_aggregate_downleft2upright_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<128u>(
@@ -302,7 +317,8 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_downleft2upright_path<256u>(
@@ -312,7 +328,8 @@ template void enqueue_aggregate_downleft2upright_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp
index a4f949b1e..36d61513c 100644
--- a/lib/libsgm/src/oblique_path_aggregation.hpp
+++ b/lib/libsgm/src/oblique_path_aggregation.hpp
@@ -30,7 +30,8 @@ void enqueue_aggregate_upleft2downright_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -41,7 +42,8 @@ void enqueue_aggregate_upright2downleft_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -52,7 +54,8 @@ void enqueue_aggregate_downright2upleft_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -63,7 +66,8 @@ void enqueue_aggregate_downleft2upright_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index 117cac507..d0eac331b 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -47,7 +47,8 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
@@ -58,28 +59,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	cudaStreamSynchronize(stream);
 	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 0 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[0]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[0]);
 	path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 1 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[1]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[1]);
 	path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 2 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[2]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[2]);
 	path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 3 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[3]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[3]);
 	path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 4 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[4]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[4]);
 	path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 5 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[5]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[5]);
 	path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 6 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[6]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[6]);
 	path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 7 * buffer_step,
-		left, right, width, height, p1, p2, m_streams[7]);
+		left, right, width, height, p1, p2, p2_pitch, m_streams[7]);
 	for(unsigned int i = 0; i < NUM_PATHS; ++i){
 		cudaEventRecord(m_events[i], m_streams[i]);
 		cudaStreamWaitEvent(stream, m_events[i], 0);
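PathAggregation::enqueue (above) fans the eight directional passes out onto its internal streams and rejoins the caller's stream via events; the stream argument added by this patch is what gets rejoined. A minimal sketch of that fork/join pattern, detached from libsgm's types (the kernel launch is elided; note the existing code still host-synchronizes the incoming stream before forking):

    #include <cuda_runtime.h>

    void fork_join(cudaStream_t caller, cudaStream_t workers[], cudaEvent_t done[], int n) {
        cudaStreamSynchronize(caller);               // as above: block until prior work on the caller's stream finishes
        for (int i = 0; i < n; ++i) {
            // aggregate_one_direction<<<grid, block, 0, workers[i]>>>(...);   // illustrative launch
            cudaEventRecord(done[i], workers[i]);    // mark this direction's completion
            cudaStreamWaitEvent(caller, done[i], 0); // make the caller's stream wait for it
        }
    }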
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
index 3d2b35d8e..221411d26 100644
--- a/lib/libsgm/src/path_aggregation.hpp
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -46,7 +46,8 @@ public:
 		int width,
 		int height,
 		unsigned int p1,
-		unsigned int p2,
+		const uint8_t *p2,
+		int p2_pitch,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
index d276c3fa9..fc62a0d47 100644
--- a/lib/libsgm/src/sgm.cu
+++ b/lib/libsgm/src/sgm.cu
@@ -52,7 +52,7 @@ public:
 		int src_pitch,
 		int dst_pitch,
 		unsigned int penalty1,
-		unsigned int penalty2,
+		const uint8_t *penalty2,
 		float uniqueness,
 		bool subpixel,
 		cudaStream_t stream)
@@ -66,6 +66,7 @@ public:
 			m_census_right.get_output(),
 			width, height,
 			penalty1, penalty2,
+			src_pitch,
 			stream);
 		m_winner_takes_all.enqueue(
 			dest_left, dest_right,
@@ -97,9 +98,10 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 	int src_pitch,
 	int dst_pitch,
 	unsigned int penalty1,
-	unsigned int penalty2,
+	const uint8_t *penalty2,
 	float uniqueness,
-	bool subpixel)
+	bool subpixel,
+	cudaStream_t stream)
 {
 	m_impl->enqueue(
 		dest_left, dest_right,
@@ -108,8 +110,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 		src_pitch, dst_pitch,
 		penalty1, penalty2,
 		uniqueness, subpixel,
-		0);
-	cudaStreamSynchronize(0);
+		stream);
+	//cudaStreamSynchronize(0);
 }
 
 template <typename T, size_t MAX_DISPARITY>
@@ -123,7 +125,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 	int src_pitch,
 	int dst_pitch,
 	unsigned int penalty1,
-	unsigned int penalty2,
+	const uint8_t *penalty2,
 	float uniqueness,
 	bool subpixel,
 	cudaStream_t stream)
@@ -144,5 +146,6 @@ template class SemiGlobalMatching<uint8_t,  128>;
 template class SemiGlobalMatching<uint8_t,  256>;
 template class SemiGlobalMatching<uint16_t,  64>;
 template class SemiGlobalMatching<uint16_t, 128>;
+template class SemiGlobalMatching<uint16_t,  256>;
 
 }
diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp
index afb308a25..effbe2f9e 100644
--- a/lib/libsgm/src/sgm.hpp
+++ b/lib/libsgm/src/sgm.hpp
@@ -48,9 +48,10 @@ public:
 		int src_pitch,
 		int dst_pitch,
 		unsigned int penalty1,
-		unsigned int penalty2,
+		const uint8_t *penalty2,
 		float uniqueness,
-		bool subpixel);
+		bool subpixel,
+		cudaStream_t stream);
 
 	void enqueue(
 		output_type *dest_left,
@@ -62,7 +63,7 @@ public:
 		int src_pitch,
 		int dst_pitch,
 		unsigned int penalty1,
-		unsigned int penalty2,
+		const uint8_t *penalty2,
 		float uniqueness,
 		bool subpixel,
 		cudaStream_t stream);
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
index b0219091f..252f16d9e 100644
--- a/lib/libsgm/src/stereo_sgm.cpp
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -29,7 +29,7 @@ namespace sgm {
 	public:
 		using output_type = sgm::output_type;
 		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, 
-			int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) = 0;
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
 
 		virtual ~SemiGlobalMatchingBase() {}
 	};
@@ -38,9 +38,9 @@ namespace sgm {
 	class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
 	public:
 		void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) override
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) override
 		{
-			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel);
+			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel, stream);
 		}
 	private:
 		SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
@@ -149,7 +149,7 @@ namespace sgm {
 		if (cu_res_) { delete cu_res_; }
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch) {
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream) {
 
 		const void *d_input_left, *d_input_right;
 
@@ -173,11 +173,11 @@ namespace sgm {
 		d_left_disp = dst; // when there is no device-host copy or type conversion, use passed buffer
 		
 		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
-			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, param_.P2, param_.uniqueness, param_.subpixel);
+			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, param_.uniqueness, param_.subpixel, stream);
 
-		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch);
-		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch);
-		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel);
+		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream);
+		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream);
+		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, stream);
 
 		if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
 			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height);
@@ -197,8 +197,8 @@ namespace sgm {
 		}
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) {
-		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_);
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream) {
+		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, stream);
 	}
 
 	bool StereoSGM::updateParameters(const Parameters &params) {
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
index d318f46b4..54ebafc9c 100644
--- a/lib/libsgm/src/vertical_path_aggregation.cu
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -27,12 +27,13 @@ static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
 template <int DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_vertical_path_kernel(
 	uint8_t *dest,
-	const feature_type *left,
-	const feature_type *right,
+	const feature_type* __restrict__ left,
+	const feature_type* __restrict__ right,
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2)
+	const uint8_t* __restrict__ p2,
+	int p2_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -102,7 +103,7 @@ __global__ void aggregate_vertical_path_kernel(
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
 				local_costs[j] = __popc(left_value ^ right_values[j]);
 			}
-			dp.update(local_costs, p1, p2, shfl_mask);
+			dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
@@ -119,7 +120,8 @@ void enqueue_aggregate_up2down_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -128,7 +130,7 @@ void enqueue_aggregate_up2down_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -139,7 +141,8 @@ void enqueue_aggregate_down2up_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -148,7 +151,7 @@ void enqueue_aggregate_down2up_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2);
+		dest, left, right, width, height, p1, p2, p2_pitch);
 }
 
 
@@ -159,7 +162,8 @@ template void enqueue_aggregate_up2down_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<128u>(
@@ -169,7 +173,8 @@ template void enqueue_aggregate_up2down_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_up2down_path<256u>(
@@ -179,7 +184,8 @@ template void enqueue_aggregate_up2down_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<64u>(
@@ -189,7 +195,8 @@ template void enqueue_aggregate_down2up_path<64u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<128u>(
@@ -199,7 +206,8 @@ template void enqueue_aggregate_down2up_path<128u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 	
 template void enqueue_aggregate_down2up_path<256u>(
@@ -209,7 +217,8 @@ template void enqueue_aggregate_down2up_path<256u>(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp
index 0ae04016a..fb7334adc 100644
--- a/lib/libsgm/src/vertical_path_aggregation.hpp
+++ b/lib/libsgm/src/vertical_path_aggregation.hpp
@@ -30,7 +30,8 @@ void enqueue_aggregate_up2down_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -41,7 +42,8 @@ void enqueue_aggregate_down2up_path(
 	int width,
 	int height,
 	unsigned int p1,
-	unsigned int p2,
+	const uint8_t *p2,
+	int p2_pitch,
 	cudaStream_t stream);
 
 }
-- 
GitLab