diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index 176dc8d10bf23ba91ebc3cf1976b7e2e08a96766..5604f134c3918522f990a52e4aa5115e3df99099 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -62,7 +62,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 	uniqueness_ = cfg->value("uniqueness", 0.95f);
 	P1_ = cfg->value("P1", 10);
 	P2_ = cfg->value("P2", 120);
-	max_disp_ = cfg->value("max_disp", 256);
+	max_disp_ = cfg->value("num_disp", 256);
 
 	if (uniqueness_ < 0.0 || uniqueness_ > 1.0) {
 		uniqueness_ = 1.0;
@@ -79,7 +79,7 @@ FixstarsSGM::FixstarsSGM(ftl::operators::Graph *g, ftl::Configurable* cfg) :
 		LOG(WARNING) << "Invalid value for P2, using value of P1 (" << P1_ << ")";
 	}
 
-	if (!(max_disp_ == 256 || max_disp_ == 128)) {
+	if (!(max_disp_ == 256 || max_disp_ == 128 || max_disp_ == 192)) {
 		max_disp_ = 256;
 		LOG(WARNING) << "Invalid value for max_disp, using default value (256)";
 	}
@@ -216,9 +216,9 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		weightsF_.convertTo(weights_, CV_8UC1, 255.0f, cvstream);
 
 		//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
-		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream);
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), config()->value("min_disp", 60), stream);
 	} else {
-		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream);
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, config()->value("min_disp", 60), stream);
 	}
 
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
@@ -228,7 +228,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 		ftl::cuda::merge_disparities(disp_int_, disp, stream);
 	}
 
-	cv::cuda::threshold(disp_int_, disp, 4096.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
+	cv::cuda::threshold(disp_int_, disp, 16383.0f, 0.0f, cv::THRESH_TOZERO_INV, cvstream);
 
 	if (config()->value("check_reprojection", false)) {
 		ftl::cuda::check_reprojection(disp, in.getTexture<uchar4>(Channel::Colour),
diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h
index 279b3e5af2557fc2cbb93e660c96edfca6c1a006..bf2c58ea759181d3294d9cb717fd7e65b2b6d60a 100644
--- a/lib/libsgm/include/libsgm.h
+++ b/lib/libsgm/include/libsgm.h
@@ -114,13 +114,13 @@ namespace sgm {
 	 * The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
 	 * Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
 	 */
-	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
+	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream);
 
 	/**
 	 * Same as execute(left_pixels, right_pixels, dst) with image size parameters.
 	 * Dimensions must be smaller or equal to dimensions provided in constructor.
 	 */
-	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
+	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream);
 
 	/**
 	 * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu
index fb7282250f95a23d91a33ebf2e5bc38262d3f715..dde4f5f290e86db6c73a4ec03e3e44b4eee566d9 100644
--- a/lib/libsgm/src/check_consistency.cu
+++ b/lib/libsgm/src/check_consistency.cu
@@ -19,7 +19,7 @@ limitations under the License.
 namespace {
 
 	template<typename SRC_T, typename DST_T>
-	__global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel) {
+	__global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel, int min_disp) {
 
 		const int j = blockIdx.x * blockDim.x + threadIdx.x;
 		const int i = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,6 +28,7 @@ namespace {
 
 		uint8_t mask = d_mask[i * src_pitch + j];
 		int d = d_leftDisp[i * dst_pitch + j];
+		int dout = d + (min_disp << sgm::StereoSGM::SUBPIXEL_SHIFT);
 		if (subpixel) {
 			d >>= sgm::StereoSGM::SUBPIXEL_SHIFT;
 		}
@@ -36,38 +37,39 @@ namespace {
 			int diff = abs(d_rightDisp[i * dst_pitch + k] - d);
 			if (mask != 0 || diff > 1) {
 				// masked or left-right inconsistent pixel -> invalid
-				d_leftDisp[i * dst_pitch + j] = (256 << (sgm::StereoSGM::SUBPIXEL_SHIFT+1));
+				dout = (1024 << (sgm::StereoSGM::SUBPIXEL_SHIFT));
 			}
 		}
+		d_leftDisp[i * dst_pitch + j] = dout;
 	}
 }
 
 namespace sgm {
 	namespace details {
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			CudaKernelCheck();
 		}
 
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream) {
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream) {
 
 			const dim3 blocks(width / 16, height / 16);
 			const dim3 threads(16, 16);
 			if (depth_bits == 16) {
-				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint16_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			else if (depth_bits == 8) {
-				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+				check_consistency_kernel<uint8_t> << < blocks, threads, 0, stream >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel, min_disp);
 			}
 			CudaKernelCheck();
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index 5eba5372c3d67e9396aaf9569c8e7240c0c98759..8047354b4a5ac07d7a6a2087caa876bd420b5422 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -39,7 +39,8 @@ __global__ void aggregate_horizontal_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -87,7 +88,7 @@ __global__ void aggregate_horizontal_path_kernel(
 	}else{
 		for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
-				const int x = static_cast<int>(width - (j + dp_offset));
+				const int x = static_cast<int>(width - (j + dp_offset)) - min_disp;
 				if(0 <= x && x < static_cast<int>(width)){
 					right_buffer[i][j] = __ldg(&right[i * feature_step + x]);
 				}else{
@@ -120,9 +121,9 @@ __global__ void aggregate_horizontal_path_kernel(
 #else
 				right_buffer[j][0] = __shfl_up(t, 1, SUBGROUP_SIZE);
 #endif
-				if(lane_id == 0){
+				if(lane_id == 0 && x >= min_disp + dp_offset){
 					right_buffer[j][0] =
-						__ldg(&right[j * feature_step + x - dp_offset]);
+						__ldg(&right[j * feature_step + x - min_disp - dp_offset]);
 				}
 			}else{
 				const feature_type t = right_buffer[j][0];
@@ -136,9 +137,9 @@ __global__ void aggregate_horizontal_path_kernel(
 				right_buffer[j][DP_BLOCK_SIZE - 1] = __shfl_down(t, 1, SUBGROUP_SIZE);
 #endif
 				if(lane_id + 1 == SUBGROUP_SIZE){
-					if(x >= dp_offset + DP_BLOCK_SIZE - 1){
+					if(x >= min_disp + dp_offset + DP_BLOCK_SIZE - 1){
 						right_buffer[j][DP_BLOCK_SIZE - 1] =
-							__ldg(&right[j * feature_step + x - (dp_offset + DP_BLOCK_SIZE - 1)]);
+							__ldg(&right[j * feature_step + x - min_disp - (dp_offset + DP_BLOCK_SIZE - 1)]);
 					}else{
 						right_buffer[j][DP_BLOCK_SIZE - 1] = 0;
 					}
@@ -171,6 +172,7 @@ void enqueue_aggregate_left2right_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -180,7 +182,7 @@ void enqueue_aggregate_left2right_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -195,6 +197,7 @@ void enqueue_aggregate_right2left_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -204,7 +207,7 @@ void enqueue_aggregate_right2left_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -219,6 +222,7 @@ template void enqueue_aggregate_left2right_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<128u>(
@@ -232,6 +236,21 @@ template void enqueue_aggregate_left2right_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_left2right_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<256u>(
@@ -245,6 +264,7 @@ template void enqueue_aggregate_left2right_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<64u>(
@@ -258,6 +278,7 @@ template void enqueue_aggregate_right2left_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<128u>(
@@ -271,6 +292,21 @@ template void enqueue_aggregate_right2left_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_right2left_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<256u>(
@@ -284,6 +320,7 @@ template void enqueue_aggregate_right2left_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp
index 1b7a7e088f5d4dac793694ba7f4a0a876de2538b..4626dd68f91909b2aab893fea906df5586acef72 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.hpp
+++ b/lib/libsgm/src/horizontal_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_left2right_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_right2left_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h
index 7d836f5d8d5e60a0cb416ddb211b5abcfad0439e..c9c9ff259288c9b78a5b07595ce213ffa58bdada 100644
--- a/lib/libsgm/src/internal.h
+++ b/lib/libsgm/src/internal.h
@@ -36,8 +36,8 @@ namespace sgm {
 		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
 		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
 
-		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
-		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, cudaStream_t stream);
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream);
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel, int min_disp, cudaStream_t stream);
 
 		void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements);
 		void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements);
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
index 97d9b1493c32e54a7bb1195f9a9261243b801444..3734ebae8922a370ea51165c431bc6396b30f51b 100644
--- a/lib/libsgm/src/oblique_path_aggregation.cu
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -35,7 +35,8 @@ __global__ void aggregate_oblique_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -77,7 +78,7 @@ __global__ void aggregate_oblique_path_kernel(
 	for(unsigned int iter = 0; iter < height; ++iter){
 		const int y = static_cast<int>(Y_DIRECTION > 0 ? iter : height - 1 - iter);
 		const int x = x0 + static_cast<int>(iter) * X_DIRECTION;
-		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION;
+		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION - min_disp;
 		// Load right to smem
 		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
 			const unsigned int i = i0 + threadIdx.x;
@@ -129,6 +130,7 @@ void enqueue_aggregate_upleft2downright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -137,7 +139,7 @@ void enqueue_aggregate_upleft2downright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -152,6 +154,7 @@ void enqueue_aggregate_upright2downleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -160,7 +163,7 @@ void enqueue_aggregate_upright2downleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -175,6 +178,7 @@ void enqueue_aggregate_downright2upleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -183,7 +187,7 @@ void enqueue_aggregate_downright2upleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -198,6 +202,7 @@ void enqueue_aggregate_downleft2upright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -206,7 +211,7 @@ void enqueue_aggregate_downleft2upright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -221,6 +226,7 @@ template void enqueue_aggregate_upleft2downright_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<128u>(
@@ -234,6 +240,21 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upleft2downright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<256u>(
@@ -247,6 +268,7 @@ template void enqueue_aggregate_upleft2downright_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<64u>(
@@ -260,6 +282,7 @@ template void enqueue_aggregate_upright2downleft_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<128u>(
@@ -273,6 +296,21 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upright2downleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<256u>(
@@ -286,6 +324,7 @@ template void enqueue_aggregate_upright2downleft_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<64u>(
@@ -299,6 +338,7 @@ template void enqueue_aggregate_downright2upleft_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<128u>(
@@ -312,6 +352,21 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downright2upleft_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<256u>(
@@ -325,6 +380,7 @@ template void enqueue_aggregate_downright2upleft_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<64u>(
@@ -338,6 +394,7 @@ template void enqueue_aggregate_downleft2upright_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<128u>(
@@ -351,6 +408,21 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downleft2upright_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<256u>(
@@ -364,6 +436,7 @@ template void enqueue_aggregate_downleft2upright_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp
index 6504f37a2715ef64d7698b00afb5f6b0614b26af..882afed2affc344ea35bfda20c619e7521738ab3 100644
--- a/lib/libsgm/src/oblique_path_aggregation.hpp
+++ b/lib/libsgm/src/oblique_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_upleft2downright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_upright2downleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -62,6 +64,7 @@ void enqueue_aggregate_downright2upleft_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -76,6 +79,7 @@ void enqueue_aggregate_downleft2upright_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index e5019ec3a67ddb1113ce2a7f1a873983359e5fb4..5661d7bee452eb6d28486152c8dfa7bd75e5ecd1 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -53,6 +53,7 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
@@ -69,28 +70,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 
 	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 0 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[0]);
 	path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 1 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[1]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[1]);
 	path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 2 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[2]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[2]);
 	path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 3 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[3]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[3]);
 	path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 4 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[4]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[4]);
 	path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 5 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[5]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[5]);
 	path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 6 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[6]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[6]);
 	path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 7 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[7]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp, m_streams[7]);
 	for(unsigned int i = 0; i < NUM_PATHS; ++i){
 		cudaEventRecord(m_events[i], m_streams[i]);
 		cudaStreamWaitEvent(stream, m_events[i], 0);
@@ -100,6 +101,7 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 
 template class PathAggregation< 64>;
 template class PathAggregation<128>;
+template class PathAggregation<192>;
 template class PathAggregation<256>;
 
 }
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
index 7df96996da46711f7ddcb76dcbc44132ca9f194c..c8abf740df09162ea65a01bc5bf0f041d2f2c6e7 100644
--- a/lib/libsgm/src/path_aggregation.hpp
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -51,6 +51,7 @@ public:
 		int p2_pitch,
 		const uint8_t* w,
 		int w_pitch,
+		int min_disp,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp
index c897684e7003512ccd0c9e17d7a084a39319766b..5211c98103c26a27af51fbd91111043e16858d23 100644
--- a/lib/libsgm/src/path_aggregation_common.hpp
+++ b/lib/libsgm/src/path_aggregation_common.hpp
@@ -31,9 +31,9 @@ struct DynamicProgramming {
 	static_assert(
 		DP_BLOCK_SIZE >= 2,
 		"DP_BLOCK_SIZE must be greater than or equal to 2");
-	static_assert(
+	/*static_assert(
 		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
-		"SUBGROUP_SIZE must be a power of 2");
+		"SUBGROUP_SIZE must be a power of 2");*/
 
 	uint32_t last_min;
 	uint32_t dp[DP_BLOCK_SIZE];
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
index 2b350da9562962aa9e10b4554bb6b6c4517e1023..eb5d4179e0a11b964e8238bbdcfac40598917396 100644
--- a/lib/libsgm/src/sgm.cu
+++ b/lib/libsgm/src/sgm.cu
@@ -57,6 +57,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream)
 	{
 		m_census_left.enqueue(
@@ -70,11 +71,12 @@ public:
 			penalty1, penalty2, src_pitch, // bug?
 			weights, weights_pitch,
+			min_disp,
 			stream);
 		m_winner_takes_all.enqueue(
 			dest_left, dest_right,
 			m_path_aggregation.get_output(),
-			width, height, dst_pitch, uniqueness, subpixel,
+			width, height, dst_pitch, uniqueness, subpixel, min_disp,
 			stream);
 	}
 
@@ -106,6 +108,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	m_impl->enqueue(
@@ -116,6 +119,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 		penalty1, penalty2,
 		weights, weights_pitch,
 		uniqueness, subpixel,
+		min_disp,
 		stream);
 	//cudaStreamSynchronize(0);
 }
@@ -136,6 +140,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	m_impl->enqueue(
@@ -146,12 +151,14 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 		penalty1, penalty2,
 		weights, weights_pitch,
 		uniqueness, subpixel,
+		min_disp,
 		stream);
 }
 
 
 template class SemiGlobalMatching<uint8_t, 64>;
 template class SemiGlobalMatching<uint8_t, 128>;
+template class SemiGlobalMatching<uint8_t, 192>;
 template class SemiGlobalMatching<uint8_t, 256>;
 template class SemiGlobalMatching<uint16_t, 64>;
 template class SemiGlobalMatching<uint16_t, 128>;
diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp
index f0c3c80ee107b95890d6dc8578c72768a70abdd4..9aa2cd387782465c3398aab5cda6a8b6a292018d 100644
--- a/lib/libsgm/src/sgm.hpp
+++ b/lib/libsgm/src/sgm.hpp
@@ -53,6 +53,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 	void enqueue(
@@ -70,6 +71,7 @@ public:
 		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
index a07f2daea087e38f4102c0cb92bf7d1664655c3a..70e1263147fbe3ab4eefb948d591e8506a3be40b 100644
--- a/lib/libsgm/src/stereo_sgm.cpp
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -29,7 +29,7 @@ namespace sgm {
 	public:
 		using output_type = sgm::output_type;
 		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, int min_disp, cudaStream_t stream) = 0;
 
 		virtual ~SemiGlobalMatchingBase() {}
 	};
@@ -38,9 +38,9 @@ namespace sgm {
 	class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
 	public:
 		void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) override
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, int min_disp, cudaStream_t stream) override
 		{
-			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, stream);
+			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, min_disp, stream);
 		}
 	private:
 		SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
@@ -63,6 +63,8 @@ namespace sgm {
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
 		else if (input_depth_bits_ == 8 && disparity_size_ == 128)
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
+		else if (input_depth_bits_ == 8 && disparity_size_ == 192)
+			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 192>();
 		else if (input_depth_bits_ == 8 && disparity_size_ == 256)
 			sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 256>();
 		else if (input_depth_bits_ == 16 && disparity_size_ == 64)
@@ -133,9 +135,9 @@ namespace sgm {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
 			throw std::logic_error("depth bits must be 8 or 16");
 		}
-		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 256) {
+		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 192 && disparity_size_ != 256) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
-			throw std::logic_error("disparity size must be 64, 128 or 256");
+			throw std::logic_error("disparity size must be 64, 128, 192 or 256");
 		}
 		if (param.subpixel && output_depth_bits != 16) {
 			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
@@ -150,7 +152,7 @@ namespace sgm {
 	}
 
 	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch,
-		const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
+		const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream) {
 
 		const void *d_input_left, *d_input_right;
 
@@ -174,11 +176,11 @@ namespace sgm {
 			d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer
 
 		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
-			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, stream);
+			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, min_disp, stream);
 
 		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream);
 		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream);
-		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, stream);
+		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel, min_disp, stream);
 
 		if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
 			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height);
@@ -198,8 +200,8 @@ namespace sgm {
 		}
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
-		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, stream);
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, int min_disp, cudaStream_t stream) {
+		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, min_disp, stream);
 	}
 
 	bool StereoSGM::updateParameters(const Parameters &params) {
diff --git a/lib/libsgm/src/utility.hpp b/lib/libsgm/src/utility.hpp
index 17b9f45f5375e1756f581a2d7fd3e72e5f588d60..bbd5dc328a878c241c9ceb4b3a57ff3162239794 100644
--- a/lib/libsgm/src/utility.hpp
+++ b/lib/libsgm/src/utility.hpp
@@ -189,6 +189,14 @@ __device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *pt
 	dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w;
 }
 
+template <>
+__device__ inline void load_uint16_vector<6u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint32x3 = load_as<uint3>(ptr);
+	load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x3.x));
+	load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x3.y));
+	load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x3.z));
+}
+
 template <>
 __device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr){
 	const auto uint32x4 = load_as<uint4>(ptr);
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
index 6fee96892cf038a0583c576688e8456e0c3ec7d7..7f705646039d5a223ca967ecebdcdd70bb6b9667 100644
--- a/lib/libsgm/src/vertical_path_aggregation.cu
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -35,7 +35,8 @@ __global__ void aggregate_vertical_path_kernel(
 	const uint8_t* __restrict__ p2,
 	int p2_pitch,
 	const uint8_t* __restrict__ w,
-	int w_pitch)
+	int w_pitch,
+	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -81,7 +82,7 @@ __global__ void aggregate_vertical_path_kernel(
 		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
 			const unsigned int i = i0 + threadIdx.x;
 			if(i < RIGHT_BUFFER_SIZE){
-				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i);
+				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i) - min_disp;
 				feature_type right_value = 0;
 				if(0 <= x && x < static_cast<int>(width)){
 					right_value = right[x + y * width];
@@ -126,6 +127,7 @@ void enqueue_aggregate_up2down_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -134,7 +136,7 @@ void enqueue_aggregate_up2down_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -149,6 +151,7 @@ void enqueue_aggregate_down2up_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -157,7 +160,7 @@ void enqueue_aggregate_down2up_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, min_disp);
 }
 
 
@@ -172,6 +175,7 @@ template void enqueue_aggregate_up2down_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<128u>(
@@ -185,6 +189,7 @@ template void enqueue_aggregate_up2down_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<256u>(
@@ -198,6 +203,21 @@ template void enqueue_aggregate_up2down_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_up2down_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<64u>(
@@ -211,6 +231,7 @@ template void enqueue_aggregate_down2up_path<64u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<128u>(
@@ -224,6 +245,21 @@ template void enqueue_aggregate_down2up_path<128u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_down2up_path<192u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	const uint8_t *p2,
+	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<256u>(
@@ -237,6 +273,7 @@ template void enqueue_aggregate_down2up_path<256u>(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp
index 434dd9231614b821ae1dfcbb439e648a9dd10b9a..18240cf264862e8d7417200c51d1e8584ec49d43 100644
--- a/lib/libsgm/src/vertical_path_aggregation.hpp
+++ b/lib/libsgm/src/vertical_path_aggregation.hpp
@@ -34,6 +34,7 @@ void enqueue_aggregate_up2down_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -48,6 +49,7 @@ void enqueue_aggregate_down2up_path(
 	int p2_pitch,
 	const uint8_t* w,
 	int w_pitch,
+	int min_disp,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/winner_takes_all.cu b/lib/libsgm/src/winner_takes_all.cu
index f33d3005289a5041d03160af960032e221055e59..ea53af6c330d18bb203bfa94ce6a59e3a45c7159 100644
--- a/lib/libsgm/src/winner_takes_all.cu
+++ b/lib/libsgm/src/winner_takes_all.cu
@@ -47,17 +47,17 @@ __device__ int unpack_index(uint32_t packed){
 	return packed & 0xffffu;
 }
 
-using ComputeDisparity = uint32_t(*)(uint32_t, uint32_t, uint16_t*);
+using ComputeDisparity = uint32_t(*)(uint32_t, int, uint32_t, uint16_t*);
 
-__device__ inline uint32_t compute_disparity_normal(uint32_t disp, uint32_t cost = 0, uint16_t* smem = nullptr)
+__device__ inline uint32_t compute_disparity_normal(uint32_t disp, int min_disp, uint32_t cost = 0, uint16_t* smem = nullptr)
 {
-	return disp;
+	return disp; // + min_disp;
 }
 
 template <size_t MAX_DISPARITY>
-__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, uint32_t cost, uint16_t* smem)
+__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, int min_disp, uint32_t cost, uint16_t* smem)
 {
-	int subp = disp;
+	int subp = disp; // + min_disp;
 	subp <<= sgm::StereoSGM::SUBPIXEL_SHIFT;
 	if (disp > 0 && disp < MAX_DISPARITY - 1) {
 		const int left = smem[disp - 1];
@@ -78,7 +78,8 @@ __global__ void winner_takes_all_kernel(
 	int width,
 	int height,
 	int pitch,
-	float uniqueness)
+	float uniqueness,
+	int min_disp)
 {
 	static const unsigned int ACCUMULATION_PER_THREAD = 16u;
 	static const unsigned int REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE;
@@ -178,7 +179,7 @@ __global__ void winner_takes_all_kernel(
 			right_best[i] = min(right_best[i], recv);
 			if(d == MAX_DISPARITY - 1){
 				if(0 <= p){
-					right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+					right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]), min_disp);
 				}
 				right_best[i] = 0xffffffffu;
 			}
@@ -195,7 +196,7 @@ __global__ void winner_takes_all_kernel(
 		}
 		uniq = subgroup_and<WARP_SIZE>(uniq, 0xffffffffu);
 		if(lane_id == 0){
-			left_dest[x] = uniq ? compute_disparity(bestDisp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0;
+			left_dest[x] = uniq ? compute_disparity(bestDisp, min_disp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0;
 		}
 	}
 }
@@ -204,7 +205,7 @@ __global__ void winner_takes_all_kernel(
 		const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
 		const int p = static_cast<int>(((width - k) & ~(MAX_DISPARITY - 1)) + k);
 		if(p < width){
-			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]), min_disp);
 		}
 	}
 }
@@ -219,6 +220,7 @@ void enqueue_winner_takes_all(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	const int gdim =
@@ -226,10 +228,10 @@ void enqueue_winner_takes_all(
 	const int bdim = BLOCK_SIZE;
 	if (subpixel) {
 		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim, 0, stream>>>(
-			left_dest, right_dest, src, width, height, pitch, uniqueness);
+			left_dest, right_dest, src, width, height, pitch, uniqueness, min_disp);
 	} else {
 		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_normal><<<gdim, bdim, 0, stream>>>(
-			left_dest, right_dest, src, width, height, pitch, uniqueness);
+			left_dest, right_dest, src, width, height, pitch, uniqueness, min_disp);
 	}
 }
 
@@ -250,6 +252,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	if(m_left_buffer.size() < static_cast<size_t>(pitch * height)){
@@ -267,6 +270,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 		pitch,
 		uniqueness,
 		subpixel,
+		min_disp,
 		stream);
 }
 
@@ -280,6 +284,7 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 	int pitch,
 	float uniqueness,
 	bool subpixel,
+	int min_disp,
 	cudaStream_t stream)
 {
 	enqueue_winner_takes_all<MAX_DISPARITY>(
@@ -291,12 +296,14 @@ void WinnerTakesAll<MAX_DISPARITY>::enqueue(
 		pitch,
 		uniqueness,
 		subpixel,
+		min_disp,
 		stream);
 }
 
 
 template class WinnerTakesAll< 64>;
 template class WinnerTakesAll<128>;
+template class WinnerTakesAll<192>;
 template class WinnerTakesAll<256>;
 
 }
diff --git a/lib/libsgm/src/winner_takes_all.hpp b/lib/libsgm/src/winner_takes_all.hpp
index 3dbae82735581c6b1051fd973efd4346e24c2543..f2bb59765f031322c9bdbcfa833dc170edea5653 100644
--- a/lib/libsgm/src/winner_takes_all.hpp
+++ b/lib/libsgm/src/winner_takes_all.hpp
@@ -47,6 +47,7 @@ public:
 		int pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 	void enqueue(
@@ -58,6 +59,7 @@ public:
 		int pitch,
 		float uniqueness,
 		bool subpixel,
+		int min_disp,
 		cudaStream_t stream);
 
 };
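---
Usage note (not part of the patch): a minimal caller-side sketch of the updated libsgm API. The new min_disp argument is passed immediately before the CUDA stream in both StereoSGM::execute() overloads; the FTL operator reads it from the "min_disp" config key (default 60), while "num_disp" now selects the disparity range (128, 192 or 256 in FixstarsSGM). The buffer names below are illustrative only and assume the same setup as the existing FixstarsSGM::apply().

    // Sketch only: `ssgm`, `left_gray`, `right_gray`, `disp16`, `p2_map` and
    // `weights` stand in for whatever device buffers the caller already owns.
    const int min_disp = config()->value("min_disp", 60);  // config key introduced by this patch

    // Weighted variant, mirroring the first branch in FixstarsSGM::apply().
    ssgm->execute(left_gray.data, right_gray.data, disp16.data,
                  p2_map.data, (uint8_t*)weights.data, weights.step1(),
                  min_disp, stream);

    // Weight-less variant, mirroring the else branch.
    ssgm->execute(left_gray.data, right_gray.data, disp16.data,
                  p2_map.data, nullptr, 0, min_disp, stream);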
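Range note (not part of the patch), assuming sgm::StereoSGM::SUBPIXEL_SHIFT == 4 as in upstream libSGM: check_consistency now folds the minimum disparity into the left output and writes a larger invalid-pixel marker, which is why the FTL-side threshold moves from 4096 to 16383.

    // Illustrative arithmetic only; SUBPIXEL_SHIFT == 4 is an assumption here.
    constexpr int SUBPIXEL_SHIFT = 4;
    constexpr int old_sentinel = 256 << (SUBPIXEL_SHIFT + 1);  // 8192, zeroed by the old 4096 cut-off
    constexpr int new_sentinel = 1024 << SUBPIXEL_SHIFT;       // 16384, zeroed by the new 16383 cut-off
    // With dout = d + (min_disp << SUBPIXEL_SHIFT), a valid subpixel disparity can exceed 4096,
    // e.g. (255 + 60) << 4 == 5040, so the old THRESH_TOZERO_INV threshold would have discarded it.
    static_assert(new_sentinel > ((255 + 60) << 4) + 15, "sentinel stays above any valid value");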