diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp
index 17cdced1d424566164dbc85eb22314bf3e24ad41..62b1f00857a1d8971f61248cda857814d49fc24e 100644
--- a/components/operators/include/ftl/operators/disparity.hpp
+++ b/components/operators/include/ftl/operators/disparity.hpp
@@ -65,6 +65,8 @@ class FixstarsSGM : public ftl::operators::Operator {
 	cv::cuda::GpuMat disp_int_;
 
 	cv::cuda::GpuMat P2_map_;
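+	// Per-pixel matching-cost weights: weightsF_ holds the normalised float map
+	// derived from local variance, weights_ the 8-bit copy (255 = full weight)
+	// that is handed to libsgm.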
+	cv::cuda::GpuMat weights_;
+	cv::cuda::GpuMat weightsF_;
 	cv::cuda::GpuMat edges_;
 	cv::Ptr<cv::cuda::CannyEdgeDetector> canny_;
 
diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index 0a8bdf0dd016c2137286b06bff78dd6e72a19560..8ad3a41fb3e7bf034146336ebf8d5464e0d3f9e7 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -5,6 +5,7 @@
 
 #include <opencv2/cudaimgproc.hpp>
 #include <opencv2/cudaarithm.hpp>
+#include <opencv2/cudafilters.hpp>
 
 using cv::Size;
 using cv::cuda::GpuMat;
@@ -15,6 +16,33 @@ using ftl::rgbd::Frame;
 using ftl::rgbd::Source;
 using ftl::operators::FixstarsSGM;
 
+
+static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) {
+	if (in.isGpuMat() && out.isGpuMat()) {
+		cv::cuda::GpuMat im;
+		cv::cuda::GpuMat im2;
+		cv::cuda::GpuMat mean;
+		cv::cuda::GpuMat mean2;
+
+		mean.create(in.size(), CV_32FC1);
+		mean2.create(in.size(), CV_32FC1);
+		im2.create(in.size(), CV_32FC1);
+		in.getGpuMat().convertTo(im, CV_32FC1, cvstream);
+
+		cv::cuda::multiply(im, im, im2, 1.0, CV_32FC1, cvstream);
+		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter->apply(im, mean, cvstream);   // E[X]
+		filter->apply(im2, mean2, cvstream); // E[X^2]
+		cv::cuda::multiply(mean, mean, mean, 1.0, -1, cvstream); // (E[X])^2
+
+		// NOTE: E[X^2] - (E[X])^2 is prone to floating point cancellation;
+		// CV_32F is used because cv::cuda::createBoxFilter only supports float and 8-bit integer types.
+		cv::cuda::subtract(mean2, mean, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2
+	}
+	else { throw std::exception(); /* todo CPU version */ }
+}
+
+
 void FixstarsSGM::computeP2(cudaStream_t &stream) {
 	const int P3 = config()->value("P3", P2_);
 	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
@@ -115,6 +143,8 @@ bool FixstarsSGM::init() {
 	lbw_.create(size_, CV_8UC1);
 	rbw_.create(size_, CV_8UC1);
 	disp_int_.create(size_, CV_16SC1);
+	weights_.create(size_, CV_32FC1);
+	weights_.setTo(1.0);
 
 	LOG(INFO) << "INIT FIXSTARS";
 
@@ -164,8 +194,19 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 
 	//cvstream.waitForCompletion();
 	computeP2(stream);
-	//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
-	ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, stream);
+
+	bool use_variance = config()->value("use_variance", true);
+	if (use_variance) {
+		variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream);
+		float minweight = std::min(1.0f, std::max(0.0f, config()->value("var_minweight", 0.5f)));
+		cv::cuda::normalize(weightsF_, weightsF_, minweight, 1.0, cv::NORM_MINMAX, -1, cv::noArray(), cvstream);
+		weightsF_.convertTo(weights_, CV_8UC1, 255.0f, cvstream);
+
+		//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream);
+	} else {
+		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream);
+	}
 
 	// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
 	// left_pixels.setTo(0);
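
The weighting added above amounts to a local-variance mask: Var[X] = E[X^2] - (E[X])^2 over a var_wsize window, normalised into [var_minweight, 1] and quantised to 8 bits so that low-texture pixels contribute less matching cost. A minimal CPU sketch of the same mapping, for reference only (the operator itself runs the cv::cuda path in variance_mask; the helper name below is hypothetical):

    #include <opencv2/core.hpp>
    #include <opencv2/imgproc.hpp>

    // CPU sketch of the variance-based weights (assumes an 8-bit grayscale input).
    static cv::Mat variance_weights_cpu(const cv::Mat &gray, int wsize, float minweight) {
        cv::Mat im, im2, mean, mean2, var, w8u;
        gray.convertTo(im, CV_32FC1);
        cv::multiply(im, im, im2);                                   // X^2
        cv::boxFilter(im,  mean,  CV_32FC1, cv::Size(wsize, wsize)); // E[X]
        cv::boxFilter(im2, mean2, CV_32FC1, cv::Size(wsize, wsize)); // E[X^2]
        var = mean2 - mean.mul(mean);                                // Var[X] = E[X^2] - (E[X])^2
        cv::normalize(var, var, minweight, 1.0, cv::NORM_MINMAX);    // low-variance pixels -> minweight
        var.convertTo(w8u, CV_8UC1, 255.0);                          // 255 == full weight in the SGM kernels
        return w8u;
    }
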
diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h
index ae63656752f730f28d41e18141be125ecd86863b..279b3e5af2557fc2cbb93e660c96edfca6c1a006 100644
--- a/lib/libsgm/include/libsgm.h
+++ b/lib/libsgm/include/libsgm.h
@@ -84,7 +84,7 @@ namespace sgm {
 		* @attention
 		* output_depth_bits must be set to 16 when subpixel is enabled.
 		*/
-		LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, 
+		LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
 			EXECUTE_INOUT inout_type, const Parameters& param = Parameters());
 
 		/**
@@ -114,13 +114,13 @@ namespace sgm {
 		* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
 		* Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
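+		* If weights is non-null it must point to a per-pixel 8-bit weight map with the given
+		* pitch in elements; 255 keeps the full matching cost and smaller values scale it down.
+		* Pass nullptr (and a pitch of 0) to keep the previous, unweighted behaviour.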
 		*/
-		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream);
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
 
 		/**
 		 * Same as execute(left_pixels, right_pixels, dst) with image size parameters.
 		 * Dimensions must be smaller or equal to dimensions provided in constructor.
 		 */
-		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream);
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
 
 		/**
 		 * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
@@ -129,7 +129,7 @@ namespace sgm {
 		LIBSGM_API void setMask(uint8_t* mask, int pitch);
 
 		/**
-		 * Update parameters. Returns true if successful. 
+		 * Update parameters. Returns true if successful.
 		 */
 		LIBSGM_API bool updateParameters(const Parameters &params);
 
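
Both execute() overloads now take the optional weight buffer and its pitch explicitly. A hedged calling sketch (the buffer names d_left, d_right, d_disp, d_P2, d_weights and the pitch values are placeholders, not part of this diff):

    sgm::StereoSGM ssgm(width, height, /*disparity_size=*/128,
                        /*input_depth_bits=*/8, /*output_depth_bits=*/16,
                        sgm::EXECUTE_INOUT_CUDA2CUDA);

    // Weighted matching: weights are 8-bit per pixel, 255 == full cost, pitch in elements.
    ssgm.execute(d_left, d_right, d_disp, d_P2, d_weights, weights_pitch, stream);

    // nullptr and a zero pitch disable weighting and reproduce the previous behaviour.
    ssgm.execute(d_left, d_right, d_disp, d_P2, nullptr, 0, stream);
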
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
index 860bb3d69a77924bf6953b42a7be8f297a6e71a9..5eba5372c3d67e9396aaf9569c8e7240c0c98759 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.cu
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -37,7 +37,9 @@ __global__ void aggregate_horizontal_path_kernel(
 	int height,
 	unsigned int p1,
 	const uint8_t* __restrict__ p2,
-	int p2_pitch)
+	int p2_pitch,
+	const uint8_t* __restrict__ w,
+	int w_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -146,7 +148,7 @@ __global__ void aggregate_horizontal_path_kernel(
 				for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){
 					local_costs[k] = __popc(left_value ^ right_buffer[j][k]);
 				}
-				dp[j].update(local_costs, p1, p2[x], shfl_mask);
+				dp[j].update(local_costs, p1, p2[x], w ? float(w[x])/255.0f : 1.0f, shfl_mask);
 				store_uint8_vector<DP_BLOCK_SIZE>(
 					&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
 					dp[j].dp);
@@ -167,6 +169,8 @@ void enqueue_aggregate_left2right_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -176,7 +180,7 @@ void enqueue_aggregate_left2right_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -189,6 +193,8 @@ void enqueue_aggregate_right2left_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -198,7 +204,7 @@ void enqueue_aggregate_right2left_path(
 	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 
@@ -211,6 +217,8 @@ template void enqueue_aggregate_left2right_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_left2right_path<128u>(
@@ -222,8 +230,10 @@ template void enqueue_aggregate_left2right_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_left2right_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -233,6 +243,8 @@ template void enqueue_aggregate_left2right_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<64u>(
@@ -244,6 +256,8 @@ template void enqueue_aggregate_right2left_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_right2left_path<128u>(
@@ -255,8 +269,10 @@ template void enqueue_aggregate_right2left_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_right2left_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -266,6 +282,8 @@ template void enqueue_aggregate_right2left_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }
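
The same weight decoding recurs in the horizontal, oblique and vertical aggregation kernels: a null pointer means "unweighted", otherwise the 8-bit value is mapped onto [0, 1]. Expressed as a device helper purely for illustration (the kernels inline the expression rather than call a function):

    __device__ __forceinline__ float decode_weight(const uint8_t* __restrict__ w, int idx) {
        return w ? float(w[idx]) / 255.0f : 1.0f;  // nullptr => weight of 1 for every pixel
    }
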
diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp
index 950c52bca117252a02a18c78e40918b8eb5b8265..1b7a7e088f5d4dac793694ba7f4a0a876de2538b 100644
--- a/lib/libsgm/src/horizontal_path_aggregation.hpp
+++ b/lib/libsgm/src/horizontal_path_aggregation.hpp
@@ -32,6 +32,8 @@ void enqueue_aggregate_left2right_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -44,6 +46,8 @@ void enqueue_aggregate_right2left_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
index 65934fb1bf53ba00e1f35fa577b83a028203ec07..97d9b1493c32e54a7bb1195f9a9261243b801444 100644
--- a/lib/libsgm/src/oblique_path_aggregation.cu
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -33,7 +33,9 @@ __global__ void aggregate_oblique_path_kernel(
 	int height,
 	unsigned int p1,
 	const uint8_t* __restrict__ p2,
-	int p2_pitch)
+	int p2_pitch,
+	const uint8_t* __restrict__ w,
+	int w_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -105,7 +107,7 @@ __global__ void aggregate_oblique_path_kernel(
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
 				local_costs[j] = __popc(left_value ^ right_values[j]);
 			}
-			dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
+			dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask);
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
@@ -125,6 +127,8 @@ void enqueue_aggregate_upleft2downright_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -133,7 +137,7 @@ void enqueue_aggregate_upleft2downright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -146,6 +150,8 @@ void enqueue_aggregate_upright2downleft_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -154,7 +160,7 @@ void enqueue_aggregate_upright2downleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -167,6 +173,8 @@ void enqueue_aggregate_downright2upleft_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -175,7 +183,7 @@ void enqueue_aggregate_downright2upleft_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -188,6 +196,8 @@ void enqueue_aggregate_downleft2upright_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -196,7 +206,7 @@ void enqueue_aggregate_downleft2upright_path(
 	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 
@@ -209,6 +219,8 @@ template void enqueue_aggregate_upleft2downright_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upleft2downright_path<128u>(
@@ -220,8 +232,10 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_upleft2downright_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -231,6 +245,8 @@ template void enqueue_aggregate_upleft2downright_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<64u>(
@@ -242,6 +258,8 @@ template void enqueue_aggregate_upright2downleft_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_upright2downleft_path<128u>(
@@ -253,8 +271,10 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_upright2downleft_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -264,6 +284,8 @@ template void enqueue_aggregate_upright2downleft_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<64u>(
@@ -275,6 +297,8 @@ template void enqueue_aggregate_downright2upleft_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downright2upleft_path<128u>(
@@ -286,8 +310,10 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_downright2upleft_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -297,6 +323,8 @@ template void enqueue_aggregate_downright2upleft_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<64u>(
@@ -308,6 +336,8 @@ template void enqueue_aggregate_downleft2upright_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_downleft2upright_path<128u>(
@@ -319,8 +349,10 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_downleft2upright_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -330,6 +362,8 @@ template void enqueue_aggregate_downleft2upright_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp
index 36d61513c7693594fb65a4f38679083714694788..6504f37a2715ef64d7698b00afb5f6b0614b26af 100644
--- a/lib/libsgm/src/oblique_path_aggregation.hpp
+++ b/lib/libsgm/src/oblique_path_aggregation.hpp
@@ -32,6 +32,8 @@ void enqueue_aggregate_upleft2downright_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -44,6 +46,8 @@ void enqueue_aggregate_upright2downleft_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -56,6 +60,8 @@ void enqueue_aggregate_downright2upleft_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -68,6 +74,8 @@ void enqueue_aggregate_downleft2upright_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index d0eac331b580b1df493f41a7a280779fe5611908..16567de556520b10a8fcbb3560bcd667ab1fb7b6 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -49,6 +49,8 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
@@ -59,28 +61,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 	cudaStreamSynchronize(stream);
 	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 0 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[0]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]);
 	path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 1 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[1]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[1]);
 	path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 2 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[2]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[2]);
 	path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 3 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[3]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[3]);
 	path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 4 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[4]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[4]);
 	path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 5 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[5]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[5]);
 	path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 6 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[6]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[6]);
 	path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 7 * buffer_step,
-		left, right, width, height, p1, p2, p2_pitch, m_streams[7]);
+		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[7]);
 	for(unsigned int i = 0; i < NUM_PATHS; ++i){
 		cudaEventRecord(m_events[i], m_streams[i]);
 		cudaStreamWaitEvent(stream, m_events[i], 0);
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
index 221411d2687c084c824d47f2eaba40fee23306b2..0b019a3b556fb92969acff843ad7431bc3d57b0e 100644
--- a/lib/libsgm/src/path_aggregation.hpp
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -31,7 +31,7 @@ private:
 	DeviceBuffer<cost_type> m_cost_buffer;
 	cudaStream_t m_streams[NUM_PATHS];
 	cudaEvent_t m_events[NUM_PATHS];
-	
+
 public:
 	PathAggregation();
 	~PathAggregation();
@@ -48,6 +48,8 @@ public:
 		unsigned int p1,
 		const uint8_t *p2,
 		int p2_pitch,
+		const uint8_t* w,
+		int w_pitch,
 		cudaStream_t stream);
 
 };
diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp
index b560e732ff20e5983186e9137b70ab91e76a5f9f..c897684e7003512ccd0c9e17d7a084a39319766b 100644
--- a/lib/libsgm/src/path_aggregation_common.hpp
+++ b/lib/libsgm/src/path_aggregation_common.hpp
@@ -32,7 +32,7 @@ struct DynamicProgramming {
 		DP_BLOCK_SIZE >= 2,
 		"DP_BLOCK_SIZE must be greater than or equal to 2");
 	static_assert(
-		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,	
+		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
 		"SUBGROUP_SIZE must be a power of 2");
 
 	uint32_t last_min;
@@ -45,7 +45,7 @@ struct DynamicProgramming {
 	}
 
 	__device__ void update(
-		uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
+		uint32_t *local_costs, uint32_t p1, uint32_t p2, float w, uint32_t mask)
 	{
 		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
 
@@ -62,14 +62,14 @@ struct DynamicProgramming {
 			uint32_t out = min(dp[k] - last_min, p2);
 			if(lane_id != 0){ out = min(out, prev - last_min + p1); }
 			out = min(out, dp[k + 1] - last_min + p1);
-			lazy_out = local_min = out + local_costs[k];
+			lazy_out = local_min = out + round(local_costs[k]*w);
 		}
 		for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
 			uint32_t out = min(dp[k] - last_min, p2);
 			out = min(out, dp[k - 1] - last_min + p1);
 			out = min(out, dp[k + 1] - last_min + p1);
 			dp[k - 1] = lazy_out;
-			lazy_out = out + local_costs[k];
+			lazy_out = out + round(local_costs[k]*w);
 			local_min = min(local_min, lazy_out);
 		}
 		{
@@ -85,7 +85,7 @@ struct DynamicProgramming {
 				out = min(out, next - last_min + p1);
 			}
 			dp[k - 1] = lazy_out;
-			dp[k] = out + local_costs[k];
+			dp[k] = out + round(local_costs[k]*w);
 			local_min = min(local_min, dp[k]);
 		}
 		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
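
In DynamicProgramming::update the weight scales only the data term (the census matching cost); the P1/P2 smoothness penalties are untouched. A simplified scalar reading of the update for a single disparity, with assumed names (the real kernel processes DP_BLOCK_SIZE costs per thread but applies the same arithmetic):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // prev_d, prev_dm1, prev_dp1: previous pixel's aggregated costs at d, d-1, d+1;
    // last_min: minimum over all disparities at the previous pixel.
    static uint32_t weighted_update(uint32_t matching_cost, float w,
                                    uint32_t prev_d, uint32_t prev_dm1, uint32_t prev_dp1,
                                    uint32_t last_min, uint32_t p1, uint32_t p2) {
        uint32_t out = std::min(prev_d - last_min, p2);         // same disparity, or a large jump (P2)
        out = std::min(out, prev_dm1 - last_min + p1);          // +/- one disparity (P1)
        out = std::min(out, prev_dp1 - last_min + p1);
        return out + (uint32_t)std::lround(matching_cost * w);  // weight applied to the data term only
    }
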
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
index fc62a0d47ba30a9af24e58eb0a513ab2eab64501..2b350da9562962aa9e10b4554bb6b6c4517e1023 100644
--- a/lib/libsgm/src/sgm.cu
+++ b/lib/libsgm/src/sgm.cu
@@ -53,6 +53,8 @@ public:
 		int dst_pitch,
 		unsigned int penalty1,
 		const uint8_t *penalty2,
+		const uint8_t *weights,
+		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
 		cudaStream_t stream)
@@ -66,7 +68,8 @@ public:
 			m_census_right.get_output(),
 			width, height,
 			penalty1, penalty2,
-			src_pitch,
+			src_pitch, // NOTE: src_pitch is reused here as the P2 pitch; possibly a bug
+			weights, weights_pitch,
 			stream);
 		m_winner_takes_all.enqueue(
 			dest_left, dest_right,
@@ -99,6 +102,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 	int dst_pitch,
 	unsigned int penalty1,
 	const uint8_t *penalty2,
+	const uint8_t *weights,
+	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
 	cudaStream_t stream)
@@ -109,6 +114,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
 		width, height,
 		src_pitch, dst_pitch,
 		penalty1, penalty2,
+		weights, weights_pitch,
 		uniqueness, subpixel,
 		stream);
 	//cudaStreamSynchronize(0);
@@ -126,6 +132,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 	int dst_pitch,
 	unsigned int penalty1,
 	const uint8_t *penalty2,
+	const uint8_t *weights,
+	int weights_pitch,
 	float uniqueness,
 	bool subpixel,
 	cudaStream_t stream)
@@ -136,6 +144,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
 		width, height,
 		src_pitch, dst_pitch,
 		penalty1, penalty2,
+		weights, weights_pitch,
 		uniqueness, subpixel,
 		stream);
 }
diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp
index effbe2f9e1a5a4ad181115547f3f0fc5242e88f8..f0c3c80ee107b95890d6dc8578c72768a70abdd4 100644
--- a/lib/libsgm/src/sgm.hpp
+++ b/lib/libsgm/src/sgm.hpp
@@ -49,6 +49,8 @@ public:
 		int dst_pitch,
 		unsigned int penalty1,
 		const uint8_t *penalty2,
+		const uint8_t *weights,
+		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
 		cudaStream_t stream);
@@ -64,6 +66,8 @@ public:
 		int dst_pitch,
 		unsigned int penalty1,
 		const uint8_t *penalty2,
+		const uint8_t *weights,
+		int weights_pitch,
 		float uniqueness,
 		bool subpixel,
 		cudaStream_t stream);
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
index 252f16d9ed7373d3e1a4c550a502d1ab57c199bc..a07f2daea087e38f4102c0cb92bf7d1664655c3a 100644
--- a/lib/libsgm/src/stereo_sgm.cpp
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -28,8 +28,8 @@ namespace sgm {
 	class SemiGlobalMatchingBase {
 	public:
 		using output_type = sgm::output_type;
-		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, 
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
+		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
 
 		virtual ~SemiGlobalMatchingBase() {}
 	};
@@ -38,9 +38,9 @@ namespace sgm {
 	class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
 	public:
 		void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
-			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) override
+			int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) override
 		{
-			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel, stream);
+			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, stream);
 		}
 	private:
 		SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
@@ -54,7 +54,7 @@ namespace sgm {
 		void* d_tmp_left_disp;
 		void* d_tmp_right_disp;
 		uint8_t* d_mask;
-		
+
 		SemiGlobalMatchingBase* sgm_engine;
 
 		CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, int src_pitch_, int dst_pitch_, EXECUTE_INOUT inout_type_) {
@@ -80,7 +80,7 @@ namespace sgm {
 				CudaSafeCall(cudaMalloc(&this->d_src_left, input_depth_bits_ / 8 * src_pitch_ * height_));
 				CudaSafeCall(cudaMalloc(&this->d_src_right, input_depth_bits_ / 8 * src_pitch_ * height_));
 			}
-			
+
 			CudaSafeCall(cudaMalloc(&this->d_left_disp, sizeof(uint16_t) * dst_pitch_ * height_));
 			CudaSafeCall(cudaMalloc(&this->d_right_disp, sizeof(uint16_t) * dst_pitch_ * height_));
 
@@ -149,7 +149,8 @@ namespace sgm {
 		if (cu_res_) { delete cu_res_; }
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream) {
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch,
+		const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
 
 		const void *d_input_left, *d_input_right;
 
@@ -171,9 +172,9 @@ namespace sgm {
 
 		if (is_cuda_output(inout_type_) && output_depth_bits_ == 16)
 			d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer
-		
+
 		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
-			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, param_.uniqueness, param_.subpixel, stream);
+			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, stream);
 
 		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream);
 		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream);
@@ -197,8 +198,8 @@ namespace sgm {
 		}
 	}
 
-	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream) {
-		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, stream);
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
+		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, stream);
 	}
 
 	bool StereoSGM::updateParameters(const Parameters &params) {
@@ -208,7 +209,7 @@ namespace sgm {
 		if ((params.uniqueness < 0.0) || (params.uniqueness > 1.0)) {
 			return false;
 		}
-		
+
 		Parameters params_ = params;
 		std::swap(params_, this->param_);
 		return true;
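
Both StereoSGM::execute overloads change signature here, so every caller has to be updated in the same commit (the FixstarsSGM operator above already is). If source compatibility with existing callers mattered, a forwarding overload with the old signature could be kept instead; purely illustrative, not part of this diff:

    // Hypothetical compatibility shim (would also need a matching declaration in libsgm.h).
    void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst,
                            const uint8_t* P2, cudaStream_t stream) {
        execute(left_pixels, right_pixels, dst, P2, nullptr, 0, stream);  // weighting disabled
    }
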
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
index 54ebafc9c6c19af41db4052ec5af8edd04b4f570..6fee96892cf038a0583c576688e8456e0c3ec7d7 100644
--- a/lib/libsgm/src/vertical_path_aggregation.cu
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -33,7 +33,9 @@ __global__ void aggregate_vertical_path_kernel(
 	int height,
 	unsigned int p1,
 	const uint8_t* __restrict__ p2,
-	int p2_pitch)
+	int p2_pitch,
+	const uint8_t* __restrict__ w,
+	int w_pitch)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
@@ -103,7 +105,7 @@ __global__ void aggregate_vertical_path_kernel(
 			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
 				local_costs[j] = __popc(left_value ^ right_values[j]);
 			}
-			dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
+			dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask);
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
@@ -122,6 +124,8 @@ void enqueue_aggregate_up2down_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -130,7 +134,7 @@ void enqueue_aggregate_up2down_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 template <unsigned int MAX_DISPARITY>
@@ -143,6 +147,8 @@ void enqueue_aggregate_down2up_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
@@ -151,7 +157,7 @@ void enqueue_aggregate_down2up_path(
 	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
 	const int bdim = BLOCK_SIZE;
 	aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
-		dest, left, right, width, height, p1, p2, p2_pitch);
+		dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
 }
 
 
@@ -164,6 +170,8 @@ template void enqueue_aggregate_up2down_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_up2down_path<128u>(
@@ -175,8 +183,10 @@ template void enqueue_aggregate_up2down_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_up2down_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -186,6 +196,8 @@ template void enqueue_aggregate_up2down_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<64u>(
@@ -197,6 +209,8 @@ template void enqueue_aggregate_down2up_path<64u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template void enqueue_aggregate_down2up_path<128u>(
@@ -208,8 +222,10 @@ template void enqueue_aggregate_down2up_path<128u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
-	
+
 template void enqueue_aggregate_down2up_path<256u>(
 	cost_type *dest,
 	const feature_type *left,
@@ -219,6 +235,8 @@ template void enqueue_aggregate_down2up_path<256u>(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }
diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp
index fb7334adc8f9f08439747df623a953f67cac57f9..434dd9231614b821ae1dfcbb439e648a9dd10b9a 100644
--- a/lib/libsgm/src/vertical_path_aggregation.hpp
+++ b/lib/libsgm/src/vertical_path_aggregation.hpp
@@ -32,6 +32,8 @@ void enqueue_aggregate_up2down_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 template <unsigned int MAX_DISPARITY>
@@ -44,6 +46,8 @@ void enqueue_aggregate_down2up_path(
 	unsigned int p1,
 	const uint8_t *p2,
 	int p2_pitch,
+	const uint8_t* w,
+	int w_pitch,
 	cudaStream_t stream);
 
 }