diff --git a/applications/vision/src/main.cpp b/applications/vision/src/main.cpp
index 7e93cf5dc8b237069d266804c11922d945aa9546..974cb398e784206f1ed98822b52281fece4059f7 100644
--- a/applications/vision/src/main.cpp
+++ b/applications/vision/src/main.cpp
@@ -62,6 +62,17 @@ using json = nlohmann::json;
 
 static bool quiet = false;
 
+static void _cudaCallback(void *ud) {
+	auto *cb = (std::function<void()>*)ud;
+	(*cb)();
+	delete cb;
+}
+
+// TODO: Move this to a common location
+static void cudaCallback(cudaStream_t stream, const std::function<void()> &cb) {
+	cudaSafeCall(cudaLaunchHostFunc(stream, _cudaCallback, (void*)(new std::function<void()>(cb))));
+}
+
 static void run(ftl::Configurable *root) {
 	Universe *net = ftl::create<Universe>(root, "net");
 	ftl::ctrl::Master ctrl(root, net);
@@ -106,7 +117,7 @@ static void run(ftl::Configurable *root) {
 					//LOG(INFO) << "LATENCY: " << float(latency)/1000.0f << "ms";
 
 					if (clock_adjust != 0) {
-						LOG(INFO) << "Clock adjustment: " << clock_adjust << ", latency=" << float(latency)/1000.0f << "ms";
+						LOG(INFO) << "Clock adjustment: " << clock_adjust << ", latency=" << float(latency)/2000.0f << "ms";
 						ftl::timer::setClockAdjustment(clock_adjust);
 					}		
 				});
@@ -222,22 +233,36 @@ static void run(ftl::Configurable *root) {
 
 		fs->set(ftl::data::FSFlag::AUTO_SEND);
 
-		// Do all processing in another thread...
-		ftl::pool.push([sender,&stats_count,&latency,&frames,&stats_time,pipeline,&busy,fs](int id) mutable {
-			if (busy.test_and_set()) {
-				LOG(WARNING) << "Depth pipeline drop: " << fs->timestamp();
-				fs->firstFrame().message(ftl::data::Message::Warning_PIPELINE_DROP, "Depth pipeline drop");
-				return;
-			}
-			pipeline->apply(*fs, *fs);
-			busy.clear();
+		if (busy.test_and_set()) {
+			LOG(WARNING) << "Depth pipeline drop: " << fs->timestamp();
+			fs->firstFrame().message(ftl::data::Message::Warning_PIPELINE_DROP, "Depth pipeline drop");
+		} else {
+			pipeline->apply(*fs, *fs, nullptr, fs->frames[0].stream());
 
-			++frames;
-			latency += float(ftl::timer::get_time() - fs->timestamp());
+			cudaCallback(fs->frames[0].stream(), [fs,&frames,&latency,&busy]() {
+				busy.clear();
 
-			// Destruct frameset as soon as possible to send the data...
-			if (fs->hasAnyChanged(Channel::Depth)) fs->flush(Channel::Depth);
-			const_cast<ftl::data::FrameSetPtr&>(fs).reset();
+				// Must be in another thread and not callback directly
+				ftl::pool.push([fs,&frames,&latency](int id) {
+					if (fs->hasAnyChanged(Channel::Depth)) fs->flush(Channel::Depth);
+					++frames;
+					latency += float(ftl::timer::get_time() - fs->timestamp());
+					const_cast<ftl::data::FrameSetPtr&>(fs).reset();
+				});
+
+				// FIXME: Possible issue if fs is flushed in this thread...
+			});
+		}
+
+		// Do some encoding (eg. colour) whilst pipeline runs
+		ftl::pool.push([fs,&stats_count,&latency,&frames,&stats_time,&busy](int id){
+			if (fs->hasAnyChanged(Channel::Audio)) {
+				fs->flush(ftl::codecs::Channel::Audio);
+			}
+
+			// Make sure upload has completed.
+			cudaSafeCall(cudaEventSynchronize(fs->frames[0].uploadEvent()));
+			fs->flush(ftl::codecs::Channel::Colour);
 
 			if (!quiet && --stats_count <= 0) {
 				latency /= float(frames);
@@ -252,13 +277,6 @@ static void run(ftl::Configurable *root) {
 			}
 		});
 
-		// Lock colour right now to encode in parallel, same for audio
-		ftl::pool.push([fs](int id){ fs->flush(ftl::codecs::Channel::Colour); });
-
-		if (fs->hasAnyChanged(Channel::Audio)) {
-			ftl::pool.push([fs](int id){ fs->flush(ftl::codecs::Channel::Audio); });
-		}
-
 		return true;
 	});
 
diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp
index e24759185a9cee76e99ea95baccb14a8aa6f2844..4aba9bbd8e179c86ddc0d6305fb14b3579a6fb23 100644
--- a/components/operators/include/ftl/operators/disparity.hpp
+++ b/components/operators/include/ftl/operators/disparity.hpp
@@ -8,6 +8,7 @@
 #endif
 
 #include <opencv2/cudastereo.hpp>
+#include <opencv2/cudafilters.hpp>
 
 #ifdef HAVE_LIBSGM
 #include <libsgm.h>
@@ -57,6 +58,7 @@ class FixstarsSGM : public ftl::operators::Operator {
 	bool updateParameters();
 	bool updateP2Parameters();
 	void computeP2(cudaStream_t &stream);
+	void _variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream);
 
 	sgm::StereoSGM *ssgm_;
 	cv::Size size_;
@@ -71,6 +73,12 @@ class FixstarsSGM : public ftl::operators::Operator {
 	cv::cuda::GpuMat weightsF_;
 	cv::cuda::GpuMat edges_;
 	cv::Ptr<cv::cuda::CannyEdgeDetector> canny_;
+	cv::Ptr<cv::cuda::Filter> filter_;
+
+	cv::cuda::GpuMat im_;
+	cv::cuda::GpuMat im2_;
+	cv::cuda::GpuMat mean_;
+	cv::cuda::GpuMat mean2_;
 
 	int P1_;
 	int P2_;
diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp
index 51b01cb86f02571212ef7ab7c4164504e6e68069..5344e3c729f14f91be40dcc3a56bd07946819ad1 100644
--- a/components/operators/src/depth.cpp
+++ b/components/operators/src/depth.cpp
@@ -144,7 +144,8 @@ void DepthChannel::_createPipeline(size_t size) {
 	pipe_->append<ftl::operators::ColourChannels>("colour");  // Convert BGR to BGRA
 	pipe_->append<ftl::operators::CrossSupport>("cross");
 	#ifdef HAVE_OPTFLOW
-	pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow, Channel::Colour2, Channel::Flow2);
+	// FIXME: OpenCV Nvidia OptFlow has a horrible implementation that causes device syncs
+	//pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow, Channel::Colour2, Channel::Flow2);
 	//if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity);
 	#endif
 	#ifdef HAVE_LIBSGM
diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp
index dd3da86981d5742cd24561fb98121d3a584b9405..176dc8d10bf23ba91ebc3cf1976b7e2e08a96766 100644
--- a/components/operators/src/disparity/fixstars_sgm.cpp
+++ b/components/operators/src/disparity/fixstars_sgm.cpp
@@ -19,27 +19,22 @@ using ftl::operators::FixstarsSGM;
 using ftl::operators::Buffer;
 
 
-static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) {
+void FixstarsSGM::_variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) {
 	if (in.isGpuMat() && out.isGpuMat()) {
-		cv::cuda::GpuMat im;
-		cv::cuda::GpuMat im2;
-		cv::cuda::GpuMat mean;
-		cv::cuda::GpuMat mean2;
-
-		mean.create(in.size(), CV_32FC1);
-		mean2.create(in.size(), CV_32FC1);
-		im2.create(in.size(), CV_32FC1);
-		in.getGpuMat().convertTo(im, CV_32FC1, cvstream);
-
-		cv::cuda::multiply(im, im, im2, 1.0, CV_32FC1, cvstream);
-		auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
-		filter->apply(im, mean, cvstream);   // E[X]
-		filter->apply(im2, mean2, cvstream); // E[X^2]
-		cv::cuda::multiply(mean, mean, mean, 1.0, -1, cvstream); // (E[X])^2
+		mean_.create(in.size(), CV_32FC1);
+		mean2_.create(in.size(), CV_32FC1);
+		im2_.create(in.size(), CV_32FC1);
+		in.getGpuMat().convertTo(im_, CV_32FC1, cvstream);
+
+		cv::cuda::multiply(im_, im_, im2_, 1.0, CV_32FC1, cvstream);
+		if (!filter_) filter_ = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
+		filter_->apply(im_, mean_, cvstream);   // E[X]
+		filter_->apply(im2_, mean2_, cvstream); // E[X^2]
+		cv::cuda::multiply(mean_, mean_, mean_, 1.0, -1, cvstream); // (E[X])^2
 
 		// NOTE: floating point accuracy in subtraction
 		// (cv::cuda::createBoxFilter only supports float and 8 bit integer types)
-		cv::cuda::subtract(mean2, mean, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2
+		cv::cuda::subtract(mean2_, mean_, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2
 	}
 	else { throw std::exception(); /* todo CPU version */ }
 }
@@ -215,10 +210,10 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 
 	bool use_variance = config()->value("use_variance", true);
 	if (use_variance) {
-		variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream);
+		_variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream);
 		float minweight = std::min(1.0f, std::max(0.0f, config()->value("var_minweight", 0.5f)));
 		cv::cuda::normalize(weightsF_, weightsF_, minweight, 1.0, cv::NORM_MINMAX, -1, cv::noArray(), cvstream);
-		weightsF_.convertTo(weights_, CV_8UC1, 255.0f);
+		weightsF_.convertTo(weights_, CV_8UC1, 255.0f, cvstream);
 
 		//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
 		ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream);
@@ -242,7 +237,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
 	}
 
 	if (config()->value("show_P2_map", false)) {
-		cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA);
+		cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA, 0, cvstream);
 	}
 	if (config()->value("show_rpe", false)) {
 		ftl::cuda::show_rpe(disp, in.set<GpuMat>(Channel::Left), r, 100.0f, stream);
diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
index c1dd611c00e6830232a01d6f1eb86b643cd477cb..9ed95f6eab0831f12b502a36858c837f37edd8ad 100644
--- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
+++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu
@@ -328,8 +328,8 @@ namespace ftl { namespace cuda { namespace device
                 }
                 
 
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
+            //if (stream == 0)
+            //    cudaSafeCall( cudaDeviceSynchronize() );
         }
 
         // These are commented out since we don't use them and it slows compile
diff --git a/components/operators/src/mask.cpp b/components/operators/src/mask.cpp
index ca31069b2dfef3930b6e45ce328eb6be5bd970d7..3a2e7d8331f7fa27aad4c06973cab46f9fe1d23d 100644
--- a/components/operators/src/mask.cpp
+++ b/components/operators/src/mask.cpp
@@ -32,9 +32,10 @@ bool DiscontinuityMask::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaS
 	}
 
 	if (!out.hasChannel(Channel::Mask)) {
+		cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
 		auto &m = out.create<cv::cuda::GpuMat>(Channel::Mask);
 		m.create(in.get<cv::cuda::GpuMat>(Channel::Depth).size(), CV_8UC1);
-		m.setTo(cv::Scalar(0));
+		m.setTo(cv::Scalar(0), cvstream);
 	}
 
 	/*ftl::cuda::discontinuity(
diff --git a/components/operators/src/weighting.cpp b/components/operators/src/weighting.cpp
index a9d26b0dbe2a412b5202a535ae9f400c145e7fb1..33f1a922d7ec8306899db071c43fd5eeac94f4e8 100644
--- a/components/operators/src/weighting.cpp
+++ b/components/operators/src/weighting.cpp
@@ -45,9 +45,10 @@ bool PixelWeights::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream
 	Channel dchan = (in.hasChannel(Channel::Depth)) ? Channel::Depth : Channel::GroundTruth;
 
 	if (!out.hasChannel(Channel::Mask)) {
+		cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
 		auto &m = out.create<cv::cuda::GpuMat>(Channel::Mask);
 		m.create(in.get<cv::cuda::GpuMat>(dchan).size(), CV_8UC1);
-		m.setTo(cv::Scalar(0));
+		m.setTo(cv::Scalar(0), cvstream);
 	}
 
 	if (output_normals) {
diff --git a/components/rgbd-sources/src/frame.cpp b/components/rgbd-sources/src/frame.cpp
index 722e3606a1c36322bf8f079bf21701959a696c74..7d10552533e9fe44f7132fa1a6cc86751ed91f79 100644
--- a/components/rgbd-sources/src/frame.cpp
+++ b/components/rgbd-sources/src/frame.cpp
@@ -96,6 +96,7 @@ cv::cuda::GpuMat &VideoFrame::setGPU() {
 void ftl::rgbd::Frame::upload(ftl::codecs::Channel c) {
 	auto &vframe = set<VideoFrame>(c);
 	const auto &cpumat = vframe.getCPU();
+	LOG(WARNING) << "Sync Upload: " << int(c);
 	vframe.createGPU().upload(cpumat);
 }
 
diff --git a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
index c7572c8e79f3264d0a49acafeea5c8ad8f0db821..ae22bff07fa2645279cb92540af957ad7cec92fe 100644
--- a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
+++ b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp
@@ -338,6 +338,7 @@ void StereoVideoSource::updateParameters(ftl::rgbd::Frame &frame) {
 
 bool StereoVideoSource::capture(int64_t ts) {
 	cap_status_ = lsrc_->grab();
+	if (!cap_status_) LOG(WARNING) << "Capture failed";
 	return cap_status_;
 }
 
@@ -365,7 +366,7 @@ bool StereoVideoSource::retrieve(ftl::rgbd::Frame &frame) {
 		do_update_params_ = false;
 	}
 
-	auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream_);
+	auto cvstream = cv::cuda::StreamAccessor::wrapStream(frame.stream());
 
 	if (lsrc_->isStereo()) {
 		cv::cuda::GpuMat &left = frame.create<cv::cuda::GpuMat>(Channel::Left);
@@ -386,7 +387,10 @@ bool StereoVideoSource::retrieve(ftl::rgbd::Frame &frame) {
 	//LOG(INFO) << "Channel size: " << hres.size();
 
 	//pipeline_input_->apply(frame, frame, nullptr, stream_);
-	cudaSafeCall(cudaStreamSynchronize(stream_));
+	//cudaSafeCall(cudaStreamSynchronize(stream_));
+
+	cudaSafeCall(cudaEventRecord(frame.uploadEvent(), frame.stream()));
+	// FIXME: Currently possible that previous upload not finished
 
 	return true;
 }
diff --git a/components/streams/src/feed.cpp b/components/streams/src/feed.cpp
index e3d803609875b0583001c63ff5d95d31d5fc6416..bb0ef55f53e5ea74559a0b1fb8248884a9920af2 100644
--- a/components/streams/src/feed.cpp
+++ b/components/streams/src/feed.cpp
@@ -184,7 +184,8 @@ Feed::Feed(nlohmann::json &config, ftl::net::Universe*net) :
 
 			lk.unlock();
 
-			if (pipeline) pipeline->apply(*fs, *fs, 0);
+			if (pipeline) pipeline->apply(*fs, *fs, nullptr, fs->frames[0].stream());
+			cudaSafeCall(cudaStreamSynchronize(fs->frames[0].stream()));
 
 			lk.lock();
 
diff --git a/components/streams/src/sender.cpp b/components/streams/src/sender.cpp
index 69e74f5cd3e01d1126fa71d15a8c47ca9d2d5598..44aadc932f563b82059a3e634ec7b11efcb9d783 100644
--- a/components/streams/src/sender.cpp
+++ b/components/streams/src/sender.cpp
@@ -793,7 +793,7 @@ int Sender::_generateTiles(const ftl::rgbd::FrameSet &fs, int offset, Channel c,
 		} else {
 			cv::Rect roi((count % tx)*rwidth, (count / tx)*rheight, rwidth, rheight);
 			cv::cuda::GpuMat sroi = surface.surface(roi);
-			sroi.setTo(cv::Scalar(0));
+			sroi.setTo(cv::Scalar(0), stream);
 		}
 
 		++count;
diff --git a/components/structures/include/ftl/data/new_frame.hpp b/components/structures/include/ftl/data/new_frame.hpp
index 94c539086e6a24802cb4ca980345a36987c8be47..ebbceed9061a380e4ade9c5b6b011a163c3887b3 100644
--- a/components/structures/include/ftl/data/new_frame.hpp
+++ b/components/structures/include/ftl/data/new_frame.hpp
@@ -20,6 +20,8 @@
 #include <ftl/handle.hpp>
 #include <ftl/data/messages.hpp>
 
+#include <cuda_runtime.h>
+
 template<typename T> struct is_list : public std::false_type {};
 
 template<typename T>
@@ -630,6 +632,14 @@ class Frame {
 
 	inline FrameMode mode() const { return mode_; }
 
+	// ==== CUDA Functions =====================================================
+
+	cudaStream_t stream();
+
+	cudaEvent_t uploadEvent();
+
+	cudaEvent_t pipeEvent();
+
 	// ==== Wrapper functions ==================================================
 
 	void message(ftl::data::Message code, const std::string &msg);
@@ -681,6 +691,9 @@ class Frame {
 	FrameStatus status_;
 	FrameMode mode_ = FrameMode::PRIMARY;
 	uint64_t available_ = 0;
+	cudaStream_t stream_=0;
+	cudaEvent_t upload_event_=0;
+	cudaEvent_t pipe_event_=0;
 
 	inline void restart(int64_t ts) {
 		timestamp_ = ts;
diff --git a/components/structures/src/new_frame.cpp b/components/structures/src/new_frame.cpp
index 576896702e3e524639d59f903341fce1bc3c3916..36a1c7d06cc59ea9a710bb6a0035fdb6951a9c70 100644
--- a/components/structures/src/new_frame.cpp
+++ b/components/structures/src/new_frame.cpp
@@ -1,6 +1,7 @@
 #include <ftl/data/new_frame.hpp>
 #include <ftl/data/framepool.hpp>
 #include <ftl/timer.hpp>
+#include <ftl/cuda_common.hpp>
 
 using ftl::data::Frame;
 using ftl::data::Session;
@@ -84,6 +85,27 @@ Frame::~Frame() {
 	}
 };
 
+cudaStream_t Frame::stream() {
+	if (stream_ == 0) {
+		cudaSafeCall( cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) );
+	}
+	return stream_;
+}
+
+cudaEvent_t Frame::uploadEvent() {
+	if (upload_event_ == 0) {
+		cudaSafeCall( cudaEventCreate(&upload_event_) );
+	}
+	return upload_event_;
+}
+
+cudaEvent_t Frame::pipeEvent() {
+	if (pipe_event_ == 0) {
+		cudaSafeCall( cudaEventCreate(&pipe_event_) );
+	}
+	return pipe_event_;
+}
+
 bool ftl::data::Frame::hasAll(const std::unordered_set<ftl::codecs::Channel> &cs) {
 	for (auto &a : cs) {
 		if (!has(a)) return false;
@@ -346,6 +368,12 @@ void Frame::moveTo(Frame &f) {
 	f.changed_ = std::move(changed_);
 	f.packet_rx = (int)packet_rx;
 	f.packet_tx = (int)packet_tx;
+	f.stream_ = stream_;
+	f.upload_event_ = upload_event_;
+	f.pipe_event_ = pipe_event_;
+	stream_ = 0;
+	pipe_event_ = 0;
+	upload_event_ = 0;
 	status_ = FrameStatus::RELEASED;
 }
 
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
index 16567de556520b10a8fcbb3560bcd667ab1fb7b6..e5019ec3a67ddb1113ce2a7f1a873983359e5fb4 100644
--- a/lib/libsgm/src/path_aggregation.cu
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -29,6 +29,7 @@ PathAggregation<MAX_DISPARITY>::PathAggregation()
 		cudaStreamCreate(&m_streams[i]);
 		cudaEventCreate(&m_events[i]);
 	}
+	cudaEventCreate(&m_event);
 }
 
 template <size_t MAX_DISPARITY>
@@ -38,6 +39,7 @@ PathAggregation<MAX_DISPARITY>::~PathAggregation(){
 		cudaStreamDestroy(m_streams[i]);
 		cudaEventDestroy(m_events[i]);
 	}
+	cudaEventDestroy(m_event);
 }
 
 template <size_t MAX_DISPARITY>
@@ -58,7 +60,13 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
 		m_cost_buffer = DeviceBuffer<cost_type>(buffer_size);
 	}
 	const size_t buffer_step = width * height * MAX_DISPARITY;
-	cudaStreamSynchronize(stream);
+	//cudaStreamSynchronize(stream);
+	cudaEventRecord(m_event, stream);
+
+	for(unsigned int i = 0; i < NUM_PATHS; ++i){
+		cudaStreamWaitEvent(m_streams[i], m_event, 0);
+	}
+
 	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
 		m_cost_buffer.data() + 0 * buffer_step,
 		left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]);
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
index 0b019a3b556fb92969acff843ad7431bc3d57b0e..7df96996da46711f7ddcb76dcbc44132ca9f194c 100644
--- a/lib/libsgm/src/path_aggregation.hpp
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -31,6 +31,7 @@ private:
 	DeviceBuffer<cost_type> m_cost_buffer;
 	cudaStream_t m_streams[NUM_PATHS];
 	cudaEvent_t m_events[NUM_PATHS];
+	cudaEvent_t m_event;
 
 public:
 	PathAggregation();