diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp
index afc9f92e2df1f22a0547325d03fbcbdcc1febc19..1e2863fac1043904284a0bd3828e38dcec72ebbe 100644
--- a/applications/reconstruct/src/main.cpp
+++ b/applications/reconstruct/src/main.cpp
@@ -125,7 +125,25 @@ static ftl::rgbd::Generator *createFileGenerator(ftl::Configurable *root, const
 	return gen;
 }
 
+static void threadSetCUDADevice() {
+	// Ensure all threads have correct cuda device
+	std::atomic<int> ijobs = 0;
+	for (int i=0; i<ftl::pool.size(); ++i) {
+		ftl::pool.push([&ijobs](int id) {
+			ftl::cuda::setDevice();
+			++ijobs;
+			while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+		});
+	}
+	while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+}
+
 static void run(ftl::Configurable *root) {
+	// Use other GPU if available.
+	ftl::cuda::setDevice(ftl::cuda::deviceCount()-1);
+	threadSetCUDADevice();
+	
+
 	Universe *net = ftl::create<Universe>(root, "net");
 	ftl::ctrl::Master ctrl(root, net);
 
@@ -188,6 +206,7 @@ static void run(ftl::Configurable *root) {
 			LOG(INFO) << "Found " << (max_stream+1) << " sources in " << path;
 
 			auto *gen = createFileGenerator(root, path);
+
 			auto reconstr = ftl::create<ftl::Reconstruction>(root, std::string("recon")+std::to_string(i), std::to_string(i));
 			reconstr->setGenerator(gen);
 			reconstr->onFrameSet([sender,i](ftl::rgbd::FrameSet &fs) {
diff --git a/components/codecs/include/ftl/codecs/decoder.hpp b/components/codecs/include/ftl/codecs/decoder.hpp
index 25dcc6a2334539cff28bfe241e705dbe3ba6cefb..5ebce8ed5ef5a3b234445f9be60f27258046530f 100644
--- a/components/codecs/include/ftl/codecs/decoder.hpp
+++ b/components/codecs/include/ftl/codecs/decoder.hpp
@@ -3,6 +3,7 @@
 
 #include <opencv2/opencv.hpp>
 #include <opencv2/core/cuda.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
 
 #include <ftl/codecs/packet.hpp>
 
@@ -32,17 +33,17 @@ void free(Decoder *&e);
  */
 class Decoder {
 	public:
-	Decoder() {};
-	virtual ~Decoder() {};
+	Decoder() { cudaStreamCreate(&stream_); };
+	virtual ~Decoder() { cudaStreamDestroy(stream_); };
 
 	virtual bool decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out)=0;
 
 	virtual bool accepts(const ftl::codecs::Packet &)=0;
 
-	cv::cuda::Stream &stream() { return stream_; }
+	cudaStream_t stream() { return stream_; }
 
 	protected:
-	cv::cuda::Stream stream_;
+	cudaStream_t stream_;
 };
 
 }
diff --git a/components/codecs/src/nvpipe_decoder.cpp b/components/codecs/src/nvpipe_decoder.cpp
index 339bb665bfbdb1b9be80f6cc8c1d1732c38b63c0..5a61969a748547b9a1a666227433b9eec0f90d5c 100644
--- a/components/codecs/src/nvpipe_decoder.cpp
+++ b/components/codecs/src/nvpipe_decoder.cpp
@@ -44,7 +44,7 @@ bool NvPipeDecoder::_checkIFrame(ftl::codecs::codec_t codec, const unsigned char
 }
 
 bool NvPipeDecoder::decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 	UNIQUE_LOCK(mutex_,lk);
 	if (pkt.codec != codec_t::HEVC && pkt.codec != codec_t::H264 && pkt.codec != codec_t::HEVC_LOSSLESS && pkt.codec != codec_t::H264_LOSSLESS) return false;
 
diff --git a/components/codecs/src/nvpipe_encoder.cpp b/components/codecs/src/nvpipe_encoder.cpp
index d34078cc5798b0d8f739c74f1c7d0b4101e6c0ca..c8a44e47c223dac248f204010330757a289ebf75 100644
--- a/components/codecs/src/nvpipe_encoder.cpp
+++ b/components/codecs/src/nvpipe_encoder.cpp
@@ -93,7 +93,7 @@ static uint64_t calculateBitrate(definition_t def, float ratescale) {
 }
 
 bool NvPipeEncoder::encode(const cv::cuda::GpuMat &in, ftl::codecs::Packet &pkt) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 
 	if (pkt.codec != codec_t::Any && !supports(pkt.codec)) {
 		pkt.codec = codec_t::Invalid;
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 1b4f3a47afb000a9d1e05d264a5dd79a9d1c3e4a..576b383a97f9a63b00cdbfbabedd32bec0270912 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -28,6 +28,12 @@ bool hasCompute(int major, int minor);
 
 int deviceCount();
 
+int getDevice();
+
+void setDevice(int);
+
+void setDevice();
+
 template <typename T>
 struct Float;
 
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
index b29c1df08ba14d61a17d8b7afce290b353c25ce6..01b0bb346e113d3d3b3be52dec21f5a8ffdfd81a 100644
--- a/components/common/cpp/src/cuda_common.cpp
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -2,6 +2,7 @@
 
 using ftl::cuda::TextureObjectBase;
 
+static int dev_to_use = 0;
 static int dev_count = 0;
 static std::vector<cudaDeviceProp> properties;
 
@@ -36,6 +37,19 @@ int ftl::cuda::deviceCount() {
 	return dev_count;
 }
 
+int ftl::cuda::getDevice() {
+	return dev_to_use;
+}
+
+void ftl::cuda::setDevice(int id) {
+	dev_to_use = id;
+	ftl::cuda::setDevice();
+}
+
+void ftl::cuda::setDevice() {
+	cudaSafeCall(cudaSetDevice(dev_to_use));
+}
+
 TextureObjectBase::~TextureObjectBase() {
 	free();
 }
diff --git a/components/streams/src/receiver.cpp b/components/streams/src/receiver.cpp
index 88bb1e858e978352c2007aae1a908a75d13e9f17..fbe2321f35dea3f37194889c72662208355785a5 100644
--- a/components/streams/src/receiver.cpp
+++ b/components/streams/src/receiver.cpp
@@ -140,6 +140,11 @@ void Receiver::_processAudio(const StreamPacket &spkt, const Packet &pkt) {
 }
 
 void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
+	//ftl::cuda::setDevice();
+	//int dev;
+	//cudaSafeCall(cudaGetDevice(&dev));
+	//LOG(INFO) << "Cuda device = " << dev;
+
 	const ftl::codecs::Channel rchan = spkt.channel;
 	const unsigned int channum = (unsigned int)spkt.channel;
 	InternalVideoStates &iframe = _getVideoFrame(spkt);
@@ -179,6 +184,7 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 	//LOG(INFO) << "Decode surface: " << (width*tx) << "x" << (height*ty);
 
 	auto &surface = iframe.surface[static_cast<int>(spkt.channel)];
+
 	surface.create(height*ty, width*tx, ((isFloatChannel(spkt.channel)) ? ((pkt.flags & 0x2) ? CV_16UC4 : CV_16U) : CV_8UC4));
 
 	// Do the actual decode
@@ -200,6 +206,8 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 		//return;
 	}
 
+	auto cvstream = cv::cuda::StreamAccessor::wrapStream(decoder->stream());
+
 	/*if (spkt.channel == Channel::Depth && (pkt.flags & 0x2)) {
 	cv::Mat tmp;
 	surface.download(tmp);
@@ -212,19 +220,19 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 
 		cv::Rect roi((i % tx)*width, (i / tx)*height, width, height);
 		cv::cuda::GpuMat sroi = surface(roi);
-
+		
 		// Do colour conversion
 		if (isFloatChannel(rchan) && (pkt.flags & 0x2)) {
 			//LOG(INFO) << "VUYA Convert";
-			ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, decoder->stream());
+			ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, cvstream);
 		} else if (isFloatChannel(rchan)) {
-			sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, decoder->stream());
+			sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, cvstream);
 		} else {
-			cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, decoder->stream());
+			cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, cvstream);
 		}
 	}
 
-	decoder->stream().waitForCompletion();
+	cudaSafeCall(cudaStreamSynchronize(decoder->stream()));
 
 	for (int i=0; i<pkt.frame_count; ++i) {
 		InternalVideoStates &frame = _getVideoFrame(spkt,i);