From 82e4aadea4802da2a3fa803c8c7e3242132f2287 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nicolas.pope@utu.fi>
Date: Fri, 24 Jan 2020 10:28:24 +0200
Subject: [PATCH] Implements #92 Use of second GPU

---
 applications/reconstruct/src/main.cpp         | 19 +++++++++++++++++++
 .../codecs/include/ftl/codecs/decoder.hpp     |  9 +++++----
 components/codecs/src/nvpipe_decoder.cpp      |  2 +-
 components/codecs/src/nvpipe_encoder.cpp      |  2 +-
 .../common/cpp/include/ftl/cuda_common.hpp    |  6 ++++++
 components/common/cpp/src/cuda_common.cpp     | 14 ++++++++++++++
 components/streams/src/receiver.cpp           | 18 +++++++++++++-----
 7 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp
index afc9f92e2..1e2863fac 100644
--- a/applications/reconstruct/src/main.cpp
+++ b/applications/reconstruct/src/main.cpp
@@ -125,7 +125,25 @@ static ftl::rgbd::Generator *createFileGenerator(ftl::Configurable *root, const
 	return gen;
 }
 
+static void threadSetCUDADevice() {
+	// Ensure all threads have correct cuda device
+	std::atomic<int> ijobs = 0;
+	for (int i=0; i<ftl::pool.size(); ++i) {
+		ftl::pool.push([&ijobs](int id) {
+			ftl::cuda::setDevice();
+			++ijobs;
+			while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+		});
+	}
+	while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+}
+
 static void run(ftl::Configurable *root) {
+	// Use other GPU if available.
+	ftl::cuda::setDevice(ftl::cuda::deviceCount()-1);
+	threadSetCUDADevice();
+	
+
 	Universe *net = ftl::create<Universe>(root, "net");
 	ftl::ctrl::Master ctrl(root, net);
 
@@ -188,6 +206,7 @@ static void run(ftl::Configurable *root) {
 			LOG(INFO) << "Found " << (max_stream+1) << " sources in " << path;
 
 			auto *gen = createFileGenerator(root, path);
+
 			auto reconstr = ftl::create<ftl::Reconstruction>(root, std::string("recon")+std::to_string(i), std::to_string(i));
 			reconstr->setGenerator(gen);
 			reconstr->onFrameSet([sender,i](ftl::rgbd::FrameSet &fs) {
diff --git a/components/codecs/include/ftl/codecs/decoder.hpp b/components/codecs/include/ftl/codecs/decoder.hpp
index 25dcc6a23..5ebce8ed5 100644
--- a/components/codecs/include/ftl/codecs/decoder.hpp
+++ b/components/codecs/include/ftl/codecs/decoder.hpp
@@ -3,6 +3,7 @@
 
 #include <opencv2/opencv.hpp>
 #include <opencv2/core/cuda.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
 
 #include <ftl/codecs/packet.hpp>
 
@@ -32,17 +33,17 @@ void free(Decoder *&e);
  */
 class Decoder {
 	public:
-	Decoder() {};
-	virtual ~Decoder() {};
+	Decoder() { cudaStreamCreate(&stream_); };
+	virtual ~Decoder() { cudaStreamDestroy(stream_); };
 
 	virtual bool decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out)=0;
 
 	virtual bool accepts(const ftl::codecs::Packet &)=0;
 
-	cv::cuda::Stream &stream() { return stream_; }
+	cudaStream_t stream() { return stream_; }
 
 	protected:
-	cv::cuda::Stream stream_;
+	cudaStream_t stream_;
 };
 
 }
diff --git a/components/codecs/src/nvpipe_decoder.cpp b/components/codecs/src/nvpipe_decoder.cpp
index 339bb665b..5a61969a7 100644
--- a/components/codecs/src/nvpipe_decoder.cpp
+++ b/components/codecs/src/nvpipe_decoder.cpp
@@ -44,7 +44,7 @@ bool NvPipeDecoder::_checkIFrame(ftl::codecs::codec_t codec, const unsigned char
 }
 
 bool NvPipeDecoder::decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 	UNIQUE_LOCK(mutex_,lk);
 	if (pkt.codec != codec_t::HEVC && pkt.codec != codec_t::H264 && pkt.codec != codec_t::HEVC_LOSSLESS && pkt.codec != codec_t::H264_LOSSLESS) return false;
 
diff --git a/components/codecs/src/nvpipe_encoder.cpp b/components/codecs/src/nvpipe_encoder.cpp
index d34078cc5..c8a44e47c 100644
--- a/components/codecs/src/nvpipe_encoder.cpp
+++ b/components/codecs/src/nvpipe_encoder.cpp
@@ -93,7 +93,7 @@ static uint64_t calculateBitrate(definition_t def, float ratescale) {
 }
 
 bool NvPipeEncoder::encode(const cv::cuda::GpuMat &in, ftl::codecs::Packet &pkt) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 
 	if (pkt.codec != codec_t::Any && !supports(pkt.codec)) {
 		pkt.codec = codec_t::Invalid;
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 1b4f3a47a..576b383a9 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -28,6 +28,12 @@ bool hasCompute(int major, int minor);
 
 int deviceCount();
 
+int getDevice();
+
+void setDevice(int);
+
+void setDevice();
+
 template <typename T>
 struct Float;
 
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
index b29c1df08..01b0bb346 100644
--- a/components/common/cpp/src/cuda_common.cpp
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -2,6 +2,7 @@
 
 using ftl::cuda::TextureObjectBase;
 
+static int dev_to_use = 0;
 static int dev_count = 0;
 static std::vector<cudaDeviceProp> properties;
 
@@ -36,6 +37,19 @@ int ftl::cuda::deviceCount() {
 	return dev_count;
 }
 
+int ftl::cuda::getDevice() {
+	return dev_to_use;
+}
+
+void ftl::cuda::setDevice(int id) {
+	dev_to_use = id;
+	ftl::cuda::setDevice();
+}
+
+void ftl::cuda::setDevice() {
+	cudaSafeCall(cudaSetDevice(dev_to_use));
+}
+
 TextureObjectBase::~TextureObjectBase() {
 	free();
 }
diff --git a/components/streams/src/receiver.cpp b/components/streams/src/receiver.cpp
index 88bb1e858..fbe2321f3 100644
--- a/components/streams/src/receiver.cpp
+++ b/components/streams/src/receiver.cpp
@@ -140,6 +140,11 @@ void Receiver::_processAudio(const StreamPacket &spkt, const Packet &pkt) {
 }
 
 void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
+	//ftl::cuda::setDevice();
+	//int dev;
+	//cudaSafeCall(cudaGetDevice(&dev));
+	//LOG(INFO) << "Cuda device = " << dev;
+
 	const ftl::codecs::Channel rchan = spkt.channel;
 	const unsigned int channum = (unsigned int)spkt.channel;
 	InternalVideoStates &iframe = _getVideoFrame(spkt);
@@ -179,6 +184,7 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 	//LOG(INFO) << "Decode surface: " << (width*tx) << "x" << (height*ty);
 
 	auto &surface = iframe.surface[static_cast<int>(spkt.channel)];
+
 	surface.create(height*ty, width*tx, ((isFloatChannel(spkt.channel)) ? ((pkt.flags & 0x2) ? CV_16UC4 : CV_16U) : CV_8UC4));
 
 	// Do the actual decode
@@ -200,6 +206,8 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 		//return;
 	}
 
+	auto cvstream = cv::cuda::StreamAccessor::wrapStream(decoder->stream());
+
 	/*if (spkt.channel == Channel::Depth && (pkt.flags & 0x2)) {
 	cv::Mat tmp;
 	surface.download(tmp);
@@ -212,19 +220,19 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 
 		cv::Rect roi((i % tx)*width, (i / tx)*height, width, height);
 		cv::cuda::GpuMat sroi = surface(roi);
-
+		
 		// Do colour conversion
 		if (isFloatChannel(rchan) && (pkt.flags & 0x2)) {
 			//LOG(INFO) << "VUYA Convert";
-			ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, decoder->stream());
+			ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, cvstream);
 		} else if (isFloatChannel(rchan)) {
-			sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, decoder->stream());
+			sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, cvstream);
 		} else {
-			cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, decoder->stream());
+			cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, cvstream);
 		}
 	}
 
-	decoder->stream().waitForCompletion();
+	cudaSafeCall(cudaStreamSynchronize(decoder->stream()));
 
 	for (int i=0; i<pkt.frame_count; ++i) {
 		InternalVideoStates &frame = _getVideoFrame(spkt,i);
-- 
GitLab