diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp
index afc9f92e2df1f22a0547325d03fbcbdcc1febc19..1e2863fac1043904284a0bd3828e38dcec72ebbe 100644
--- a/applications/reconstruct/src/main.cpp
+++ b/applications/reconstruct/src/main.cpp
@@ -125,7 +125,25 @@ static ftl::rgbd::Generator *createFileGenerator(ftl::Configurable *root, const
 	return gen;
 }
 
+static void threadSetCUDADevice() {
+	// Ensure all threads have correct cuda device
+	std::atomic<int> ijobs = 0;
+	for (int i=0; i<ftl::pool.size(); ++i) {
+		ftl::pool.push([&ijobs](int id) {
+			ftl::cuda::setDevice();
+			++ijobs;
+			while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+		});
+	}
+	while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
+}
+
 static void run(ftl::Configurable *root) {
+	// Use other GPU if available.
+	ftl::cuda::setDevice(ftl::cuda::deviceCount()-1);
+	threadSetCUDADevice();
+
+
 	Universe *net = ftl::create<Universe>(root, "net");
 	ftl::ctrl::Master ctrl(root, net);
 
@@ -188,6 +206,7 @@ static void run(ftl::Configurable *root) {
 		LOG(INFO) << "Found " << (max_stream+1) << " sources in " << path;
 
 		auto *gen = createFileGenerator(root, path);
+
 		auto reconstr = ftl::create<ftl::Reconstruction>(root, std::string("recon")+std::to_string(i), std::to_string(i));
 		reconstr->setGenerator(gen);
 		reconstr->onFrameSet([sender,i](ftl::rgbd::FrameSet &fs) {
diff --git a/components/codecs/include/ftl/codecs/decoder.hpp b/components/codecs/include/ftl/codecs/decoder.hpp
index 25dcc6a2334539cff28bfe241e705dbe3ba6cefb..5ebce8ed5ef5a3b234445f9be60f27258046530f 100644
--- a/components/codecs/include/ftl/codecs/decoder.hpp
+++ b/components/codecs/include/ftl/codecs/decoder.hpp
@@ -3,6 +3,7 @@
 
 #include <opencv2/opencv.hpp>
 #include <opencv2/core/cuda.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
 
 #include <ftl/codecs/packet.hpp>
 
@@ -32,17 +33,17 @@ void free(Decoder *&e);
  */
 class Decoder {
 	public:
-	Decoder() {};
-	virtual ~Decoder() {};
+	Decoder() { cudaStreamCreate(&stream_); };
+	virtual ~Decoder() { cudaStreamDestroy(stream_); };
 
 	virtual bool decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out)=0;
 
 	virtual bool accepts(const ftl::codecs::Packet &)=0;
 
-	cv::cuda::Stream &stream() { return stream_; }
+	cudaStream_t stream() { return stream_; }
 
 	protected:
-	cv::cuda::Stream stream_;
+	cudaStream_t stream_;
 };
 
 }
diff --git a/components/codecs/src/nvpipe_decoder.cpp b/components/codecs/src/nvpipe_decoder.cpp
index 339bb665bfbdb1b9be80f6cc8c1d1732c38b63c0..5a61969a748547b9a1a666227433b9eec0f90d5c 100644
--- a/components/codecs/src/nvpipe_decoder.cpp
+++ b/components/codecs/src/nvpipe_decoder.cpp
@@ -44,7 +44,7 @@ bool NvPipeDecoder::_checkIFrame(ftl::codecs::codec_t codec, const unsigned char
 }
 
 bool NvPipeDecoder::decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 	UNIQUE_LOCK(mutex_,lk);
 	if (pkt.codec != codec_t::HEVC && pkt.codec != codec_t::H264 && pkt.codec != codec_t::HEVC_LOSSLESS && pkt.codec != codec_t::H264_LOSSLESS) return false;
 
diff --git a/components/codecs/src/nvpipe_encoder.cpp b/components/codecs/src/nvpipe_encoder.cpp
index d34078cc5798b0d8f739c74f1c7d0b4101e6c0ca..c8a44e47c223dac248f204010330757a289ebf75 100644
--- a/components/codecs/src/nvpipe_encoder.cpp
+++ b/components/codecs/src/nvpipe_encoder.cpp
@@ -93,7 +93,7 @@ static uint64_t calculateBitrate(definition_t def, float ratescale) {
 }
 
 bool NvPipeEncoder::encode(const cv::cuda::GpuMat &in, ftl::codecs::Packet &pkt) {
-	cudaSetDevice(0);
+	//cudaSetDevice(0);
 	if (pkt.codec != codec_t::Any && !supports(pkt.codec)) {
 		pkt.codec = codec_t::Invalid;
 
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 1b4f3a47afb000a9d1e05d264a5dd79a9d1c3e4a..576b383a97f9a63b00cdbfbabedd32bec0270912 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -28,6 +28,12 @@ bool hasCompute(int major, int minor);
 
 int deviceCount();
 
+int getDevice();
+
+void setDevice(int);
+
+void setDevice();
+
 template <typename T>
 struct Float;
 
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
index b29c1df08ba14d61a17d8b7afce290b353c25ce6..01b0bb346e113d3d3b3be52dec21f5a8ffdfd81a 100644
--- a/components/common/cpp/src/cuda_common.cpp
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -2,6 +2,7 @@
 
 using ftl::cuda::TextureObjectBase;
 
+static int dev_to_use = 0;
 static int dev_count = 0;
 static std::vector<cudaDeviceProp> properties;
 
@@ -36,6 +37,19 @@ int ftl::cuda::deviceCount() {
 	return dev_count;
 }
 
+int ftl::cuda::getDevice() {
+	return dev_to_use;
+}
+
+void ftl::cuda::setDevice(int id) {
+	dev_to_use = id;
+	ftl::cuda::setDevice();
+}
+
+void ftl::cuda::setDevice() {
+	cudaSafeCall(cudaSetDevice(dev_to_use));
+}
+
 TextureObjectBase::~TextureObjectBase() {
 	free();
 }
diff --git a/components/streams/src/receiver.cpp b/components/streams/src/receiver.cpp
index 88bb1e858e978352c2007aae1a908a75d13e9f17..fbe2321f35dea3f37194889c72662208355785a5 100644
--- a/components/streams/src/receiver.cpp
+++ b/components/streams/src/receiver.cpp
@@ -140,6 +140,11 @@ void Receiver::_processAudio(const StreamPacket &spkt, const Packet &pkt) {
 }
 
 void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
+	//ftl::cuda::setDevice();
+	//int dev;
+	//cudaSafeCall(cudaGetDevice(&dev));
+	//LOG(INFO) << "Cuda device = " << dev;
+
 	const ftl::codecs::Channel rchan = spkt.channel;
 	const unsigned int channum = (unsigned int)spkt.channel;
 	InternalVideoStates &iframe = _getVideoFrame(spkt);
@@ -179,6 +184,7 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 		//LOG(INFO) << "Decode surface: " << (width*tx) << "x" << (height*ty);
 
 		auto &surface = iframe.surface[static_cast<int>(spkt.channel)];
+		surface.create(height*ty, width*tx, ((isFloatChannel(spkt.channel)) ? ((pkt.flags & 0x2) ? CV_16UC4 : CV_16U) : CV_8UC4));
 
 		// Do the actual decode
 
@@ -200,6 +206,8 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 			//return;
 		}
 
+		auto cvstream = cv::cuda::StreamAccessor::wrapStream(decoder->stream());
+
 		/*if (spkt.channel == Channel::Depth && (pkt.flags & 0x2)) {
 			cv::Mat tmp;
 			surface.download(tmp);
@@ -212,19 +220,19 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
 
 			cv::Rect roi((i % tx)*width, (i / tx)*height, width, height);
 			cv::cuda::GpuMat sroi = surface(roi);
-			
+
 			// Do colour conversion
 			if (isFloatChannel(rchan) && (pkt.flags & 0x2)) {
 				//LOG(INFO) << "VUYA Convert";
-				ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, decoder->stream());
+				ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, cvstream);
 			} else if (isFloatChannel(rchan)) {
-				sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, decoder->stream());
+				sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, cvstream);
 			} else {
-				cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, decoder->stream());
+				cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, cvstream);
 			}
 		}
 
-		decoder->stream().waitForCompletion();
+		cudaSafeCall(cudaStreamSynchronize(decoder->stream()));
 
 		for (int i=0; i<pkt.frame_count; ++i) {
 			InternalVideoStates &frame = _getVideoFrame(spkt,i);
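
A minimal standalone sketch of the stream-wrapping pattern used in the receiver.cpp hunks above, assuming only OpenCV's CUDA modules and the CUDA runtime; the helper name demoColourConvert is illustrative and not part of the patch. The decoder now owns a raw cudaStream_t, which is wrapped with cv::cuda::StreamAccessor::wrapStream so OpenCV CUDA calls can be queued on it, then synchronised on the raw handle.

#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <cuda_runtime.h>

// Queue a colour conversion on a caller-owned raw CUDA stream.
static void demoColourConvert(const cv::cuda::GpuMat &src, cv::cuda::GpuMat &dst, cudaStream_t stream) {
	// wrapStream does not take ownership; the caller (here the Decoder) still
	// creates and destroys the stream.
	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
	cv::cuda::cvtColor(src, dst, cv::COLOR_RGBA2BGRA, 0, cvstream);
	// Synchronise on the raw handle, as _processVideo() now does after decoding.
	cudaStreamSynchronize(stream);
}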