Commit 82e4aade authored by Nicolas Pope

Implements #92 Use of second GPU

parent 4028afaf
......@@ -125,7 +125,25 @@ static ftl::rgbd::Generator *createFileGenerator(ftl::Configurable *root, const
return gen;
}
static void threadSetCUDADevice() {
// Ensure all threads have correct cuda device
std::atomic<int> ijobs = 0;
for (int i=0; i<ftl::pool.size(); ++i) {
ftl::pool.push([&ijobs](int id) {
ftl::cuda::setDevice();
++ijobs;
while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
});
}
while (ijobs < ftl::pool.size()) std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
static void run(ftl::Configurable *root) {
// Use other GPU if available.
ftl::cuda::setDevice(ftl::cuda::deviceCount()-1);
threadSetCUDADevice();
Universe *net = ftl::create<Universe>(root, "net");
ftl::ctrl::Master ctrl(root, net);
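Note on the hunk above (illustration, not part of the commit): the CUDA runtime tracks the current device per host thread, so a device selected on the main thread does not carry over to pool workers. threadSetCUDADevice() pushes exactly pool.size() jobs and has each one spin until all have started, which guarantees every worker thread runs one job and therefore calls ftl::cuda::setDevice() for itself. A minimal standalone sketch of the same idea using plain std::thread (the thread count and names are placeholders):

#include <cuda_runtime.h>
#include <thread>
#include <vector>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    const int device = (count > 0) ? count - 1 : 0;  // prefer the last enumerated GPU
    cudaSetDevice(device);                           // applies only to the main thread

    std::vector<std::thread> workers;
    for (int i = 0; i < 4; ++i) {
        workers.emplace_back([device]() {
            cudaSetDevice(device);                   // must be repeated in every worker thread
            // ... allocate memory / launch kernels on `device` here ...
        });
    }
    for (auto &w : workers) w.join();
    return 0;
}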
......@@ -188,6 +206,7 @@ static void run(ftl::Configurable *root) {
LOG(INFO) << "Found " << (max_stream+1) << " sources in " << path;
auto *gen = createFileGenerator(root, path);
auto reconstr = ftl::create<ftl::Reconstruction>(root, std::string("recon")+std::to_string(i), std::to_string(i));
reconstr->setGenerator(gen);
reconstr->onFrameSet([sender,i](ftl::rgbd::FrameSet &fs) {
......
......@@ -3,6 +3,7 @@
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <ftl/codecs/packet.hpp>
......@@ -32,17 +33,17 @@ void free(Decoder *&e);
*/
class Decoder {
public:
-Decoder() {};
-virtual ~Decoder() {};
+Decoder() { cudaStreamCreate(&stream_); };
+virtual ~Decoder() { cudaStreamDestroy(stream_); };
virtual bool decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out)=0;
virtual bool accepts(const ftl::codecs::Packet &)=0;
-cv::cuda::Stream &stream() { return stream_; }
+cudaStream_t stream() { return stream_; }
protected:
-cv::cuda::Stream stream_;
+cudaStream_t stream_;
};
}
......
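For reference: the Decoder base class now owns a raw cudaStream_t (created in the constructor, destroyed in the destructor) instead of a cv::cuda::Stream, but OpenCV's asynchronous API can still be used by wrapping the raw handle. A minimal sketch, with a hypothetical StreamOwner class standing in for Decoder:

#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

class StreamOwner {                        // hypothetical stand-in for Decoder
public:
    StreamOwner()  { cudaStreamCreate(&stream_); }
    ~StreamOwner() { cudaStreamDestroy(stream_); }
    cudaStream_t stream() const { return stream_; }
private:
    cudaStream_t stream_;
};

void example(StreamOwner &d, cv::cuda::GpuMat &in, cv::cuda::GpuMat &out) {
    // Wrap the raw handle so OpenCV CUDA calls queue onto the same stream.
    cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(d.stream());
    in.convertTo(out, CV_32FC1, 1.0f / 1000.0f, cvstream);
    cudaStreamSynchronize(d.stream());     // block until the queued work is done
}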
......@@ -44,7 +44,7 @@ bool NvPipeDecoder::_checkIFrame(ftl::codecs::codec_t codec, const unsigned char
}
bool NvPipeDecoder::decode(const ftl::codecs::Packet &pkt, cv::cuda::GpuMat &out) {
-cudaSetDevice(0);
+//cudaSetDevice(0);
UNIQUE_LOCK(mutex_,lk);
if (pkt.codec != codec_t::HEVC && pkt.codec != codec_t::H264 && pkt.codec != codec_t::HEVC_LOSSLESS && pkt.codec != codec_t::H264_LOSSLESS) return false;
......
......@@ -93,7 +93,7 @@ static uint64_t calculateBitrate(definition_t def, float ratescale) {
}
bool NvPipeEncoder::encode(const cv::cuda::GpuMat &in, ftl::codecs::Packet &pkt) {
-cudaSetDevice(0);
+//cudaSetDevice(0);
if (pkt.codec != codec_t::Any && !supports(pkt.codec)) {
pkt.codec = codec_t::Invalid;
......
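With the hard-coded cudaSetDevice(0) commented out in both NvPipeDecoder::decode and NvPipeEncoder::encode, the codecs now inherit whichever device the calling thread selected at startup rather than overriding it. If a sanity check is wanted, a sketch of a hypothetical helper that verifies the active device instead of forcing it:

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical check: confirm the calling thread is on the expected device.
bool checkActiveDevice(int expected) {
    int dev = -1;
    cudaError_t err = cudaGetDevice(&dev);
    if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    if (dev != expected) {
        std::fprintf(stderr, "Unexpected CUDA device %d (expected %d)\n", dev, expected);
        return false;
    }
    return true;
}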
......@@ -28,6 +28,12 @@ bool hasCompute(int major, int minor);
int deviceCount();
int getDevice();
void setDevice(int);
void setDevice();
template <typename T>
struct Float;
......
......@@ -2,6 +2,7 @@
using ftl::cuda::TextureObjectBase;
static int dev_to_use = 0;
static int dev_count = 0;
static std::vector<cudaDeviceProp> properties;
......@@ -36,6 +37,19 @@ int ftl::cuda::deviceCount() {
return dev_count;
}
int ftl::cuda::getDevice() {
return dev_to_use;
}
void ftl::cuda::setDevice(int id) {
dev_to_use = id;
ftl::cuda::setDevice();
}
void ftl::cuda::setDevice() {
cudaSafeCall(cudaSetDevice(dev_to_use));
}
TextureObjectBase::~TextureObjectBase() {
free();
}
......
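The setDevice(int)/setDevice() pair above caches the chosen device index in module-level state so that any thread can re-apply the same choice to itself later (as threadSetCUDADevice() does). A self-contained sketch of the same pattern; g_device, initDevices(), selectDevice() and applyDevice() are placeholder names, and how the commit actually fills its properties vector is not shown in this hunk:

#include <cuda_runtime.h>
#include <vector>

// Process-wide record of the chosen device, mirroring dev_to_use above.
static int g_device = 0;
static std::vector<cudaDeviceProp> g_properties;

void initDevices() {
    int count = 0;
    cudaGetDeviceCount(&count);
    g_properties.resize(count);
    for (int i = 0; i < count; ++i) cudaGetDeviceProperties(&g_properties[i], i);
    g_device = (count > 0) ? count - 1 : 0;   // default to the last enumerated GPU
}

void selectDevice(int id) { g_device = id; cudaSetDevice(g_device); }
void applyDevice()        { cudaSetDevice(g_device); }   // callable from any thread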
......@@ -140,6 +140,11 @@ void Receiver::_processAudio(const StreamPacket &spkt, const Packet &pkt) {
}
void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
//ftl::cuda::setDevice();
//int dev;
//cudaSafeCall(cudaGetDevice(&dev));
//LOG(INFO) << "Cuda device = " << dev;
const ftl::codecs::Channel rchan = spkt.channel;
const unsigned int channum = (unsigned int)spkt.channel;
InternalVideoStates &iframe = _getVideoFrame(spkt);
......@@ -179,6 +184,7 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
//LOG(INFO) << "Decode surface: " << (width*tx) << "x" << (height*ty);
auto &surface = iframe.surface[static_cast<int>(spkt.channel)];
surface.create(height*ty, width*tx, ((isFloatChannel(spkt.channel)) ? ((pkt.flags & 0x2) ? CV_16UC4 : CV_16U) : CV_8UC4));
// Do the actual decode
......@@ -200,6 +206,8 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
//return;
}
auto cvstream = cv::cuda::StreamAccessor::wrapStream(decoder->stream());
/*if (spkt.channel == Channel::Depth && (pkt.flags & 0x2)) {
cv::Mat tmp;
surface.download(tmp);
......@@ -212,19 +220,19 @@ void Receiver::_processVideo(const StreamPacket &spkt, const Packet &pkt) {
cv::Rect roi((i % tx)*width, (i / tx)*height, width, height);
cv::cuda::GpuMat sroi = surface(roi);
// Do colour conversion
if (isFloatChannel(rchan) && (pkt.flags & 0x2)) {
//LOG(INFO) << "VUYA Convert";
-ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, decoder->stream());
+ftl::cuda::vuya_to_depth(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), sroi, 16.0f, cvstream);
} else if (isFloatChannel(rchan)) {
-sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, decoder->stream());
+sroi.convertTo(frame.frame.get<cv::cuda::GpuMat>(spkt.channel), CV_32FC1, 1.0f/1000.0f, cvstream);
} else {
-cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, decoder->stream());
+cv::cuda::cvtColor(sroi, frame.frame.get<cv::cuda::GpuMat>(spkt.channel), cv::COLOR_RGBA2BGRA, 0, cvstream);
}
}
-decoder->stream().waitForCompletion();
+cudaSafeCall(cudaStreamSynchronize(decoder->stream()));
for (int i=0; i<pkt.frame_count; ++i) {
InternalVideoStates &frame = _getVideoFrame(spkt,i);
......
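The receiver change above wraps the decoder's raw stream once, queues all per-tile conversions on it, and synchronises a single time with cudaStreamSynchronize() in place of cv::cuda::Stream::waitForCompletion(). A condensed sketch of that pattern; convertTiles() and its parameters are hypothetical, and the real code also handles the depth and VUYA channels:

#include <cuda_runtime.h>
#include <vector>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <opencv2/cudaimgproc.hpp>

// Queue one colour conversion per tile of a packed decode surface on the
// decoder's stream, then block once at the end instead of once per tile.
void convertTiles(const cv::cuda::GpuMat &surface, std::vector<cv::cuda::GpuMat> &out,
                  int tx, int ty, int width, int height, cudaStream_t raw) {
    cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(raw);
    out.resize(tx * ty);
    for (int i = 0; i < tx * ty; ++i) {
        cv::Rect roi((i % tx) * width, (i / tx) * height, width, height);
        cv::cuda::cvtColor(surface(roi), out[i], cv::COLOR_RGBA2BGRA, 0, cvstream);
    }
    cudaStreamSynchronize(raw);   // equivalent of the old waitForCompletion()
}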