diff --git a/applications/vision/src/main.cpp b/applications/vision/src/main.cpp index 7e93cf5dc8b237069d266804c11922d945aa9546..974cb398e784206f1ed98822b52281fece4059f7 100644 --- a/applications/vision/src/main.cpp +++ b/applications/vision/src/main.cpp @@ -62,6 +62,17 @@ using json = nlohmann::json; static bool quiet = false; +static void _cudaCallback(void *ud) { + auto *cb = (std::function<void()>*)ud; + (*cb)(); + delete cb; +} + +// TODO: Move this to a common location +static void cudaCallback(cudaStream_t stream, const std::function<void()> &cb) { + cudaSafeCall(cudaLaunchHostFunc(stream, _cudaCallback, (void*)(new std::function<void()>(cb)))); +} + static void run(ftl::Configurable *root) { Universe *net = ftl::create<Universe>(root, "net"); ftl::ctrl::Master ctrl(root, net); @@ -106,7 +117,7 @@ static void run(ftl::Configurable *root) { //LOG(INFO) << "LATENCY: " << float(latency)/1000.0f << "ms"; if (clock_adjust != 0) { - LOG(INFO) << "Clock adjustment: " << clock_adjust << ", latency=" << float(latency)/1000.0f << "ms"; + LOG(INFO) << "Clock adjustment: " << clock_adjust << ", latency=" << float(latency)/2000.0f << "ms"; ftl::timer::setClockAdjustment(clock_adjust); } }); @@ -222,22 +233,36 @@ static void run(ftl::Configurable *root) { fs->set(ftl::data::FSFlag::AUTO_SEND); - // Do all processing in another thread... - ftl::pool.push([sender,&stats_count,&latency,&frames,&stats_time,pipeline,&busy,fs](int id) mutable { - if (busy.test_and_set()) { - LOG(WARNING) << "Depth pipeline drop: " << fs->timestamp(); - fs->firstFrame().message(ftl::data::Message::Warning_PIPELINE_DROP, "Depth pipeline drop"); - return; - } - pipeline->apply(*fs, *fs); - busy.clear(); + if (busy.test_and_set()) { + LOG(WARNING) << "Depth pipeline drop: " << fs->timestamp(); + fs->firstFrame().message(ftl::data::Message::Warning_PIPELINE_DROP, "Depth pipeline drop"); + } else { + pipeline->apply(*fs, *fs, nullptr, fs->frames[0].stream()); - ++frames; - latency += float(ftl::timer::get_time() - fs->timestamp()); + cudaCallback(fs->frames[0].stream(), [fs,&frames,&latency,&busy]() { + busy.clear(); - // Destruct frameset as soon as possible to send the data... - if (fs->hasAnyChanged(Channel::Depth)) fs->flush(Channel::Depth); - const_cast<ftl::data::FrameSetPtr&>(fs).reset(); + // Must be in another thread and not callback directly + ftl::pool.push([fs,&frames,&latency](int id) { + if (fs->hasAnyChanged(Channel::Depth)) fs->flush(Channel::Depth); + ++frames; + latency += float(ftl::timer::get_time() - fs->timestamp()); + const_cast<ftl::data::FrameSetPtr&>(fs).reset(); + }); + + // FIXME: Possible issue if fs is flushed in this thread... + }); + } + + // Do some encoding (eg. colour) whilst pipeline runs + ftl::pool.push([fs,&stats_count,&latency,&frames,&stats_time,&busy](int id){ + if (fs->hasAnyChanged(Channel::Audio)) { + fs->flush(ftl::codecs::Channel::Audio); + } + + // Make sure upload has completed. + cudaSafeCall(cudaEventSynchronize(fs->frames[0].uploadEvent())); + fs->flush(ftl::codecs::Channel::Colour); if (!quiet && --stats_count <= 0) { latency /= float(frames); @@ -252,13 +277,6 @@ static void run(ftl::Configurable *root) { } }); - // Lock colour right now to encode in parallel, same for audio - ftl::pool.push([fs](int id){ fs->flush(ftl::codecs::Channel::Colour); }); - - if (fs->hasAnyChanged(Channel::Audio)) { - ftl::pool.push([fs](int id){ fs->flush(ftl::codecs::Channel::Audio); }); - } - return true; }); diff --git a/components/operators/include/ftl/operators/disparity.hpp b/components/operators/include/ftl/operators/disparity.hpp index e24759185a9cee76e99ea95baccb14a8aa6f2844..4aba9bbd8e179c86ddc0d6305fb14b3579a6fb23 100644 --- a/components/operators/include/ftl/operators/disparity.hpp +++ b/components/operators/include/ftl/operators/disparity.hpp @@ -8,6 +8,7 @@ #endif #include <opencv2/cudastereo.hpp> +#include <opencv2/cudafilters.hpp> #ifdef HAVE_LIBSGM #include <libsgm.h> @@ -57,6 +58,7 @@ class FixstarsSGM : public ftl::operators::Operator { bool updateParameters(); bool updateP2Parameters(); void computeP2(cudaStream_t &stream); + void _variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream); sgm::StereoSGM *ssgm_; cv::Size size_; @@ -71,6 +73,12 @@ class FixstarsSGM : public ftl::operators::Operator { cv::cuda::GpuMat weightsF_; cv::cuda::GpuMat edges_; cv::Ptr<cv::cuda::CannyEdgeDetector> canny_; + cv::Ptr<cv::cuda::Filter> filter_; + + cv::cuda::GpuMat im_; + cv::cuda::GpuMat im2_; + cv::cuda::GpuMat mean_; + cv::cuda::GpuMat mean2_; int P1_; int P2_; diff --git a/components/operators/src/depth.cpp b/components/operators/src/depth.cpp index 51b01cb86f02571212ef7ab7c4164504e6e68069..5344e3c729f14f91be40dcc3a56bd07946819ad1 100644 --- a/components/operators/src/depth.cpp +++ b/components/operators/src/depth.cpp @@ -144,7 +144,8 @@ void DepthChannel::_createPipeline(size_t size) { pipe_->append<ftl::operators::ColourChannels>("colour"); // Convert BGR to BGRA pipe_->append<ftl::operators::CrossSupport>("cross"); #ifdef HAVE_OPTFLOW - pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow, Channel::Colour2, Channel::Flow2); + // FIXME: OpenCV Nvidia OptFlow has a horrible implementation that causes device syncs + //pipe_->append<ftl::operators::NVOpticalFlow>("optflow", Channel::Colour, Channel::Flow, Channel::Colour2, Channel::Flow2); //if (size == 1) pipe_->append<ftl::operators::OpticalFlowTemporalSmoothing>("optflow_filter", Channel::Disparity); #endif #ifdef HAVE_LIBSGM diff --git a/components/operators/src/disparity/fixstars_sgm.cpp b/components/operators/src/disparity/fixstars_sgm.cpp index dd3da86981d5742cd24561fb98121d3a584b9405..176dc8d10bf23ba91ebc3cf1976b7e2e08a96766 100644 --- a/components/operators/src/disparity/fixstars_sgm.cpp +++ b/components/operators/src/disparity/fixstars_sgm.cpp @@ -19,27 +19,22 @@ using ftl::operators::FixstarsSGM; using ftl::operators::Buffer; -static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) { +void FixstarsSGM::_variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) { if (in.isGpuMat() && out.isGpuMat()) { - cv::cuda::GpuMat im; - cv::cuda::GpuMat im2; - cv::cuda::GpuMat mean; - cv::cuda::GpuMat mean2; - - mean.create(in.size(), CV_32FC1); - mean2.create(in.size(), CV_32FC1); - im2.create(in.size(), CV_32FC1); - in.getGpuMat().convertTo(im, CV_32FC1, cvstream); - - cv::cuda::multiply(im, im, im2, 1.0, CV_32FC1, cvstream); - auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize)); - filter->apply(im, mean, cvstream); // E[X] - filter->apply(im2, mean2, cvstream); // E[X^2] - cv::cuda::multiply(mean, mean, mean, 1.0, -1, cvstream); // (E[X])^2 + mean_.create(in.size(), CV_32FC1); + mean2_.create(in.size(), CV_32FC1); + im2_.create(in.size(), CV_32FC1); + in.getGpuMat().convertTo(im_, CV_32FC1, cvstream); + + cv::cuda::multiply(im_, im_, im2_, 1.0, CV_32FC1, cvstream); + if (!filter_) filter_ = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize)); + filter_->apply(im_, mean_, cvstream); // E[X] + filter_->apply(im2_, mean2_, cvstream); // E[X^2] + cv::cuda::multiply(mean_, mean_, mean_, 1.0, -1, cvstream); // (E[X])^2 // NOTE: floating point accuracy in subtraction // (cv::cuda::createBoxFilter only supports float and 8 bit integer types) - cv::cuda::subtract(mean2, mean, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2 + cv::cuda::subtract(mean2_, mean_, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2 } else { throw std::exception(); /* todo CPU version */ } } @@ -215,10 +210,10 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { bool use_variance = config()->value("use_variance", true); if (use_variance) { - variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream); + _variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream); float minweight = std::min(1.0f, std::max(0.0f, config()->value("var_minweight", 0.5f))); cv::cuda::normalize(weightsF_, weightsF_, minweight, 1.0, cv::NORM_MINMAX, -1, cv::noArray(), cvstream); - weightsF_.convertTo(weights_, CV_8UC1, 255.0f); + weightsF_.convertTo(weights_, CV_8UC1, 255.0f, cvstream); //if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step; ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream); @@ -242,7 +237,7 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) { } if (config()->value("show_P2_map", false)) { - cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA); + cv::cuda::cvtColor(P2_map_, out.get<GpuMat>(Channel::Colour), cv::COLOR_GRAY2BGRA, 0, cvstream); } if (config()->value("show_rpe", false)) { ftl::cuda::show_rpe(disp, in.set<GpuMat>(Channel::Left), r, 100.0f, stream); diff --git a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu index c1dd611c00e6830232a01d6f1eb86b643cd477cb..9ed95f6eab0831f12b502a36858c837f37edd8ad 100644 --- a/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu +++ b/components/operators/src/disparity/opencv/disparity_bilateral_filter.cu @@ -328,8 +328,8 @@ namespace ftl { namespace cuda { namespace device } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + //if (stream == 0) + // cudaSafeCall( cudaDeviceSynchronize() ); } // These are commented out since we don't use them and it slows compile diff --git a/components/operators/src/mask.cpp b/components/operators/src/mask.cpp index ca31069b2dfef3930b6e45ce328eb6be5bd970d7..3a2e7d8331f7fa27aad4c06973cab46f9fe1d23d 100644 --- a/components/operators/src/mask.cpp +++ b/components/operators/src/mask.cpp @@ -32,9 +32,10 @@ bool DiscontinuityMask::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaS } if (!out.hasChannel(Channel::Mask)) { + cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto &m = out.create<cv::cuda::GpuMat>(Channel::Mask); m.create(in.get<cv::cuda::GpuMat>(Channel::Depth).size(), CV_8UC1); - m.setTo(cv::Scalar(0)); + m.setTo(cv::Scalar(0), cvstream); } /*ftl::cuda::discontinuity( diff --git a/components/operators/src/weighting.cpp b/components/operators/src/weighting.cpp index a9d26b0dbe2a412b5202a535ae9f400c145e7fb1..33f1a922d7ec8306899db071c43fd5eeac94f4e8 100644 --- a/components/operators/src/weighting.cpp +++ b/components/operators/src/weighting.cpp @@ -45,9 +45,10 @@ bool PixelWeights::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream Channel dchan = (in.hasChannel(Channel::Depth)) ? Channel::Depth : Channel::GroundTruth; if (!out.hasChannel(Channel::Mask)) { + cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream); auto &m = out.create<cv::cuda::GpuMat>(Channel::Mask); m.create(in.get<cv::cuda::GpuMat>(dchan).size(), CV_8UC1); - m.setTo(cv::Scalar(0)); + m.setTo(cv::Scalar(0), cvstream); } if (output_normals) { diff --git a/components/rgbd-sources/src/frame.cpp b/components/rgbd-sources/src/frame.cpp index 722e3606a1c36322bf8f079bf21701959a696c74..7d10552533e9fe44f7132fa1a6cc86751ed91f79 100644 --- a/components/rgbd-sources/src/frame.cpp +++ b/components/rgbd-sources/src/frame.cpp @@ -96,6 +96,7 @@ cv::cuda::GpuMat &VideoFrame::setGPU() { void ftl::rgbd::Frame::upload(ftl::codecs::Channel c) { auto &vframe = set<VideoFrame>(c); const auto &cpumat = vframe.getCPU(); + LOG(WARNING) << "Sync Upload: " << int(c); vframe.createGPU().upload(cpumat); } diff --git a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp index c7572c8e79f3264d0a49acafeea5c8ad8f0db821..ae22bff07fa2645279cb92540af957ad7cec92fe 100644 --- a/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp +++ b/components/rgbd-sources/src/sources/stereovideo/stereovideo.cpp @@ -338,6 +338,7 @@ void StereoVideoSource::updateParameters(ftl::rgbd::Frame &frame) { bool StereoVideoSource::capture(int64_t ts) { cap_status_ = lsrc_->grab(); + if (!cap_status_) LOG(WARNING) << "Capture failed"; return cap_status_; } @@ -365,7 +366,7 @@ bool StereoVideoSource::retrieve(ftl::rgbd::Frame &frame) { do_update_params_ = false; } - auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream_); + auto cvstream = cv::cuda::StreamAccessor::wrapStream(frame.stream()); if (lsrc_->isStereo()) { cv::cuda::GpuMat &left = frame.create<cv::cuda::GpuMat>(Channel::Left); @@ -386,7 +387,10 @@ bool StereoVideoSource::retrieve(ftl::rgbd::Frame &frame) { //LOG(INFO) << "Channel size: " << hres.size(); //pipeline_input_->apply(frame, frame, nullptr, stream_); - cudaSafeCall(cudaStreamSynchronize(stream_)); + //cudaSafeCall(cudaStreamSynchronize(stream_)); + + cudaSafeCall(cudaEventRecord(frame.uploadEvent(), frame.stream())); + // FIXME: Currently possible that previous upload not finished return true; } diff --git a/components/streams/src/feed.cpp b/components/streams/src/feed.cpp index e3d803609875b0583001c63ff5d95d31d5fc6416..bb0ef55f53e5ea74559a0b1fb8248884a9920af2 100644 --- a/components/streams/src/feed.cpp +++ b/components/streams/src/feed.cpp @@ -184,7 +184,8 @@ Feed::Feed(nlohmann::json &config, ftl::net::Universe*net) : lk.unlock(); - if (pipeline) pipeline->apply(*fs, *fs, 0); + if (pipeline) pipeline->apply(*fs, *fs, nullptr, fs->frames[0].stream()); + cudaSafeCall(cudaStreamSynchronize(fs->frames[0].stream())); lk.lock(); diff --git a/components/streams/src/sender.cpp b/components/streams/src/sender.cpp index 69e74f5cd3e01d1126fa71d15a8c47ca9d2d5598..44aadc932f563b82059a3e634ec7b11efcb9d783 100644 --- a/components/streams/src/sender.cpp +++ b/components/streams/src/sender.cpp @@ -793,7 +793,7 @@ int Sender::_generateTiles(const ftl::rgbd::FrameSet &fs, int offset, Channel c, } else { cv::Rect roi((count % tx)*rwidth, (count / tx)*rheight, rwidth, rheight); cv::cuda::GpuMat sroi = surface.surface(roi); - sroi.setTo(cv::Scalar(0)); + sroi.setTo(cv::Scalar(0), stream); } ++count; diff --git a/components/structures/include/ftl/data/new_frame.hpp b/components/structures/include/ftl/data/new_frame.hpp index 94c539086e6a24802cb4ca980345a36987c8be47..ebbceed9061a380e4ade9c5b6b011a163c3887b3 100644 --- a/components/structures/include/ftl/data/new_frame.hpp +++ b/components/structures/include/ftl/data/new_frame.hpp @@ -20,6 +20,8 @@ #include <ftl/handle.hpp> #include <ftl/data/messages.hpp> +#include <cuda_runtime.h> + template<typename T> struct is_list : public std::false_type {}; template<typename T> @@ -630,6 +632,14 @@ class Frame { inline FrameMode mode() const { return mode_; } + // ==== CUDA Functions ===================================================== + + cudaStream_t stream(); + + cudaEvent_t uploadEvent(); + + cudaEvent_t pipeEvent(); + // ==== Wrapper functions ================================================== void message(ftl::data::Message code, const std::string &msg); @@ -681,6 +691,9 @@ class Frame { FrameStatus status_; FrameMode mode_ = FrameMode::PRIMARY; uint64_t available_ = 0; + cudaStream_t stream_=0; + cudaEvent_t upload_event_=0; + cudaEvent_t pipe_event_=0; inline void restart(int64_t ts) { timestamp_ = ts; diff --git a/components/structures/src/new_frame.cpp b/components/structures/src/new_frame.cpp index 576896702e3e524639d59f903341fce1bc3c3916..36a1c7d06cc59ea9a710bb6a0035fdb6951a9c70 100644 --- a/components/structures/src/new_frame.cpp +++ b/components/structures/src/new_frame.cpp @@ -1,6 +1,7 @@ #include <ftl/data/new_frame.hpp> #include <ftl/data/framepool.hpp> #include <ftl/timer.hpp> +#include <ftl/cuda_common.hpp> using ftl::data::Frame; using ftl::data::Session; @@ -84,6 +85,27 @@ Frame::~Frame() { } }; +cudaStream_t Frame::stream() { + if (stream_ == 0) { + cudaSafeCall( cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking) ); + } + return stream_; +} + +cudaEvent_t Frame::uploadEvent() { + if (upload_event_ == 0) { + cudaSafeCall( cudaEventCreate(&upload_event_) ); + } + return upload_event_; +} + +cudaEvent_t Frame::pipeEvent() { + if (pipe_event_ == 0) { + cudaSafeCall( cudaEventCreate(&pipe_event_) ); + } + return pipe_event_; +} + bool ftl::data::Frame::hasAll(const std::unordered_set<ftl::codecs::Channel> &cs) { for (auto &a : cs) { if (!has(a)) return false; @@ -346,6 +368,12 @@ void Frame::moveTo(Frame &f) { f.changed_ = std::move(changed_); f.packet_rx = (int)packet_rx; f.packet_tx = (int)packet_tx; + f.stream_ = stream_; + f.upload_event_ = upload_event_; + f.pipe_event_ = pipe_event_; + stream_ = 0; + pipe_event_ = 0; + upload_event_ = 0; status_ = FrameStatus::RELEASED; } diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu index 16567de556520b10a8fcbb3560bcd667ab1fb7b6..e5019ec3a67ddb1113ce2a7f1a873983359e5fb4 100644 --- a/lib/libsgm/src/path_aggregation.cu +++ b/lib/libsgm/src/path_aggregation.cu @@ -29,6 +29,7 @@ PathAggregation<MAX_DISPARITY>::PathAggregation() cudaStreamCreate(&m_streams[i]); cudaEventCreate(&m_events[i]); } + cudaEventCreate(&m_event); } template <size_t MAX_DISPARITY> @@ -38,6 +39,7 @@ PathAggregation<MAX_DISPARITY>::~PathAggregation(){ cudaStreamDestroy(m_streams[i]); cudaEventDestroy(m_events[i]); } + cudaEventDestroy(m_event); } template <size_t MAX_DISPARITY> @@ -58,7 +60,13 @@ void PathAggregation<MAX_DISPARITY>::enqueue( m_cost_buffer = DeviceBuffer<cost_type>(buffer_size); } const size_t buffer_step = width * height * MAX_DISPARITY; - cudaStreamSynchronize(stream); + //cudaStreamSynchronize(stream); + cudaEventRecord(m_event, stream); + + for(unsigned int i = 0; i < NUM_PATHS; ++i){ + cudaStreamWaitEvent(m_streams[i], m_event, 0); + } + path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>( m_cost_buffer.data() + 0 * buffer_step, left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]); diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp index 0b019a3b556fb92969acff843ad7431bc3d57b0e..7df96996da46711f7ddcb76dcbc44132ca9f194c 100644 --- a/lib/libsgm/src/path_aggregation.hpp +++ b/lib/libsgm/src/path_aggregation.hpp @@ -31,6 +31,7 @@ private: DeviceBuffer<cost_type> m_cost_buffer; cudaStream_t m_streams[NUM_PATHS]; cudaEvent_t m_events[NUM_PATHS]; + cudaEvent_t m_event; public: PathAggregation();