diff --git a/components/rgbd-sources/CMakeLists.txt b/components/rgbd-sources/CMakeLists.txt
index ff7d290981c1a84c5ee211219ae0651f85c58c77..3ac2e2de5f173a12bd7107d8d40b5d2eca3b4b5d 100644
--- a/components/rgbd-sources/CMakeLists.txt
+++ b/components/rgbd-sources/CMakeLists.txt
@@ -37,6 +37,7 @@ endif (LIBSGM_FOUND)
 if (CUDA_FOUND)
 	list(APPEND RGBDSRC
 		src/algorithms/disp2depth.cu
+		src/algorithms/offilter.cu
 		# "src/algorithms/opencv_cuda_bm.cpp"
 		# "src/algorithms/opencv_cuda_bp.cpp"
 		# "src/algorithms/rtcensus.cu"
diff --git a/components/rgbd-sources/include/ftl/offilter.hpp b/components/rgbd-sources/include/ftl/offilter.hpp
index 4c4fbbb9837ef39e80f9aad68c62ae5d82d4671e..c5e7a9cf7b02d11906043f8652ce615fe1ab5f52 100644
--- a/components/rgbd-sources/include/ftl/offilter.hpp
+++ b/components/rgbd-sources/include/ftl/offilter.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ftl/config.h>
+#include <ftl/rgbd/frame.hpp>
 
 #ifdef HAVE_OPTFLOW
 #include <opencv2/core.hpp>
@@ -12,23 +13,15 @@ namespace rgbd {
 class OFDisparityFilter {
 	public:
-	OFDisparityFilter() : n_max_(0), threshold_(0.0), size_(0, 0) {} // TODO: invalid state
+	OFDisparityFilter() : n_max_(0), threshold_(0.0) {}
 	OFDisparityFilter(cv::Size size, int n_frames, float threshold);
 
-	void filter(cv::Mat &disp, const cv::Mat &rgb);
+	void filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream);
 
 	private:
-	int n_;
 	int n_max_;
 	float threshold_;
-	cv::Size size_;
 
-	cv::Mat disp_;
-	cv::Mat gray_;
-
-	cv::Mat flowxy_;
-	cv::Mat flowxy_up_;
-
-	cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof_;
+	cv::cuda::GpuMat disp_old_;
 };
 
 }
diff --git a/components/rgbd-sources/include/qsort.h b/components/rgbd-sources/include/qsort.h
new file mode 100644
index 0000000000000000000000000000000000000000..62a76b836c1b13861fc8a5a12ff0fc0eb7936f8a
--- /dev/null
+++ b/components/rgbd-sources/include/qsort.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2013, 2017 Alexey Tourbin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This is a traditional Quicksort implementation which mostly follows
+ * [Sedgewick 1978]. Sorting is performed entirely on array indices,
+ * while actual access to the array elements is abstracted out with the
+ * user-defined `LESS` and `SWAP` primitives.
+ *
+ * Synopsis:
+ *	QSORT(N, LESS, SWAP);
+ * where
+ *	N - the number of elements in A[];
+ *	LESS(i, j) - compares A[i] to A[j];
+ *	SWAP(i, j) - exchanges A[i] with A[j].
+ */
+
+#ifndef QSORT_H
+#define QSORT_H
+
+/* Sort 3 elements. */
+#define Q_SORT3(q_a1, q_a2, q_a3, Q_LESS, Q_SWAP) \
+do { \
+	if (Q_LESS(q_a2, q_a1)) { \
+		if (Q_LESS(q_a3, q_a2)) \
+			Q_SWAP(q_a1, q_a3); \
+		else { \
+			Q_SWAP(q_a1, q_a2); \
+			if (Q_LESS(q_a3, q_a2)) \
+				Q_SWAP(q_a2, q_a3); \
+		} \
+	} \
+	else if (Q_LESS(q_a3, q_a2)) { \
+		Q_SWAP(q_a2, q_a3); \
+		if (Q_LESS(q_a2, q_a1)) \
+			Q_SWAP(q_a1, q_a2); \
+	} \
+} while (0)
+
+/* Partition [q_l,q_r] around a pivot. After partitioning,
+ * [q_l,q_j] are the elements that are less than or equal to the pivot,
+ * while [q_i,q_r] are the elements greater than or equal to the pivot. */
+#define Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP) \
+do { \
+	/* The middle element, not to be confused with the median. */ \
+	Q_UINT q_m = q_l + ((q_r - q_l) >> 1); \
+	/* Reorder the second, the middle, and the last items. \
+	 * As [Edelkamp Weiss 2016] explain, using the second element \
+	 * instead of the first one helps avoid bad behaviour for \
+	 * decreasingly sorted arrays. This method is used in recent \
+	 * versions of gcc's std::sort, see gcc bug 58437#c13, although \
+	 * the details are somewhat different (cf. #c14). */ \
+	Q_SORT3(q_l + 1, q_m, q_r, Q_LESS, Q_SWAP); \
+	/* Place the median at the beginning. */ \
+	Q_SWAP(q_l, q_m); \
+	/* Partition [q_l+2, q_r-1] around the median which is in q_l. \
+	 * q_i and q_j are initially off by one, they get decremented \
+	 * in the do-while loops. */ \
+	q_i = q_l + 1; q_j = q_r; \
+	while (1) { \
+		do q_i++; while (Q_LESS(q_i, q_l)); \
+		do q_j--; while (Q_LESS(q_l, q_j)); \
+		if (q_i >= q_j) break; /* Sedgewick says "until j < i" */ \
+		Q_SWAP(q_i, q_j); \
+	} \
+	/* Compensate for the i==j case. */ \
+	q_i = q_j + 1; \
+	/* Put the median to its final place. */ \
+	Q_SWAP(q_l, q_j); \
+	/* The median is not part of the left subfile. */ \
+	q_j--; \
+} while (0)
+
+/* Insertion sort is applied to small subfiles - this is contrary to
+ * Sedgewick's suggestion to run a separate insertion sort pass after
+ * the partitioning is done. The reason I don't like a separate pass
+ * is that it triggers extra comparisons, because it can't see that the
+ * medians are already in their final positions and need not be rechecked.
+ * Since I do not assume that comparisons are cheap, I also do not try
+ * to eliminate the (q_j > q_l) boundary check. */
+#define Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP) \
+do { \
+	Q_UINT q_i, q_j; \
+	/* For each item starting with the second... */ \
+	for (q_i = q_l + 1; q_i <= q_r; q_i++) \
+		/* move it down the array so that the first part is sorted. */ \
+		for (q_j = q_i; q_j > q_l && (Q_LESS(q_j, q_j - 1)); q_j--) \
+			Q_SWAP(q_j, q_j - 1); \
+} while (0)
+
+/* When the size of [q_l,q_r], i.e. q_r-q_l+1, is greater than or equal to
+ * Q_THRESH, the algorithm performs recursive partitioning. When the size
+ * drops below Q_THRESH, the algorithm switches to insertion sort.
+ * The minimum valid value is probably 5 (with 5 items, the second and
+ * the middle items, the middle itself being rounded down, are distinct). */
+#define Q_THRESH 16
+
+/* The main loop. */
+#define Q_LOOP(Q_UINT, Q_N, Q_LESS, Q_SWAP) \
+do { \
+	Q_UINT q_l = 0; \
+	Q_UINT q_r = (Q_N) - 1; \
+	Q_UINT q_sp = 0; /* the number of frames pushed to the stack */ \
+	struct { Q_UINT q_l, q_r; } \
+		/* On 32-bit platforms, to sort a "char[3GB+]" array, \
+		 * it may take full 32 stack frames. On 64-bit CPUs, \
+		 * though, the address space is limited to 48 bits. \
+		 * The usage is further reduced if Q_N has a 32-bit type. */ \
+		q_st[sizeof(Q_UINT) > 4 && sizeof(Q_N) > 4 ? 48 : 32]; \
+	while (1) { \
+		if (q_r - q_l + 1 >= Q_THRESH) { \
+			Q_UINT q_i, q_j; \
+			Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP); \
+			/* Now have two subfiles: [q_l,q_j] and [q_i,q_r]. \
+			 * Dealing with them depends on which one is bigger. */ \
+			if (q_j - q_l >= q_r - q_i) \
+				Q_SUBFILES(q_l, q_j, q_i, q_r); \
+			else \
+				Q_SUBFILES(q_i, q_r, q_l, q_j); \
+		} \
+		else { \
+			Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP); \
+			/* Pop subfiles from the stack, until it gets empty. */ \
+			if (q_sp == 0) break; \
+			q_sp--; \
+			q_l = q_st[q_sp].q_l; \
+			q_r = q_st[q_sp].q_r; \
+		} \
+	} \
+} while (0)
+
+/* The missing part: dealing with subfiles.
+ * Assumes that the first subfile is not smaller than the second. */
+#define Q_SUBFILES(q_l1, q_r1, q_l2, q_r2) \
+do { \
+	/* If the second subfile is only a single element, it needs \
+	 * no further processing. The first subfile will be processed \
+	 * on the next iteration (both subfiles cannot be only a single \
+	 * element, due to Q_THRESH). */ \
+	if (q_l2 == q_r2) { \
+		q_l = q_l1; \
+		q_r = q_r1; \
+	} \
+	else { \
+		/* Otherwise, both subfiles need processing. \
+		 * Push the larger subfile onto the stack. */ \
+		q_st[q_sp].q_l = q_l1; \
+		q_st[q_sp].q_r = q_r1; \
+		q_sp++; \
+		/* Process the smaller subfile on the next iteration. */ \
+		q_l = q_l2; \
+		q_r = q_r2; \
+	} \
+} while (0)
+
+/* And now, ladies and gentlemen, may I proudly present to you... */
+#define QSORT(Q_N, Q_LESS, Q_SWAP) \
+do { \
+	if ((Q_N) > 1) \
+		/* We could check sizeof(Q_N) and use "unsigned", but at least \
+		 * on x86_64, this has the performance penalty of up to 5%. */ \
+		Q_LOOP(unsigned long, Q_N, Q_LESS, Q_SWAP); \
+} while (0)
+
+#endif
+
+/* ex:set ts=8 sts=4 sw=4 noet: */
\ No newline at end of file
diff --git a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
index 782338fc2be41d56a4d6fcd4f4920f6d920e663f..de0ae29326cb8bd1d0e738e049abf40389b52040 100644
--- a/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
+++ b/components/rgbd-sources/src/algorithms/fixstars_sgm.cpp
@@ -131,11 +131,7 @@ void FixstarsSGM::compute(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 	dispt_scaled.convertTo(disp, CV_32F, 1.0f / 16.0f, stream);
 
 #ifdef HAVE_OPTFLOW
-	if (use_off_)
-	{
-		frame.getChannel<Mat>(ftl::rgbd::kChanDisparity);
-		off_.filter(frame.setChannel<Mat>(ftl::rgbd::kChanDisparity), Mat(lbw_));
-	}
+	if (use_off_) { off_.filter(frame, stream); }
 #endif
 }
 
diff --git a/components/rgbd-sources/src/algorithms/offilter.cu b/components/rgbd-sources/src/algorithms/offilter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6feee5ae1daf9fc8b4b165370dbb8646b1d7b8a1
--- /dev/null
+++ b/components/rgbd-sources/src/algorithms/offilter.cu
@@ -0,0 +1,91 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/rgbd/camera.hpp>
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#include <qsort.h>
+
+__device__ void quicksort(float A[], size_t n)
+{
+	float tmp;
+	#define LESS(i, j) A[i] < A[j]
+	#define SWAP(i, j) tmp = A[i], A[i] = A[j], A[j] = tmp
+	QSORT(n, LESS, SWAP);
+}
+
+template<typename T>
+__device__ static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
+
+static const int max_history = 32; // TODO dynamic shared memory
+
+__global__ void temporal_median_filter_kernel(
+	cv::cuda::PtrStepSz<float> disp,
+	cv::cuda::PtrStepSz<int16_t> optflow,
+	cv::cuda::PtrStepSz<float> history,
+	int n_max,
+	int16_t threshold,	// fixed point 10.5
+	float granularity	// 4 for Turing
+)
+{
+	float sorted[max_history]; // TODO: dynamic shared memory
+	for (STRIDE_Y(y, disp.rows)) {
+	for (STRIDE_X(x, disp.cols)) {
+
+		int flowx = optflow(round(y / granularity), 2 * round(x / granularity));
+		int flowy = optflow(round(y / granularity), 2 * round(x / granularity) + 1);
+
+		if ((abs(flowx) + abs(flowy)) > threshold)
+		{
+			// last element in history[x][y][t]
+			history(y, (x + 1) * n_max - 1) = 0.0;
+			return;
+		}
+
+		int count = history(y, (x + 1) * n_max - 1);
+		int n = count % (n_max - 1);
+
+		if (isValidDisparity(disp(y, x)))
+		{
+			history(y, (x + 1) * n_max - 1) += 1.0;
+			count++;
+			history(y, x * n_max + n) = disp(y, x);
+		}
+
+		int n_end = count;
+		if (n_end >= n_max) { n_end = n_max - 1; }
+
+		if (n_end != 0)
+		{
+			for (size_t i = 0; i < n_end; i++)
+			{
+				sorted[i] = history(y, x * n_max + i);
+			}
+
+			quicksort(sorted, n_end);
+			disp(y, x) = sorted[n_end / 2];
+		}
+	}}
+}
+
+namespace ftl {
+namespace cuda {
+
+void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
+					cv::cuda::GpuMat &history, int n, float threshold,
+					cv::cuda::Stream &stream)
+{
+	dim3 grid(1, 1, 1);
+	dim3 threads(128, 1, 1);
+	grid.x = cv::cuda::device::divUp(disp.cols, 128);
+	grid.y = cv::cuda::device::divUp(disp.rows, 1);
+
+	// TODO: dynamic shared memory
+	temporal_median_filter_kernel<<<grid, threads, 0, cv::cuda::StreamAccessor::getStream(stream)>>>
+		(	disp, optflow, history, n,
+			round(threshold * (1 << 5)),	// TODO: documentation; 10.5 format
+			4								// TODO: (4 pixels granularity for Turing)
+		);
+
+	cudaSafeCall(cudaGetLastError());
+}
+
+}
+}
\ No newline at end of file
diff --git a/components/rgbd-sources/src/cuda_algorithms.hpp b/components/rgbd-sources/src/cuda_algorithms.hpp
index 0aa7399c0a24035bdbbb0442b3c429ddd03d145c..439c16cfc21fef08086bb69b7e84b1c8b49fec74 100644
--- a/components/rgbd-sources/src/cuda_algorithms.hpp
+++ b/components/rgbd-sources/src/cuda_algorithms.hpp
@@ -41,6 +41,9 @@ namespace cuda {
 	void disparity_to_depth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat &depth,
 				const ftl::rgbd::Camera &c, cv::cuda::Stream &stream);
 
+	void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
+				cv::cuda::GpuMat &history, int n_max, float threshold,
+				cv::cuda::Stream &stream);
 }
 }
 
diff --git a/components/rgbd-sources/src/offilter.cpp b/components/rgbd-sources/src/offilter.cpp
index 03db4807a7f9fa045708d3cf17bf32556b7bf5b5..8344540b9f0dc1cea1ff62e694d7ed774201a5f5 100644
--- a/components/rgbd-sources/src/offilter.cpp
+++ b/components/rgbd-sources/src/offilter.cpp
@@ -1,4 +1,5 @@
 #include "ftl/offilter.hpp"
+#include "cuda_algorithms.hpp"
 
 #ifdef HAVE_OPTFLOW
 
@@ -14,101 +15,26 @@ using std::vector;
 template<typename T>
 static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
 
 OFDisparityFilter::OFDisparityFilter(Size size, int n_frames, float threshold) :
-		n_(0), n_max_(n_frames), threshold_(threshold), size_(size)
+		n_max_(n_frames + 1), threshold_(threshold)
 {
+	CHECK((n_max_ > 1) && (n_max_ <= 32)) << "History length must be between 0 and 31!";
+	disp_old_ = cv::cuda::GpuMat(cv::Size(size.width * n_max_, size.height), CV_32FC1);
-	disp_ = Mat::zeros(cv::Size(size.width * n_frames, size.height), CV_64FC1);
-	gray_ = Mat::zeros(size, CV_8UC1);
-
-	nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
+	/*nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
 		cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
-		true, false, false, 0);
+		true, false, false, 0);*/
 }
 
-void OFDisparityFilter::filter(Mat &disp, const Mat &gray)
+void OFDisparityFilter::filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
 {
+	const cv::cuda::GpuMat &optflow = frame.getChannel<cv::cuda::GpuMat>(kChanFlow, stream);
+	frame.getChannel<cv::cuda::GpuMat>(kChanDisparity, stream);
+	stream.waitForCompletion();
+	if (optflow.empty()) { return; }
 
-	const int n = n_;
-	n_ = (n_ + 1) % n_max_;
-
-	nvof_->calc(gray, gray_, flowxy_);
-	nvof_->upSampler(	flowxy_, size_.width, size_.height,
-						nvof_->getGridSize(), flowxy_up_);
-
-	CHECK(disp.type() == CV_32FC1);
-	CHECK(gray.type() == CV_8UC1);
-	CHECK(flowxy_up_.type() == CV_32FC2);
-
-	gray.copyTo(gray_);
-
-	vector<float> values(n_max_);
-
-	for (int y = 0; y < size_.height; y++)
-	{
-		float *D = disp_.ptr<float>(y);
-		float *d = disp.ptr<float>(y);
-		float *flow = flowxy_up_.ptr<float>(y);
-
-		for (int x = 0; x < size_.width; x++)
-		{
-			const float flow_l1 = abs(flow[2*x]) + abs(flow[2*x + 1]);
-
-			if (flow_l1 < threshold_)
-			{
-				values.clear();
-
-				if (isValidDisparity(d[x]))
-				{
-					bool updated = false;
-					for (int i = 0; i < n_max_; i++)
-					{
-						float &val = D[n_max_ * x + (n_max_ - i + n) % n_max_];
-						if (!isValidDisparity(val))
-						{
-							val = d[x];
-							updated = true;
-						}
-					}
-					if (!updated) { D[n_max_ * x + n] = d[x]; }
-				}
-
-				for (int i = 0; i < n_max_; i++)
-				{
-					float &val = D[n_max_ * x + i];
-					if (isValidDisparity(val)) { values.push_back(val); }
-				}
-
-				if (values.size() > 0) {
-					const auto median_it = values.begin() + values.size() / 2;
-					std::nth_element(values.begin(), median_it , values.end());
-					d[x] = *median_it;
-				}
-
-				/*
-				if (isValidDepth(d[x]) && isValidDepth(D[x]))
-				{
-					D[x] = D[x] * 0.66 + d[x] * (1.0 - 0.66);
-				}
-				if (isValidDepth(D[x]))
-				{
-					d[x] = D[x];
-				}
-				else
-				{
-					D[x] = d[x];
-				}
-				*/
-			}
-			else
-			{
-				for (int i = 0; i < n_max_; i++)
-				{
-					D[n_max_ * x + i] = 0.0;
-				}
-			}
-		}
-	}
+	cv::cuda::GpuMat &disp = frame.setChannel<cv::cuda::GpuMat>(kChanDisparity);
+	ftl::cuda::optflow_filter(disp, optflow, disp_old_, n_max_, threshold_, stream);
 }
 
 #endif  // HAVE_OPTFLOW
diff --git a/components/rgbd-sources/src/stereovideo.cpp b/components/rgbd-sources/src/stereovideo.cpp
index ebc72d854e8d35a480f3d4ef5873e8b58d5bb086..d718d1f7ed1f29bac101f639317671acf420a882 100644
--- a/components/rgbd-sources/src/stereovideo.cpp
+++ b/components/rgbd-sources/src/stereovideo.cpp
@@ -192,9 +192,9 @@ bool StereoVideoSource::retrieve() {
 
 	if (frames_[1].hasChannel(kChanLeftGray))
 	{
-		auto &left_gray_prev = frame.getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_);
+		auto &left_gray_prev = frames_[1].getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_);
 		auto &optflow = frame.setChannel<cv::cuda::GpuMat>(kChanFlow);
-		nvof_->calc(left_gray, left_gray_prev, optflow_, stream2_);
+		nvof_->calc(left_gray, left_gray_prev, optflow, stream2_);
 		// nvof_->upSampler() isn't implemented with CUDA
 		// cv::cuda::resize() does not work wiht 2-channel input
 		// cv::cuda::resize(optflow_, optflow, left.size(), 0.0, 0.0, cv::INTER_NEAREST, stream2_);
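Note on the vendored `qsort.h`: it sorts index pairs only and delegates all element access to caller-supplied `LESS`/`SWAP` macros, which is what makes it usable from the `__device__` `quicksort()` wrapper in `offilter.cu`. As an illustration only (not part of the patch; the function name is hypothetical), the same pattern on the host looks like this:

```cpp
// Host-side sketch of the QSORT macro usage, mirroring the __device__
// quicksort() wrapper added in offilter.cu.
#include <cstddef>
#include <qsort.h>

static void sort_floats(float *A, std::size_t n) {
	float tmp;
	#define LESS(i, j) (A[i] < A[j])                       // compare A[i] to A[j]
	#define SWAP(i, j) (tmp = A[i], A[i] = A[j], A[j] = tmp) // exchange A[i] and A[j]
	QSORT(n, LESS, SWAP);
	#undef LESS
	#undef SWAP
}
```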
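The new `temporal_median_filter_kernel` keeps `n_max` floats per pixel in one row of `disp_old_`: the first `n_max - 1` slots form a ring buffer of recent disparities and the last slot holds a running sample count; large flow magnitude resets the count, otherwise a valid current disparity is appended and the pixel is replaced by the median of the collected samples. A minimal CPU sketch of that per-pixel update, assuming the same layout (illustration only; the reset-on-motion branch is omitted and the function name is hypothetical):

```cpp
#include <algorithm>
#include <vector>

// `history` is the n_max-float slice of disp_old_ belonging to one pixel.
float temporal_median_cpu(std::vector<float> &history, int n_max, float disp) {
	float &count_slot = history[n_max - 1];          // last slot: running sample count
	int count = static_cast<int>(count_slot);
	int n = count % (n_max - 1);                     // ring-buffer write position

	bool valid = (0.0f < disp) && (disp < 256.0f);   // same validity test as the kernel
	if (valid) {
		count_slot += 1.0f;
		count++;
		history[n] = disp;                           // store the newest sample
	}

	int n_end = std::min(count, n_max - 1);          // number of samples collected so far
	if (n_end == 0) return disp;                     // nothing to filter against yet

	std::vector<float> sorted(history.begin(), history.begin() + n_end);
	std::nth_element(sorted.begin(), sorted.begin() + n_end / 2, sorted.end());
	return sorted[n_end / 2];                        // median of the collected samples
}
```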
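On the launch side, `optflow_filter` converts the pixel-space threshold into the scale of the raw optical-flow output: the kernel receives `int16_t` flow components which, per the patch's own comments, are in 10.5 fixed point (5 fractional bits), so the float threshold is multiplied by 2^5 = 32 before being compared against |flowx| + |flowy|. A small sketch of that conversion (illustration only; helper names are hypothetical):

```cpp
#include <cmath>
#include <cstdint>

// Pixel threshold -> 10.5 fixed point, matching round(threshold * (1 << 5))
// in the kernel launch.
int16_t threshold_to_fixed_10_5(float threshold_pixels) {
	return static_cast<int16_t>(std::round(threshold_pixels * (1 << 5)));
}

// Inverse mapping, useful when inspecting raw flow values.
float fixed_10_5_to_pixels(int16_t v) {
	return static_cast<float>(v) / 32.0f;
}

// Example: a 5.0 pixel threshold becomes 160 in 10.5 fixed point.
```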