Skip to content
Snippets Groups Projects
Commit 3f8005e1 authored by Sebastian Hahta's avatar Sebastian Hahta Committed by Nicolas Pope
Browse files

Place all channels to same Frame class.

Simplifies StereoVideoSource::swap() method.
parent 59cd828f
No related branches found
No related tags found
No related merge requests found
...@@ -37,6 +37,7 @@ endif (LIBSGM_FOUND) ...@@ -37,6 +37,7 @@ endif (LIBSGM_FOUND)
if (CUDA_FOUND) if (CUDA_FOUND)
list(APPEND RGBDSRC list(APPEND RGBDSRC
src/algorithms/disp2depth.cu src/algorithms/disp2depth.cu
src/algorithms/offilter.cu
# "src/algorithms/opencv_cuda_bm.cpp" # "src/algorithms/opencv_cuda_bm.cpp"
# "src/algorithms/opencv_cuda_bp.cpp" # "src/algorithms/opencv_cuda_bp.cpp"
# "src/algorithms/rtcensus.cu" # "src/algorithms/rtcensus.cu"
......
#pragma once #pragma once
#include <ftl/config.h> #include <ftl/config.h>
#include <ftl/rgbd/frame.hpp>
#ifdef HAVE_OPTFLOW #ifdef HAVE_OPTFLOW
#include <opencv2/core.hpp> #include <opencv2/core.hpp>
...@@ -12,23 +13,15 @@ namespace rgbd { ...@@ -12,23 +13,15 @@ namespace rgbd {
class OFDisparityFilter { class OFDisparityFilter {
public: public:
OFDisparityFilter() : n_max_(0), threshold_(0.0), size_(0, 0) {} // TODO: invalid state OFDisparityFilter() : n_max_(0), threshold_(0.0) {}
OFDisparityFilter(cv::Size size, int n_frames, float threshold); OFDisparityFilter(cv::Size size, int n_frames, float threshold);
void filter(cv::Mat &disp, const cv::Mat &rgb); void filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream);
private: private:
int n_;
int n_max_; int n_max_;
float threshold_; float threshold_;
cv::Size size_;
cv::Mat disp_; cv::cuda::GpuMat disp_old_;
cv::Mat gray_;
cv::Mat flowxy_;
cv::Mat flowxy_up_;
cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> nvof_;
}; };
} }
......
/*
* Copyright (c) 2013, 2017 Alexey Tourbin
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* This is a traditional Quicksort implementation which mostly follows
* [Sedgewick 1978]. Sorting is performed entirely on array indices,
* while actual access to the array elements is abstracted out with the
* user-defined `LESS` and `SWAP` primitives.
*
* Synopsis:
* QSORT(N, LESS, SWAP);
* where
* N - the number of elements in A[];
* LESS(i, j) - compares A[i] to A[j];
* SWAP(i, j) - exchanges A[i] with A[j].
*/
#ifndef QSORT_H
#define QSORT_H
/* Sort 3 elements. */
#define Q_SORT3(q_a1, q_a2, q_a3, Q_LESS, Q_SWAP) \
do { \
if (Q_LESS(q_a2, q_a1)) { \
if (Q_LESS(q_a3, q_a2)) \
Q_SWAP(q_a1, q_a3); \
else { \
Q_SWAP(q_a1, q_a2); \
if (Q_LESS(q_a3, q_a2)) \
Q_SWAP(q_a2, q_a3); \
} \
} \
else if (Q_LESS(q_a3, q_a2)) { \
Q_SWAP(q_a2, q_a3); \
if (Q_LESS(q_a2, q_a1)) \
Q_SWAP(q_a1, q_a2); \
} \
} while (0)
/* Partition [q_l,q_r] around a pivot. After partitioning,
* [q_l,q_j] are the elements that are less than or equal to the pivot,
* while [q_i,q_r] are the elements greater than or equal to the pivot. */
#define Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP) \
do { \
/* The middle element, not to be confused with the median. */ \
Q_UINT q_m = q_l + ((q_r - q_l) >> 1); \
/* Reorder the second, the middle, and the last items. \
* As [Edelkamp Weiss 2016] explain, using the second element \
* instead of the first one helps avoid bad behaviour for \
* decreasingly sorted arrays. This method is used in recent \
* versions of gcc's std::sort, see gcc bug 58437#c13, although \
* the details are somewhat different (cf. #c14). */ \
Q_SORT3(q_l + 1, q_m, q_r, Q_LESS, Q_SWAP); \
/* Place the median at the beginning. */ \
Q_SWAP(q_l, q_m); \
/* Partition [q_l+2, q_r-1] around the median which is in q_l. \
* q_i and q_j are initially off by one, they get decremented \
* in the do-while loops. */ \
q_i = q_l + 1; q_j = q_r; \
while (1) { \
do q_i++; while (Q_LESS(q_i, q_l)); \
do q_j--; while (Q_LESS(q_l, q_j)); \
if (q_i >= q_j) break; /* Sedgewick says "until j < i" */ \
Q_SWAP(q_i, q_j); \
} \
/* Compensate for the i==j case. */ \
q_i = q_j + 1; \
/* Put the median to its final place. */ \
Q_SWAP(q_l, q_j); \
/* The median is not part of the left subfile. */ \
q_j--; \
} while (0)
/* Insertion sort is applied to small subfiles - this is contrary to
* Sedgewick's suggestion to run a separate insertion sort pass after
* the partitioning is done. The reason I don't like a separate pass
* is that it triggers extra comparisons, because it can't see that the
* medians are already in their final positions and need not be rechecked.
* Since I do not assume that comparisons are cheap, I also do not try
* to eliminate the (q_j > q_l) boundary check. */
#define Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP) \
do { \
Q_UINT q_i, q_j; \
/* For each item starting with the second... */ \
for (q_i = q_l + 1; q_i <= q_r; q_i++) \
/* move it down the array so that the first part is sorted. */ \
for (q_j = q_i; q_j > q_l && (Q_LESS(q_j, q_j - 1)); q_j--) \
Q_SWAP(q_j, q_j - 1); \
} while (0)
/* When the size of [q_l,q_r], i.e. q_r-q_l+1, is greater than or equal to
* Q_THRESH, the algorithm performs recursive partitioning. When the size
* drops below Q_THRESH, the algorithm switches to insertion sort.
* The minimum valid value is probably 5 (with 5 items, the second and
* the middle items, the middle itself being rounded down, are distinct). */
#define Q_THRESH 16
/* The main loop. */
#define Q_LOOP(Q_UINT, Q_N, Q_LESS, Q_SWAP) \
do { \
Q_UINT q_l = 0; \
Q_UINT q_r = (Q_N) - 1; \
Q_UINT q_sp = 0; /* the number of frames pushed to the stack */ \
struct { Q_UINT q_l, q_r; } \
/* On 32-bit platforms, to sort a "char[3GB+]" array, \
* it may take full 32 stack frames. On 64-bit CPUs, \
* though, the address space is limited to 48 bits. \
* The usage is further reduced if Q_N has a 32-bit type. */ \
q_st[sizeof(Q_UINT) > 4 && sizeof(Q_N) > 4 ? 48 : 32]; \
while (1) { \
if (q_r - q_l + 1 >= Q_THRESH) { \
Q_UINT q_i, q_j; \
Q_PARTITION(q_l, q_r, q_i, q_j, Q_UINT, Q_LESS, Q_SWAP); \
/* Now have two subfiles: [q_l,q_j] and [q_i,q_r]. \
* Dealing with them depends on which one is bigger. */ \
if (q_j - q_l >= q_r - q_i) \
Q_SUBFILES(q_l, q_j, q_i, q_r); \
else \
Q_SUBFILES(q_i, q_r, q_l, q_j); \
} \
else { \
Q_INSERTION_SORT(q_l, q_r, Q_UINT, Q_LESS, Q_SWAP); \
/* Pop subfiles from the stack, until it gets empty. */ \
if (q_sp == 0) break; \
q_sp--; \
q_l = q_st[q_sp].q_l; \
q_r = q_st[q_sp].q_r; \
} \
} \
} while (0)
/* The missing part: dealing with subfiles.
* Assumes that the first subfile is not smaller than the second. */
#define Q_SUBFILES(q_l1, q_r1, q_l2, q_r2) \
do { \
/* If the second subfile is only a single element, it needs \
* no further processing. The first subfile will be processed \
* on the next iteration (both subfiles cannot be only a single \
* element, due to Q_THRESH). */ \
if (q_l2 == q_r2) { \
q_l = q_l1; \
q_r = q_r1; \
} \
else { \
/* Otherwise, both subfiles need processing. \
* Push the larger subfile onto the stack. */ \
q_st[q_sp].q_l = q_l1; \
q_st[q_sp].q_r = q_r1; \
q_sp++; \
/* Process the smaller subfile on the next iteration. */ \
q_l = q_l2; \
q_r = q_r2; \
} \
} while (0)
/* And now, ladies and gentlemen, may I proudly present to you... */
#define QSORT(Q_N, Q_LESS, Q_SWAP) \
do { \
if ((Q_N) > 1) \
/* We could check sizeof(Q_N) and use "unsigned", but at least \
* on x86_64, this has the performance penalty of up to 5%. */ \
Q_LOOP(unsigned long, Q_N, Q_LESS, Q_SWAP); \
} while (0)
#endif
/* ex:set ts=8 sts=4 sw=4 noet: */
\ No newline at end of file
...@@ -131,11 +131,7 @@ void FixstarsSGM::compute(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream) ...@@ -131,11 +131,7 @@ void FixstarsSGM::compute(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
dispt_scaled.convertTo(disp, CV_32F, 1.0f / 16.0f, stream); dispt_scaled.convertTo(disp, CV_32F, 1.0f / 16.0f, stream);
#ifdef HAVE_OPTFLOW #ifdef HAVE_OPTFLOW
if (use_off_) if (use_off_) { off_.filter(frame, stream); }
{
frame.getChannel<Mat>(ftl::rgbd::kChanDisparity);
off_.filter(frame.setChannel<Mat>(ftl::rgbd::kChanDisparity), Mat(lbw_));
}
#endif #endif
} }
......
#include <ftl/cuda_common.hpp>
#include <ftl/rgbd/camera.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <qsort.h>
__device__ void quicksort(float A[], size_t n)
{
float tmp;
#define LESS(i, j) A[i] < A[j]
#define SWAP(i, j) tmp = A[i], A[i] = A[j], A[j] = tmp
QSORT(n, LESS, SWAP);
}
template<typename T>
__device__ static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
static const int max_history = 32; // TODO dynamic shared memory
__global__ void temporal_median_filter_kernel(
cv::cuda::PtrStepSz<float> disp,
cv::cuda::PtrStepSz<int16_t> optflow,
cv::cuda::PtrStepSz<float> history,
int n_max,
int16_t threshold, // fixed point 10.5
float granularity // 4 for Turing
)
{
float sorted[max_history]; // TODO: dynamic shared memory
for (STRIDE_Y(y, disp.rows)) {
for (STRIDE_X(x, disp.cols)) {
int flowx = optflow(round(y / granularity), 2 * round(x / granularity));
int flowy = optflow(round(y / granularity), 2 * round(x / granularity) + 1);
if ((abs(flowx) + abs(flowy)) > threshold)
{
// last element in history[x][y][t]
history(y, (x + 1) * n_max - 1) = 0.0;
return;
}
int count = history(y, (x + 1) * n_max - 1);
int n = count % (n_max - 1);
if (isValidDisparity(disp(y, x)))
{
history(y, (x + 1) * n_max - 1) += 1.0;
count++;
history(y, x * n_max + n) = disp(y, x);
}
int n_end = count;
if (n_end >= n_max) { n_end = n_max - 1; }
if (n_end != 0)
{
for (size_t i = 0; i < n_end; i++)
{
sorted[i] = history(y, x * n_max + i);
}
quicksort(sorted, n_end);
disp(y, x) = sorted[n_end / 2];
}
}}
}
namespace ftl {
namespace cuda {
void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
cv::cuda::GpuMat &history, int n, float threshold,
cv::cuda::Stream &stream)
{
dim3 grid(1, 1, 1);
dim3 threads(128, 1, 1);
grid.x = cv::cuda::device::divUp(disp.cols, 128);
grid.y = cv::cuda::device::divUp(disp.rows, 1);
// TODO: dynamic shared memory
temporal_median_filter_kernel<<<grid, threads, 0, cv::cuda::StreamAccessor::getStream(stream)>>>
( disp, optflow, history, n,
round(threshold * (1 << 5)), // TODO: documentation; 10.5 format
4 // TODO: (4 pixels granularity for Turing)
);
cudaSafeCall(cudaGetLastError());
}
}
}
\ No newline at end of file
...@@ -41,6 +41,9 @@ namespace cuda { ...@@ -41,6 +41,9 @@ namespace cuda {
void disparity_to_depth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat &depth, void disparity_to_depth(const cv::cuda::GpuMat &disparity, cv::cuda::GpuMat &depth,
const ftl::rgbd::Camera &c, cv::cuda::Stream &stream); const ftl::rgbd::Camera &c, cv::cuda::Stream &stream);
void optflow_filter(cv::cuda::GpuMat &disp, const cv::cuda::GpuMat &optflow,
cv::cuda::GpuMat &history, int n_max, float threshold,
cv::cuda::Stream &stream);
} }
} }
......
#include "ftl/offilter.hpp" #include "ftl/offilter.hpp"
#include "cuda_algorithms.hpp"
#ifdef HAVE_OPTFLOW #ifdef HAVE_OPTFLOW
...@@ -14,101 +15,26 @@ using std::vector; ...@@ -14,101 +15,26 @@ using std::vector;
template<typename T> static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO template<typename T> static bool inline isValidDisparity(T d) { return (0.0 < d) && (d < 256.0); } // TODO
OFDisparityFilter::OFDisparityFilter(Size size, int n_frames, float threshold) : OFDisparityFilter::OFDisparityFilter(Size size, int n_frames, float threshold) :
n_(0), n_max_(n_frames), threshold_(threshold), size_(size) n_max_(n_frames + 1), threshold_(threshold)
{ {
CHECK((n_max_ > 1) && (n_max_ <= 32)) << "History length must be between 0 and 31!";
disp_old_ = cv::cuda::GpuMat(cv::Size(size.width * n_max_, size.height), CV_32FC1);
disp_ = Mat::zeros(cv::Size(size.width * n_frames, size.height), CV_64FC1); /*nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
gray_ = Mat::zeros(size, CV_8UC1);
nvof_ = cv::cuda::NvidiaOpticalFlow_1_0::create(size.width, size.height,
cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW, cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
true, false, false, 0); true, false, false, 0);*/
} }
void OFDisparityFilter::filter(Mat &disp, const Mat &gray) void OFDisparityFilter::filter(ftl::rgbd::Frame &frame, cv::cuda::Stream &stream)
{ {
const cv::cuda::GpuMat &optflow = frame.getChannel<cv::cuda::GpuMat>(kChanFlow, stream);
frame.getChannel<cv::cuda::GpuMat>(kChanDisparity, stream);
stream.waitForCompletion();
if (optflow.empty()) { return; }
const int n = n_; cv::cuda::GpuMat &disp = frame.setChannel<cv::cuda::GpuMat>(kChanDisparity);
n_ = (n_ + 1) % n_max_; ftl::cuda::optflow_filter(disp, optflow, disp_old_, n_max_, threshold_, stream);
nvof_->calc(gray, gray_, flowxy_);
nvof_->upSampler( flowxy_, size_.width, size_.height,
nvof_->getGridSize(), flowxy_up_);
CHECK(disp.type() == CV_32FC1);
CHECK(gray.type() == CV_8UC1);
CHECK(flowxy_up_.type() == CV_32FC2);
gray.copyTo(gray_);
vector<float> values(n_max_);
for (int y = 0; y < size_.height; y++)
{
float *D = disp_.ptr<float>(y);
float *d = disp.ptr<float>(y);
float *flow = flowxy_up_.ptr<float>(y);
for (int x = 0; x < size_.width; x++)
{
const float flow_l1 = abs(flow[2*x]) + abs(flow[2*x + 1]);
if (flow_l1 < threshold_)
{
values.clear();
if (isValidDisparity(d[x]))
{
bool updated = false;
for (int i = 0; i < n_max_; i++)
{
float &val = D[n_max_ * x + (n_max_ - i + n) % n_max_];
if (!isValidDisparity(val))
{
val = d[x];
updated = true;
}
}
if (!updated) { D[n_max_ * x + n] = d[x]; }
}
for (int i = 0; i < n_max_; i++)
{
float &val = D[n_max_ * x + i];
if (isValidDisparity(val)) { values.push_back(val); }
}
if (values.size() > 0) {
const auto median_it = values.begin() + values.size() / 2;
std::nth_element(values.begin(), median_it , values.end());
d[x] = *median_it;
}
/*
if (isValidDepth(d[x]) && isValidDepth(D[x]))
{
D[x] = D[x] * 0.66 + d[x] * (1.0 - 0.66);
}
if (isValidDepth(D[x]))
{
d[x] = D[x];
}
else
{
D[x] = d[x];
}
*/
}
else
{
for (int i = 0; i < n_max_; i++)
{
D[n_max_ * x + i] = 0.0;
}
}
}
}
} }
#endif // HAVE_OPTFLOW #endif // HAVE_OPTFLOW
...@@ -192,9 +192,9 @@ bool StereoVideoSource::retrieve() { ...@@ -192,9 +192,9 @@ bool StereoVideoSource::retrieve() {
if (frames_[1].hasChannel(kChanLeftGray)) if (frames_[1].hasChannel(kChanLeftGray))
{ {
auto &left_gray_prev = frame.getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_); auto &left_gray_prev = frames_[1].getChannel<cv::cuda::GpuMat>(kChanLeftGray, stream2_);
auto &optflow = frame.setChannel<cv::cuda::GpuMat>(kChanFlow); auto &optflow = frame.setChannel<cv::cuda::GpuMat>(kChanFlow);
nvof_->calc(left_gray, left_gray_prev, optflow_, stream2_); nvof_->calc(left_gray, left_gray_prev, optflow, stream2_);
// nvof_->upSampler() isn't implemented with CUDA // nvof_->upSampler() isn't implemented with CUDA
// cv::cuda::resize() does not work wiht 2-channel input // cv::cuda::resize() does not work wiht 2-channel input
// cv::cuda::resize(optflow_, optflow, left.size(), 0.0, 0.0, cv::INTER_NEAREST, stream2_); // cv::cuda::resize(optflow_, optflow, left.size(), 0.0, 0.0, cv::INTER_NEAREST, stream2_);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment