Newer
Older
/* Created by Nicolas Pope and Sebastian Hahta
*
* Implementation of algorithm presented in article(s):
*
* [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable
* for embedded real-time systems
* [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based
* Stereo Matching Algorithm on Embedded and Multi-Core Hardware
* [3] Humenberger, Engelke, Kubinger: A Census-Based Stereo Vision Algorithm Using Modified Semi-Global Matching
* and Plane Fitting to Improve Matching Quality.
*
* Equation numbering uses [1] unless otherwise stated
*/
#include <ftl/algorithms/rtcensus.hpp>
using ftl::algorithms::RTCensus;
using cv::Mat;
using cv::Point;
using cv::Size;
using std::tuple;
using std::get;
using std::make_tuple;
using std::bitset;
static ftl::Disparity::Register rtcensus("rtcensus", RTCensus::create);
/* (8) and (16) */
/* (14) and (15), based on (9) */
static vector<uint64_t> sparse_census_16x16(const Mat &arr) {
vector<uint64_t> result;
result.resize(arr.cols*arr.rows,0);
/* Loops adapted to avoid edge out-of-bounds checks */
for (size_t v=7; v<arr.rows-7; v++) {
for (size_t u=7; u<arr.cols-7; u++) {
uint64_t r = 0;
/* 16x16 sparse kernel to 8x8 mask (64 bits) */
for (int n=-7; n<=7; n+=2) {
auto u_ = u + n;
for (int m=-7; m<=7; m+=2) {
r |= XHI(arr.at<uint8_t>(v,u), arr.at<uint8_t>(v_,u_));
}
}
/*
* (19) note: M and N not the same as in (17), see also (8) in [2].
* DSI: Disparity Space Image. LTR/RTL matching can be with sign +1/-1
*/
static void dsi_ca(vector<uint16_t> &result, const vector<uint64_t> &census_R, const vector<uint64_t> &census_L, size_t w, size_t h, size_t d_start, size_t d_stop, int sign=1) {
assert( census_R.size() == w*h);
assert( census_L.size() == w*h);
assert( d_stop-d_start > 0 );
auto ds = d_stop - d_start; // Number of disparities to check
result.resize(census_R.size()*ds, 0);
// Change bounds depending upon disparity direction
// Adapt bounds to avoid out-of-bounds checks
for (size_t v=2; v<h-2; v++) {
for (size_t u=(sign>0)?2:ds+2; u<eu; u++) {
const size_t ix = v*w*ds+u*ds;
// 5x5 window size
for (int n=-2; n<=2; n++) {
const auto u_ = u + n;
for (int m=-2; m<=2; m++) {
const auto v_ = (v + m)*w;
auto r = census_R[u_+v_];
for (size_t d=0; d<ds; d++) {
const auto d_ = d * sign;
result[ix+d] += bitset<64>(r^l).count(); // Hamming distance
/*
* Find the minimum value in a sub array.
* TODO This can be removed entirely (see CUDA version)
*/
static size_t arrmin(vector<uint16_t> &a, size_t ix, size_t len) {
uint32_t m = UINT32_MAX;
size_t mi = 0;
for (size_t i=ix; i<ix+len; i++) {
if (a[i] < m) {
m = a[i];
mi = i;
}
/*
* WTA + subpixel disparity (parabilic fitting) (20)
* TODO remove need to pass tuples (see CUDA version)
*/
static inline double fit_parabola(tuple<size_t,uint16_t> p, tuple<size_t,uint16_t> pl, tuple<size_t,uint16_t> pr) {
double a = get<1>(pr) - get<1>(pl);
double b = 2 * (2 * get<1>(p) - get<1>(pl) - get<1>(pr));
return static_cast<double>(get<0>(p)) + (a / b);
static cv::Mat d_sub(vector<uint16_t> &dsi, size_t w, size_t h, size_t ds) {
for (size_t v=0; v<h; v++) {
const size_t vwds = v * w * ds;
for (size_t u=0; u<w; u++) {
const size_t uds = u*ds;
auto d_min = arrmin(dsi, vwds+uds, ds);
double d_sub;
if (d_min == 0 || d_min == ds-1) d_sub = d_min;
else {
// TODO Remove use of tuples
auto p = make_tuple(d_min, dsi[d_min+vwds+uds]);
auto pl = make_tuple(d_min-1, dsi[d_min-1+vwds+uds]);
auto pr = make_tuple(d_min+1, dsi[d_min+1+vwds+uds]);
// TODO Parameter pass not return
/*
* consistency between LR and RL disparity (23) and (24)
*/
static cv::Mat consistency(cv::Mat &d_sub_r, cv::Mat &d_sub_l) {
size_t w = d_sub_r.cols;
size_t h = d_sub_r.rows;
Mat result = Mat::zeros(Size(w,h), CV_32FC1);
for (size_t v=0; v<h; v++) {
for (size_t u=0; u<w; u++) {
auto a = (int)(d_sub_l.at<double>(v,u));
if (u-a < 0) continue;
auto b = d_sub_r.at<double>(v,u-a);
if (std::abs(a-b) <= 1.0) result.at<float>(v,u) = std::abs((a+b)/2);
else result.at<float>(v,u) = 0.0f;
RTCensus::RTCensus(nlohmann::json &config)
: Disparity(config),
gamma_(0.0f),
tau_(0.0f),
use_cuda_(config.value("use_cuda",true)),
alternate_(false) {}
/*
* Choose the implementation and perform disparity calculation.
*/
void RTCensus::compute(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) {
#if defined HAVE_CUDA
if (use_cuda_) {
computeCUDA(l,r,disp);
} else {
computeCPU(l,r,disp);
}
#else // !HAVE_CUDA
computeCPU(l,r,disp);
#endif
}
void RTCensus::computeCPU(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) {
size_t d_min = min_disp_;
size_t d_max = max_disp_;
Mat lbw, rbw;
cv::cvtColor(l, lbw, cv::COLOR_BGR2GRAY);
cv::cvtColor(r, rbw, cv::COLOR_BGR2GRAY);
auto start = std::chrono::high_resolution_clock::now();
auto census_R = sparse_census_16x16(rbw);
auto census_L = sparse_census_16x16(lbw);
std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
LOG(INFO) << "Census in " << elapsed.count() << "s";
start = std::chrono::high_resolution_clock::now();
vector<uint16_t> dsi_ca_R,dsi_ca_L;
dsi_ca(dsi_ca_R,census_R, census_L, l.cols, l.rows, d_min, d_max);
dsi_ca(dsi_ca_L,census_L, census_R, l.cols, l.rows, d_min, d_max, -1);
elapsed = std::chrono::high_resolution_clock::now() - start;
LOG(INFO) << "DSI in " << elapsed.count() << "s";
auto disp_R = d_sub(dsi_ca_R, l.cols, l.rows, d_max-d_min);
auto disp_L = d_sub(dsi_ca_L, l.cols, l.rows, d_max-d_min);
LOG(INFO) << "Disp done";
disp = consistency(disp_R, disp_L);
#if defined HAVE_CUDA
using namespace cv::cuda;
using namespace cv;
#include <vector_types.h>
namespace ftl { namespace gpu {
void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &s=0);
}}
void RTCensus::computeCUDA(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) {
// Initialise gpu memory here because we need image size
if (disp_.empty()) disp_ = cuda::GpuMat(l.size(), CV_32FC1);
if (left_.empty()) left_ = cuda::GpuMat(l.size(), CV_8UC4);
if (left2_.empty()) left2_ = cuda::GpuMat(l.size(), CV_8UC4);
if (right_.empty()) right_ = cuda::GpuMat(l.size(), CV_8UC4);
Mat lhsv, rhsv;
cv::cvtColor(l, lhsv, COLOR_BGR2HSV);
cv::cvtColor(r, rhsv, COLOR_BGR2HSV);
int from_to[] = {0,0,1,1,2,2,-1,3};
Mat hsval(lhsv.size(), CV_8UC4);
Mat hsvar(rhsv.size(), CV_8UC4);
mixChannels(&lhsv, 1, &hsval, 1, from_to, 4);
mixChannels(&rhsv, 1, &hsvar, 1, from_to, 4);
// Send images to GPU
if (alternate_) left_.upload(hsval);
else left2_.upload(hsval);
right_.upload(hsvar);
auto start = std::chrono::high_resolution_clock::now();
ftl::gpu::rtcensus_call((alternate_)?left_:left2_, right_, disp_, max_disp_);
std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start;
LOG(INFO) << "CUDA census in " << elapsed.count() << "s";
alternate_ = !alternate_;
// Read disparity from GPU
disp_.download(disp);
}
#endif