diff --git a/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp b/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp new file mode 100644 index 0000000000000000000000000000000000000000..83344affffd04217d0368a91062a616c5eacec7d --- /dev/null +++ b/cv-node/include/ftl/algorithms/rtcensus_sgm.hpp @@ -0,0 +1,49 @@ +#ifndef _FTL_ALGORITHMS_RTCENSUSSGM_HPP_ +#define _FTL_ALGORITHMS_RTCENSUSSGM_HPP_ + +#include <opencv2/core.hpp> +#include <opencv2/opencv.hpp> +#include <ftl/disparity.hpp> +#include <nlohmann/json.hpp> + +#if defined HAVE_CUDA +#include <opencv2/core/cuda.hpp> +#endif + +namespace ftl { +namespace algorithms { +class RTCensusSGM : public ftl::Disparity { + public: + RTCensusSGM(nlohmann::json &config); + + void setGamma(float gamma) { gamma_ = gamma; } + void setTau(float tau) { tau_ = tau; } + + void compute(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp); + + static inline Disparity *create(nlohmann::json &config) { return new RTCensusSGM(config); } + + private: + float gamma_; + float tau_; + bool use_cuda_; + bool alternate_; + + #if defined HAVE_CUDA + cv::cuda::GpuMat disp_; + cv::cuda::GpuMat filtered_; + cv::cuda::GpuMat left_; + cv::cuda::GpuMat left2_; + cv::cuda::GpuMat right_; + #endif + + private: + #if defined HAVE_CUDA + void computeCUDA(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp); + #endif +}; +}; +}; + +#endif // _FTL_ALGORITHMS_RTCENSUSSGM_HPP_ + diff --git a/cv-node/src/algorithms/rtcensus_sgm.cpp b/cv-node/src/algorithms/rtcensus_sgm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d6b93a08b397637a4871f029b1a8041ca1085517 --- /dev/null +++ b/cv-node/src/algorithms/rtcensus_sgm.cpp @@ -0,0 +1,103 @@ +/* Created by Nicolas Pope and Sebastian Hahta + * + * Implementation of algorithm presented in article(s): + * + * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable + * for embedded real-time systems + * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based + * Stereo Matching Algorithm on Embedded and Multi-Core Hardware + * [3] Humenberger, Engelke, Kubinger: A Census-Based Stereo Vision Algorithm Using Modified Semi-Global Matching + * and Plane Fitting to Improve Matching Quality. + * + * Equation numbering uses [1] unless otherwise stated + */ + + +#include <ftl/algorithms/rtcensus_sgm.hpp> +#include <vector> +#include <tuple> +#include <bitset> +#include <cmath> +#include <glog/logging.h> + +using ftl::algorithms::RTCensusSGM; +using std::vector; +using cv::Mat; +using cv::Point; +using cv::Size; +using std::tuple; +using std::get; +using std::make_tuple; +using std::bitset; + +static ftl::Disparity::Register rtcensus("rtcensus_sgm", RTCensusSGM::create); + +RTCensusSGM::RTCensusSGM(nlohmann::json &config) + : Disparity(config), + gamma_(0.0f), + tau_(0.0f), + use_cuda_(config.value("use_cuda",true)), + alternate_(false) {} + +/* + * Choose the implementation and perform disparity calculation. + */ +void RTCensusSGM::compute(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) { + #if defined HAVE_CUDA + if (use_cuda_) { + computeCUDA(l,r,disp); + } else { + //computeCPU(l,r,disp); + LOG(ERROR) << "RTCensus SGM requires CUDA"; + } + #else // !HAVE_CUDA + //computeCPU(l,r,disp); + LOG(ERROR) << "RTCensus SGM requires CUDA"; + #endif +} + +#if defined HAVE_CUDA + +using namespace cv::cuda; +using namespace cv; + +#include <vector_types.h> + +namespace ftl { namespace gpu { +void rtcensus_sgm_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &s=0); +}} + +void RTCensusSGM::computeCUDA(const cv::Mat &l, const cv::Mat &r, cv::Mat &disp) { + // Initialise gpu memory here because we need image size + if (disp_.empty()) disp_ = cuda::GpuMat(l.size(), CV_32FC1); + if (left_.empty()) left_ = cuda::GpuMat(l.size(), CV_8UC4); + if (left2_.empty()) left2_ = cuda::GpuMat(l.size(), CV_8UC4); + if (right_.empty()) right_ = cuda::GpuMat(l.size(), CV_8UC4); + + Mat lhsv, rhsv; + cv::cvtColor(l, lhsv, COLOR_BGR2HSV); + cv::cvtColor(r, rhsv, COLOR_BGR2HSV); + int from_to[] = {0,0,1,1,2,2,-1,3}; + Mat hsval(lhsv.size(), CV_8UC4); + Mat hsvar(rhsv.size(), CV_8UC4); + mixChannels(&lhsv, 1, &hsval, 1, from_to, 4); + mixChannels(&rhsv, 1, &hsvar, 1, from_to, 4); + + // Send images to GPU + if (alternate_) left_.upload(hsval); + else left2_.upload(hsval); + right_.upload(hsvar); + + auto start = std::chrono::high_resolution_clock::now(); + ftl::gpu::rtcensus_sgm_call((alternate_)?left_:left2_, right_, disp_, max_disp_); + std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - start; + LOG(INFO) << "CUDA rtcensus_sgm in " << elapsed.count() << "s"; + + alternate_ = !alternate_; + + // Read disparity from GPU + disp_.download(disp); +} + +#endif + diff --git a/cv-node/src/algorithms/rtcensus_sgm.cu b/cv-node/src/algorithms/rtcensus_sgm.cu new file mode 100644 index 0000000000000000000000000000000000000000..94b36ed1ce9ae91fa06f94d14e0ad2b7bea0dd21 --- /dev/null +++ b/cv-node/src/algorithms/rtcensus_sgm.cu @@ -0,0 +1,205 @@ +/* + * Author: Nicolas Pope and Sebastian Hahta (2019) + * Implementation of algorithm presented in article(s): + * + * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable + * for embedded real-time systems + * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based + * Stereo Matching Algorithm on Embedded and Multi-Core Hardware + * [3] Humenberger, Engelke, Kubinger: A Census-Based Stereo Vision Algorithm Using Modified Semi-Global Matching + * and Plane Fitting to Improve Matching Quality. + * + * Equation numbering uses [1] unless otherwise stated + * + */ + +#include <ftl/cuda_common.hpp> +#include <ftl/cuda_algorithms.hpp> + +using namespace cv::cuda; +using namespace cv; + + +#define BLOCK_W 60 +#define RADIUS 7 +#define RADIUS2 2 +#define ROWSperTHREAD 1 + +namespace ftl { +namespace gpu { + +// --- SUPPORT ----------------------------------------------------------------- + + +/* + * Parabolic interpolation between matched disparities either side. + * Results in subpixel disparity. (20). + */ +__device__ static float fit_parabola(size_t pi, uint16_t p, uint16_t pl, uint16_t pr) { + float a = pr - pl; + float b = 2 * (2 * p - pl - pr); + return static_cast<float>(pi) + (a / b); +} + +// --- KERNELS ----------------------------------------------------------------- + +/* Convert vector uint2 (32bit x2) into a single uint64_t */ +__forceinline__ __device__ static uint64_t uint2asull (uint2 a) { + uint64_t res; + asm ("mov.b64 %0, {%1,%2};" : "=l"(res) : "r"(a.x), "r"(a.y)); + return res; +} + +/* + * Generate left and right disparity images from census data. (18)(19)(25) + */ +__global__ static void disp_kernel(float *disp_l, float *disp_r, + int pitchL, int pitchR, + size_t width, size_t height, + cudaTextureObject_t censusL, cudaTextureObject_t censusR, + size_t ds) { + //extern __shared__ uint64_t cache[]; + + const int gamma = 35; + + int u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2; + int v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2; + int v_end = v_start + ROWSperTHREAD; + int maxdisp = ds; + + // Local cache + uint64_t l_cache_l1[5][5]; + uint64_t l_cache_l2[5][5]; + + if (v_end >= height) v_end = height; + if (u+maxdisp >= width) maxdisp = width-u; + + for (int v=v_start; v<v_end; v++) { + for (int m=-2; m<=2; m++) { + for (int n=-2; n<=2; n++) { + l_cache_l2[m+2][n+2] = uint2asull(tex2D<uint2>(censusL,u+n,v+m)); + l_cache_l1[m+2][n+2] = uint2asull(tex2D<uint2>(censusR,u+n,v+m)); + } + } + + uint16_t last_ham1 = 65535; + uint16_t last_ham2 = 65535; + uint16_t min_disp1 = 65535; + uint16_t min_disp2 = 65535; + uint16_t min_disp1b = 65535; + uint16_t min_disp2b = 65535; + uint16_t min_before1 = 0; + uint16_t min_before2 = 0; + uint16_t min_after1 = 0; + uint16_t min_after2 = 0; + int dix1 = 0; + int dix2 = 0; + + // (19) + for (int d=0; d<maxdisp; d++) { + uint16_t hamming1 = 0; + uint16_t hamming2 = 0; + + //if (u+2+ds >= width) break; + + for (int m=-2; m<=2; m++) { + const auto v_ = (v + m); + for (int n=-2; n<=2; n++) { + const auto u_ = u + n; + + // (18) + auto l1 = l_cache_l1[m+2][n+2]; + auto l2 = l_cache_l2[m+2][n+2]; + auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_)); + auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_)); + hamming1 += __popcll(r1^l1); + hamming2 += __popcll(r2^l2); + } + } + + // Find the two minimum costs + if (hamming1 < min_disp1) { + min_before1 = last_ham1; + min_disp1 = hamming1; + dix1 = d; + } else if (hamming1 < min_disp1b) { + min_disp1b = hamming1; + } + if (dix1 == d) min_after1 = hamming1; + last_ham1 = hamming1; + + if (hamming2 < min_disp2) { + min_before2 = last_ham2; + min_disp2 = hamming2; + dix2 = d; + } else if (hamming2 < min_disp2b) { + min_disp2b = hamming2; + } + if (dix2 == d) min_after2 = hamming2; + last_ham2 = hamming2; + + } + + //float d1 = (dix1 == 0 || dix1 == ds-1) ? (float)dix1 : fit_parabola(dix1, min_disp1, min_before1, min_after1); + //float d2 = (dix2 == 0 || dix2 == ds-1) ? (float)dix2 : fit_parabola(dix2, min_disp2, min_before2, min_after2); + + // TODO Allow for discontinuities with threshold + // Subpixel disparity (20) + float d1 = fit_parabola(dix1, min_disp1, min_before1, min_after1); + float d2 = fit_parabola(dix2, min_disp2, min_before2, min_after2); + + // Confidence filter based on (25) + disp_l[v*pitchL+u] = ((min_disp2b - min_disp2) >= gamma) ? d2 : NAN; + disp_r[v*pitchR+u] = ((min_disp1b - min_disp1) >= gamma) ? d1 : NAN; + } +} + +void rtcensus_sgm_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) { + // Make all the required texture steps + // TODO Could reduce re-allocations by caching these + ftl::cuda::TextureObject<uchar4> texLeft(l); + ftl::cuda::TextureObject<uchar4> texRight(r); + ftl::cuda::TextureObject<uint2> censusTexLeft(l.cols, l.rows); + ftl::cuda::TextureObject<uint2> censusTexRight(r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTexLeft(l.cols, l.rows); + ftl::cuda::TextureObject<float> dispTexRight(r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTex(r.cols, r.rows); + ftl::cuda::TextureObject<float> output(disp); + + // Calculate the census for left and right (14)(15)(16) + ftl::cuda::sparse_census(texLeft, texRight, censusTexLeft, censusTexRight); + + dim3 grid(1,1,1); + dim3 threads(BLOCK_W, 1, 1); + grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W); + grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD); + + // Calculate L and R disparities (18)(19)(20)(21)(22)(25) + disp_kernel<<<grid, threads>>>( + dispTexLeft.devicePtr(), dispTexRight.devicePtr(), + dispTexLeft.pitch()/sizeof(float), dispTexRight.pitch()/sizeof(float), + l.cols, l.rows, + censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), + num_disp); + cudaSafeCall( cudaGetLastError() ); + + // Check consistency between L and R disparities. (23)(24) + consistency(dispTexLeft, dispTexRight, dispTex); + + // TM in (7) of paper [3]. Eq (26) in [1] is wrong. + texture_filter(texLeft, dispTex, output, num_disp, 10.0); + + cudaSafeCall( cudaDeviceSynchronize() ); + + texLeft.free(); + texRight.free(); + censusTexLeft.free(); + censusTexRight.free(); + dispTexLeft.free(); + dispTexRight.free(); + dispTex.free(); + output.free(); +} + +}; +};