From f4852c7f360deeeca208e02d2555e47410cd7ac5 Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nicolas.pope@utu.fi> Date: Sat, 19 Sep 2020 17:19:32 +0300 Subject: [PATCH] Feature buckets experiment --- lib/libstereo/CMakeLists.txt | 6 +- lib/libstereo/include/stereo.hpp | 19 ++ lib/libstereo/middlebury/algorithms.hpp | 11 + lib/libstereo/src/algorithms/clustersf.cu | 143 +++++++++++ lib/libstereo/src/array1d.hpp | 137 +++++++++++ lib/libstereo/src/bucket1d.hpp | 130 ++++++++++ lib/libstereo/src/bucket2d.hpp | 136 +++++++++++ lib/libstereo/src/filters/focal_cluster.hpp | 90 +++++++ .../src/filters/salient_gradient.hpp | 225 ++++++++++++++++++ lib/libstereo/src/util.hpp | 40 ++++ 10 files changed, 935 insertions(+), 2 deletions(-) create mode 100644 lib/libstereo/src/algorithms/clustersf.cu create mode 100644 lib/libstereo/src/array1d.hpp create mode 100644 lib/libstereo/src/bucket1d.hpp create mode 100644 lib/libstereo/src/bucket2d.hpp create mode 100644 lib/libstereo/src/filters/focal_cluster.hpp create mode 100644 lib/libstereo/src/filters/salient_gradient.hpp diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt index f60cc79a1..a5f3255d2 100644 --- a/lib/libstereo/CMakeLists.txt +++ b/lib/libstereo/CMakeLists.txt @@ -49,7 +49,8 @@ if (LIBSTEREO_SHARED) src/algorithms/hwcensussgm.cu src/algorithms/brefcensussgm.cu src/algorithms/meancensussgm.cu - src/algorithms/hstablesgm.cu + src/algorithms/hstablesgm.cu + src/algorithms/clustersf.cu #src/stereo_hier_census.cu src/stereo_wcensussgm.cu src/stereo_census_adaptive.cu @@ -87,7 +88,8 @@ else() src/algorithms/hwcensussgm.cu src/algorithms/brefcensussgm.cu src/algorithms/meancensussgm.cu - src/algorithms/hstablesgm.cu + src/algorithms/hstablesgm.cu + src/algorithms/clustersf.cu #src/stereo_hier_census.cu src/stereo_wcensussgm.cu src/stereo_census_adaptive.cu diff --git a/lib/libstereo/include/stereo.hpp b/lib/libstereo/include/stereo.hpp index e5680c0f9..df9ccdd29 100644 --- a/lib/libstereo/include/stereo.hpp +++ b/lib/libstereo/include/stereo.hpp @@ -240,6 +240,25 @@ private: Impl *impl_; }; +/* Clustered Salient Features with focal point */ +class StereoCSF { +public: + StereoCSF(); + ~StereoCSF(); + + void compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity); + void setPrior(cv::InputArray disp) {}; + + struct Parameters { + bool debug = false; + }; + Parameters params; + +private: + struct Impl; + Impl *impl_; +}; + /** * STABLE Binary descriptor. This is a general implementation. * diff --git a/lib/libstereo/middlebury/algorithms.hpp b/lib/libstereo/middlebury/algorithms.hpp index 0b510df4a..094976d0c 100644 --- a/lib/libstereo/middlebury/algorithms.hpp +++ b/lib/libstereo/middlebury/algorithms.hpp @@ -32,6 +32,15 @@ namespace Impl { } }; + struct ClusterSF : public Algorithm { + ClusterSF() { } + + virtual void run(const MiddleburyData &data, cv::Mat &disparity) override { + StereoCSF stereo; + stereo.compute(data.imL, data.imR, disparity); + } + }; + struct ECensusSGM : public Algorithm { ECensusSGM() { P1 = 8.0f; P2 = 40.0f; } // Tuned to total error 2.0 @@ -361,6 +370,8 @@ namespace Impl { static const std::map<std::string, Algorithm*> algorithms = { { "censussgm", new Impl::CensusSGM() }, + { "cluster", new Impl::ClusterSF() }, + { "mcensussgm", new Impl::MeanCensusSGM() }, { "gctsgm", new Impl::GCTSgm() }, /* { "mcensussgm", new Impl::MeanCensusSGM() }, diff --git a/lib/libstereo/src/algorithms/clustersf.cu b/lib/libstereo/src/algorithms/clustersf.cu new file mode 100644 index 000000000..332a8e27b --- /dev/null +++ b/lib/libstereo/src/algorithms/clustersf.cu @@ -0,0 +1,143 @@ +#include "stereo.hpp" +#include "stereosgm.hpp" +#include "../filters/salient_gradient.hpp" +#include "../filters/focal_cluster.hpp" + +#include <opencv2/highgui.hpp> +#include <opencv2/imgproc.hpp> +#include <opencv2/cudaarithm.hpp> + +struct StereoCSF::Impl { + Array2D<uchar> l; + Array2D<uchar> r; + Array2D<uchar> gl; + Array2D<uchar> gr; + Array2D<uchar> temp; + Bucket1D<short2, 64> buckets_l; + Bucket2D<ushort, 64> buckets_r; + Array1D<int> focal; + + Impl(int width, int height) : + l(width, height), r(width, height), + gl(width, height), gr(width, height), temp(width, height), + buckets_l(height), buckets_r(16, height), focal(1024) {} +}; + +StereoCSF::StereoCSF() : impl_(nullptr) { + impl_ = new Impl(0, 0); +} + +void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disparity) { + + //cudaSetDevice(0); + + if (l.rows() != impl_->l.height || r.cols() != impl_->l.width) { + delete impl_; impl_ = nullptr; + impl_ = new Impl(l.cols(), l.rows()); + } + + mat2gray(l, impl_->l); + mat2gray(r, impl_->r); + + Array2D<float> disp_array(l.cols(), l.rows()); + disp_array.toGpuMat().setTo(cv::Scalar(0.0f)); + + Array2D<float> conf_array(l.cols(), l.rows()); + conf_array.toGpuMat().setTo(cv::Scalar(0.0f)); + + //int fx=1000; + //int fy=800; + for (int fx = 200; fx < l.cols()-200; fx += 50) { + for (int fy = 200; fy < l.rows()-200; fy += 50) { + short2 focal_pt_est = {short(l.cols()/2), short(l.rows()/2)}; + int focal_radius_est = 100000; + int focal_radius = 1000; + + for (int i=0; i<3; ++i) { + SalientGradientGrouped sgr = {focal_pt_est, focal_radius_est, impl_->r.data(), impl_->gr.data(), impl_->temp.data(), impl_->buckets_r.data(), impl_->r.width, impl_->r.height}; + parallel1DWarpSM(sgr, r.rows(), r.cols()); + + short2 focal_pt = {short(fx), short(fy)}; + SalientGradient sgl = {focal_pt, focal_radius, impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->buckets_l.data(), impl_->l.width, impl_->l.height}; + parallel1DWarpSM(sgl, l.rows(), l.cols()); + impl_->focal.toGpuMat().setTo(cv::Scalar(0)); + + FocalCluster fc = {focal_pt, impl_->buckets_l.data(), impl_->buckets_r.data(), impl_->focal.data(), 1024}; + parallel1DWarp(fc, l.rows(), 1); + + cv::Point max_disp; + cv::cuda::minMaxLoc(impl_->focal.toGpuMat(), nullptr, nullptr, nullptr, &max_disp); + + FocalSelector fs = {focal_pt, int(max_disp.x), impl_->buckets_l.data(), impl_->buckets_r.data(), impl_->focal.data(), disp_array.data(), conf_array.data(), 1024}; + parallel1DWarp(fs, l.rows(), 1); + + // Update right focal point estimate and radius + focal_pt_est.y = focal_pt.y; + focal_pt_est.x = focal_pt.x-max_disp.x; + focal_radius_est = focal_radius; + focal_radius /= 2; + //std::cout << "Focal disparity = " << max_disp.x << std::endl; + } + } + } + + disp_array.toGpuMat().download(disparity); + + cv::Mat gradtmp; + conf_array.toGpuMat().download(gradtmp); + gradtmp.convertTo(gradtmp, CV_8UC1, 255.0); + cv::applyColorMap(gradtmp, gradtmp, cv::COLORMAP_INFERNO); + cv::resize(gradtmp,gradtmp, cv::Size(gradtmp.cols/2, gradtmp.rows/2)); + cv::imshow("Confidence", gradtmp); + + cv::Mat tmp; + impl_->focal.toGpuMat().download(tmp); + + double minval; + double maxval; + int minloc[2]; + int maxloc[2]; + cv::minMaxIdx(tmp, &minval, &maxval, minloc, maxloc); + + std::cout << "Focal Disparity = " << maxloc[1] << std::endl; + + tmp.convertTo(tmp, CV_8UC1, 255.0/maxval); + cv::resize(tmp,tmp, cv::Size(tmp.cols, 100)); + + //#if OPENCV_VERSION >= 40102 + //cv::applyColorMap(tmp, tmp, cv::COLORMAP_TURBO); + //#else + cv::applyColorMap(tmp, tmp, cv::COLORMAP_INFERNO); + //#endif + cv::imshow("Disparity Hist", tmp); + + cv::Mat imgleft, imgright; + impl_->l.toGpuMat().download(imgleft); + impl_->r.toGpuMat().download(imgright); + + cv::cvtColor(imgleft,imgleft, cv::COLOR_GRAY2BGR); + cv::cvtColor(imgright,imgright, cv::COLOR_GRAY2BGR); + cv::drawMarker(imgleft, cv::Point(1000,800), cv::Scalar(0,0,255)); + cv::drawMarker(imgright, cv::Point(1000-maxloc[1],800), cv::Scalar(0,0,255)); + + cv::resize(imgleft,imgleft, cv::Size(imgleft.cols/2, imgleft.rows/2)); + cv::resize(imgright,imgright, cv::Size(imgright.cols/2, imgright.rows/2)); + cv::imshow("Left Focal", imgleft); + cv::imshow("Right Focal", imgright); + + //impl_->gr.toGpuMat().download(tmp); + //cv::resize(tmp,tmp, cv::Size(tmp.cols/2, tmp.rows/2)); + //cv::imshow("Gradients Right", tmp); + + cudaSafeCall(cudaDeviceSynchronize()); + disparity.create(l.rows(), l.cols(), CV_32F); + cv::waitKey(10000); +} + +StereoCSF::~StereoCSF() { + if (impl_) { + delete impl_; + impl_ = nullptr; + } +} + diff --git a/lib/libstereo/src/array1d.hpp b/lib/libstereo/src/array1d.hpp new file mode 100644 index 000000000..a2bdb80e3 --- /dev/null +++ b/lib/libstereo/src/array1d.hpp @@ -0,0 +1,137 @@ +#ifndef _FTL_LIBSTEREO_ARRAY1D_HPP_ +#define _FTL_LIBSTEREO_ARRAY1D_HPP_ + +#include "memory.hpp" + +template<typename T> +class Array1D { +public: + Array1D() : width(0), needs_free_(false) { + data_.data = nullptr; + } + + Array1D(int w) : width(w), needs_free_(true) { + data_.data = allocateMemory<T>(w); + } + + /*explicit Array1D(cv::Mat &m) : needs_free_(false) { + #ifdef USE_GPU + create(m.cols, m.rows); + cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyHostToDevice)); + #else + needs_free_ = false; + data_.data = (T*)m.data; + data_.pitch = m.step / sizeof(T); + width = m.cols; + height = m.rows; + #endif + } + + explicit Array2D(cv::cuda::GpuMat &m) : needs_free_(false) { + #ifdef USE_GPU + needs_free_ = false; + data_.data = (T*)m.data; + data_.pitch = m.step / sizeof(T); + width = m.cols; + height = m.rows; + #else + create(m.cols, m.rows); + cudaSafeCall(cudaMemcpy2D(data_.data, data_.pitch*sizeof(T), m.data, m.step, width*sizeof(T), height, cudaMemcpyDeviceToHost)); + #endif + }*/ + + ~Array1D() { + free(); + } + + void free() { + if (needs_free_ && data_.data) freeMemory(data_.data); + } + + Array1D<T> &operator=(const Array1D<T> &c) { + data_ = c.data_; + width = c.width; + needs_free_ = false; + return *this; + } + + struct Data { + __host__ __device__ inline T& operator() (const int x) { + return data[x]; + } + + __host__ __device__ inline const T& operator() (const int x) const { + return data[x]; + } + + T *data; + }; + + void create(int w) { + if (w == width) return; + width = w; + free(); + needs_free_ = true; + data_.data = allocateMemory<T>(w); + } + + inline Data &data() { return data_; } + inline const Data &data() const { return data_; } + + void toMat(cv::Mat &m) { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + gm.download(m); + #else + m = cv::Mat(1, width, cv::traits::Type<T>::value, data_.data); + #endif + } + + cv::Mat toMat() { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(1, width, cv::traits::Type<T>::value, data_.data); + #endif + } + + const cv::Mat toMat() const { + #ifdef USE_GPU + cv::cuda::GpuMat gm(1, width, cv::traits::Type<T>::value, (void*)data_.data); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(1, width, cv::traits::Type<T>::value, data_.data); + #endif + } + + void toGpuMat(cv::cuda::GpuMat &m) { + #ifdef USE_GPU + m = cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value, (void*)data_.data); + #else + // TODO + #endif + } + + cv::cuda::GpuMat toGpuMat() { + #ifdef USE_GPU + return cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value, (void*)data_.data); + #else + return cv::cuda::GpuMat(1, width, cv::traits::Type<T>::value); + #endif + } + + int width; + +private: + Data data_; + bool needs_free_; +}; + +#endif diff --git a/lib/libstereo/src/bucket1d.hpp b/lib/libstereo/src/bucket1d.hpp new file mode 100644 index 000000000..77f8ff962 --- /dev/null +++ b/lib/libstereo/src/bucket1d.hpp @@ -0,0 +1,130 @@ +#ifndef _FTL_LIBSTEREO_BUCKET1D_HPP_ +#define _FTL_LIBSTEREO_BUCKET1D_HPP_ + +#include "memory.hpp" + +template<typename T, int DEPTH> +class Bucket1D { +public: + Bucket1D() : width(0), needs_free_(false) { + data_.buckets = nullptr; + data_.counters = nullptr; + } + + Bucket1D(int w) : width(w), needs_free_(true) { + data_.counters = allocateMemory<uint>(w); + data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch); + } + + ~Bucket1D() { + free(); + } + + void free() { + if (needs_free_ && data_.buckets) { + freeMemory(data_.counters); + freeMemory(data_.buckets); + } + } + + Bucket1D<T, DEPTH> &operator=(const Bucket1D<T, DEPTH> &c) { + data_ = c.data_; + width = c.width; + needs_free_ = false; + return *this; + } + + struct Data { + __host__ __device__ inline uint& operator() (const int y) { + return counters[y]; + } + + __host__ __device__ inline const uint& operator() (const int y) const { + return counters[y]; + } + + __host__ __device__ inline const T& operator() (const int y, int d) const { + return buckets[d + y*bucket_pitch]; + } + + __host__ __device__ inline void add (const int y, T v) { + uint ix = atomicInc(&counters[y], DEPTH); + buckets[ix + y*bucket_pitch] = v; + } + + __host__ __device__ inline const T *ptr(const int y) const { return &buckets[y*bucket_pitch]; } + + uint bucket_pitch; + T *buckets; + uint *counters; + }; + + void create(int w) { + if (w == width) return; + width = w; + free(); + needs_free_ = true; + data_.counters = allocateMemory<uint>(w); + data_.buckets = allocateMemory2D<T>(DEPTH, w, data_.bucket_pitch); + } + + inline Data &data() { return data_; } + inline const Data &data() const { return data_; } + + void toMat(cv::Mat &m) { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + gm.download(m); + #else + m = cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + cv::Mat toMat() { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(1, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + const cv::Mat toMat() const { + #ifdef USE_GPU + cv::cuda::GpuMat gm(1, width, cv::traits::Type<int>::value, (void*)data_.counters); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + void toGpuMat(cv::cuda::GpuMat &m) { + #ifdef USE_GPU + m = cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters); + #else + // TODO + #endif + } + + cv::cuda::GpuMat toGpuMat() { + #ifdef USE_GPU + return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value, (void*)data_.counters); + #else + return cv::cuda::GpuMat(1, width, cv::traits::Type<int>::value); + #endif + } + + int width; + +private: + Data data_; + bool needs_free_; +}; + +#endif diff --git a/lib/libstereo/src/bucket2d.hpp b/lib/libstereo/src/bucket2d.hpp new file mode 100644 index 000000000..6eb71d42e --- /dev/null +++ b/lib/libstereo/src/bucket2d.hpp @@ -0,0 +1,136 @@ +#ifndef _FTL_LIBSTEREO_BUCKET2D_HPP_ +#define _FTL_LIBSTEREO_BUCKET2D_HPP_ + +#include "memory.hpp" + +template<typename T, int DEPTH> +class Bucket2D { +public: + Bucket2D() : width(0), height(0), needs_free_(false) { + data_.buckets = nullptr; + data_.counters = nullptr; + } + + Bucket2D(int w, int h) : width(w), height(h), needs_free_(true) { + data_.counters = allocateMemory2D<uint>(w, h, data_.counter_pitch); + data_.buckets = allocateMemory2D<T>(w*DEPTH, h, data_.bucket_pitch); + } + + ~Bucket2D() { + free(); + } + + void free() { + if (needs_free_ && data_.buckets) { + freeMemory(data_.counters); + freeMemory(data_.buckets); + } + } + + Bucket2D<T, DEPTH> &operator=(const Bucket2D<T, DEPTH> &c) { + data_ = c.data_; + width = c.width; + height = c.height; + needs_free_ = false; + return *this; + } + + struct Data { + __host__ __device__ inline uint& operator() (const int y, const int x) { + return counters[x + y*counter_pitch]; + } + + __host__ __device__ inline const uint& operator() (const int y, const int x) const { + return counters[x + y*counter_pitch]; + } + + __host__ __device__ inline const T& operator() (const int y, const int x, int d) const { + return buckets[d + x*DEPTH + y*bucket_pitch]; + } + + __host__ __device__ inline void add (const int y, const int x, T v) { + uint ix = atomicInc(&counters[x + y*counter_pitch], DEPTH); + buckets[ix + x*DEPTH + y*bucket_pitch] = v; + } + + __host__ __device__ inline uint *ptr(const int y) { return &counters[y*counter_pitch]; } + __host__ __device__ inline const uint *ptr(const int y) const { return &counters[y*counter_pitch]; } + __host__ __device__ inline const T *ptr(const int y, const int x) const { return &buckets[x*DEPTH+y*bucket_pitch]; } + + uint bucket_pitch; + uint counter_pitch; + T *buckets; + uint *counters; + }; + + void create(int w, int h) { + if (w == width && h == height) return; + width = w; + height = h; + free(); + needs_free_ = true; + data_.counters = allocateMemory2D<uint>(w, h, data_.counter_pitch); + data_.buckets = allocateMemory2D<T>(w*DEPTH, h, data_.bucket_pitch); + } + + inline Data &data() { return data_; } + inline const Data &data() const { return data_; } + + void toMat(cv::Mat &m) { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + gm.download(m); + #else + m = cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + cv::Mat toMat() { + #ifdef USE_GPU + cv::cuda::GpuMat gm; + toGpuMat(gm); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + const cv::Mat toMat() const { + #ifdef USE_GPU + cv::cuda::GpuMat gm(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint))); + cv::Mat m; + gm.download(m); + return m; + #else + return cv::Mat(height, width, cv::traits::Type<int>::value, data_.counters); + #endif + } + + void toGpuMat(cv::cuda::GpuMat &m) { + #ifdef USE_GPU + m = cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint))); + #else + // TODO + #endif + } + + cv::cuda::GpuMat toGpuMat() { + #ifdef USE_GPU + return cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value, (void*)data_.counters, size_t(size_t(data_.counter_pitch)*sizeof(uint))); + #else + return cv::cuda::GpuMat(height, width, cv::traits::Type<int>::value); + #endif + } + + int width; + int height; + +private: + Data data_; + bool needs_free_; +}; + +#endif diff --git a/lib/libstereo/src/filters/focal_cluster.hpp b/lib/libstereo/src/filters/focal_cluster.hpp new file mode 100644 index 000000000..32cd5be1a --- /dev/null +++ b/lib/libstereo/src/filters/focal_cluster.hpp @@ -0,0 +1,90 @@ +#ifndef _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_ +#define _FTL_LIBSTEREO_FILTERS_CLUSTER_HPP_ + +#include "../util.hpp" +#include "../array1d.hpp" +#include "../array2d.hpp" +#include "../bucket1d.hpp" +#include "../bucket2d.hpp" + +struct FocalCluster { + short2 focal_pt; + Bucket1D<short2, 64>::Data left; + Bucket2D<ushort, 64>::Data right; + Array1D<int>::Data histogram; + + int max_disparity = 1024; + + __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) { + for (int y=thread.y; y<size.y; y+=stride.y) { + int count = left(y); + // Stride a warp of threads over the features + for (int f=thread.x; f<count; f+=stride.x) { + // For each feature or features near to focal point + short2 feature = left(y,f); + int distx = feature.x - focal_pt.x; + + // - For each feature in right image that matches + const ushort *ptr = right.ptr(y, feature.y); + int count2 = right(y,feature.y); + for (int i=0; i<count2; ++i) { + // - Add focal dist to feature X and add to histogram + int disparity = max(0,focal_pt.x - int(ptr[i]) + distx); + if (disparity < max_disparity && disparity > 0) atomicAdd(&histogram(disparity), 1); + } + } + } + } +}; + +struct FocalSelector { + short2 focal_pt; + int focal_disp; + Bucket1D<short2, 64>::Data left; + Bucket2D<ushort, 64>::Data right; + Array1D<int>::Data histogram; + Array2D<float>::Data disparity; + Array2D<float>::Data confidence; + + int max_disparity = 1024; + + __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) { + for (int y=thread.y; y<size.y; y+=stride.y) { + int count = left(y); + // Stride a warp of threads over the features + for (int f=thread.x; f<count; f+=stride.x) { + // For each feature or features near to focal point + short2 feature = left(y,f); + int distx = feature.x - focal_pt.x; + + int best_disp = 0; + int max_v = 0; + + // - For each feature in right image that matches + const ushort *ptr = right.ptr(y, feature.y); + int count2 = right(y,feature.y); + for (int i=0; i<count2; ++i) { + // - Add focal dist to feature X and add to histogram + int d = max(0,focal_pt.x - int(ptr[i]) + distx); + + int v = histogram(d); + if (v > max_v) { + max_v = v; + best_disp = d; + } + } + + if (max_v > 0) { + //float conf = 1.0f - min(1.0f, float(abs(best_disp-focal_disp)) / 10.0f); + float conf = 1.0f - min(1.0f, float(abs(distx)) / 500.0f); + if (conf > confidence(y,feature.x)) { + disparity(y,feature.x) = float(best_disp); + confidence(y,feature.x) = conf; + } + } + } + } + } +}; + +#endif diff --git a/lib/libstereo/src/filters/salient_gradient.hpp b/lib/libstereo/src/filters/salient_gradient.hpp new file mode 100644 index 000000000..43454264f --- /dev/null +++ b/lib/libstereo/src/filters/salient_gradient.hpp @@ -0,0 +1,225 @@ +#ifndef _FTL_LIBSTEREO_FILTERS_SALIENT_GRADIENT_HPP_ +#define _FTL_LIBSTEREO_FILTERS_SALIENT_GRADIENT_HPP_ + +#include "../util.hpp" +#include "../array2d.hpp" +#include "../bucket2d.hpp" +#include "../bucket1d.hpp" + +/** + * Select salient gradient features and gather into scanline buckets that + * includes the x-coordinate and feature orientation. Not grouped by orientation. + * + * TODO: This needs to be a focal point modified version. Radius from focal + * point determines threshold, but the radius itself is determined by feature + * density around the focal point ... ultimately keeping the number of features + * within a reasonable very small level per scanline (say 2-8). + */ +struct SalientGradient { + short2 focal_pt; + int radius; + Array2D<uchar>::Data image; + Array2D<uchar>::Data angle; + Array2D<uchar>::Data magnitude; + Bucket1D<short2, 64>::Data buckets; + + int width, height; + + __cuda__ inline float2 calculateGradient(int x, int y) { + if (x < 1 || y < 1 || x >= width-1 || y >= height-1) return make_float2(0,0); + + float dx = -1.0f*float(image(y-1,x-1)) + -2.0f*float(image(y, x-1)) + -1*float(image(y+1, x-1)) + + float(image(y-1, x+1)) + 2.0f*float(image(y, x+1)) + float(image(y+1, x+1)); + float dy = float(image(y-1, x-1)) + 2.0f*float(image(y-1, x)) + float(image(y-1, x+1)) + + -1.0f*float(image(y+1, x-1)) + -2.0f*float(image(y+1, x)) + -1.0f*float(image(y+1, x+1)); + + float g = sqrt( (dx*dx) + (dy*dy) ); + float a = atan2(dy, dx); + return make_float2(g,a); + } + + struct WarpSharedMemory { + int gradient_histogram[32]; + }; + + inline __device__ int scan(volatile int *s_Data, int thread, int threshold) { + for (uint offset = 1; offset < 32; offset <<= 1) { + __syncwarp(); + uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; + __syncwarp(); + s_Data[thread] = t; + } + + uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold); + return __ffs(t); + } + + __device__ inline float weighting(float r, float h) { + if (r >= h) return 0.0f; + float rh = r / h; + rh = 1.0f - rh*rh; + return rh*rh*rh*rh; + } + + __device__ inline float calcWeight(int x, int y) { + float dx = focal_pt.x - x; + float dy = focal_pt.y - y; + float dist = sqrt(dx*dx + dy*dy); + float weight = weighting(dist, float(radius)); + return weight; + } + + __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) { + static constexpr float PI = 3.14f; + static constexpr float PI2 = PI/2.0f; + + for (int y=thread.y; y<size.y; y+=stride.y) { + // Reset histogram + //for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0; + wsm.gradient_histogram[thread.x] = 0; + //int maxmag = 0; + + for (int x=thread.x; x<size.x; x+=stride.x) { + auto g = calculateGradient(x,y); + //float weight = calcWeight(x,y); + angle(y,x) = uchar(min(15,int((g.y+PI2) / PI * 16.0f))); + magnitude(y,x) = uchar(g.x); + + //maxmag = max(maxmag,int(g.x)); + atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1); + //atomicAdd(&wsm.gradient_histogram[0], 1); + } + + //maxmag = int(float(warpMax(maxmag)) * 0.8f); + + uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32)); + + // Apply threshold + for (int x=thread.x; x<size.x; x+=stride.x) { + float weight = calcWeight(x,y); + float thresh = gthresh; //max(gthresh, focal_thresh); + //for (int u=-3; u<=3; ++u) { + // thresh = (x+u >= 0 && x+u < width) ? max(thresh, weight*float(magnitude(y,x+u))) : thresh; + //} + + float m = float(magnitude(y,x)) * weight; + if (m < thresh) angle(y,x) = 0; + //output(y,x) = (m < thresh) ? 0 : 255; + if (m >= thresh) { + int a = angle(y,x); + buckets.add(y, make_short2(x, a)); + } + + angle(y,x) = uchar(thresh*weight); + } + } + } +}; + +/** + * Find salient gradient features and gather in orientation groups scanline + * buckets. Adds features to orientations either side of actual orientation for + * a degree of tolerance to exact orientation. This allows fast search for + * features based upon scanline and orientation. + */ +struct SalientGradientGrouped { + short2 focal_pt; + int radius; + Array2D<uchar>::Data image; + Array2D<uchar>::Data angle; + Array2D<uchar>::Data magnitude; + Bucket2D<ushort, 64>::Data buckets; + + int width, height; + + __cuda__ inline float2 calculateGradient(int x, int y) { + if (x < 1 || y < 1 || x >= width-1 || y >= height-1) return make_float2(0,0); + + float dx = -1.0f*float(image(y-1,x-1)) + -2.0f*float(image(y, x-1)) + -1*float(image(y+1, x-1)) + + float(image(y-1, x+1)) + 2.0f*float(image(y, x+1)) + float(image(y+1, x+1)); + float dy = float(image(y-1, x-1)) + 2.0f*float(image(y-1, x)) + float(image(y-1, x+1)) + + -1.0f*float(image(y+1, x-1)) + -2.0f*float(image(y+1, x)) + -1.0f*float(image(y+1, x+1)); + + float g = sqrt( (dx*dx) + (dy*dy) ); + float a = atan2(dy, dx); + return make_float2(g,a); + } + + struct WarpSharedMemory { + int gradient_histogram[32]; + }; + + inline __device__ int scan(volatile int *s_Data, int thread, int threshold) { + for (uint offset = 1; offset < 32; offset <<= 1) { + __syncwarp(); + uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; + __syncwarp(); + s_Data[thread] = t; + } + + uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold); + return __ffs(t); + } + + __device__ inline float weighting(float r, float h) { + if (r >= h) return 0.0f; + float rh = r / h; + rh = 1.0f - rh*rh; + return rh*rh*rh*rh; + } + + __device__ inline float calcWeight(int x, int y) { + float dx = focal_pt.x - x; + float dy = focal_pt.y - y; + float dist = sqrt(dx*dx + dy*dy); + float weight = weighting(dist, float(radius)); + return weight; + } + + __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) { + static constexpr float PI = 3.14f; + static constexpr float PI2 = PI/2.0f; + + for (int y=thread.y; y<size.y; y+=stride.y) { + // Reset histogram + //for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0; + wsm.gradient_histogram[thread.x] = 0; + //int maxmag = 0; + + for (int x=thread.x; x<size.x; x+=stride.x) { + auto g = calculateGradient(x,y); + angle(y,x) = uchar(min(15,int((g.y+PI2) / PI * 16.0f))); + magnitude(y,x) = uchar(g.x); + + //maxmag = max(maxmag,int(g.x)); + atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1); + //atomicAdd(&wsm.gradient_histogram[0], 1); + } + + //maxmag = int(float(warpMax(maxmag)) * 0.8f); + + uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32)); + + // Apply threshold + for (int x=thread.x; x<size.x; x+=stride.x) { + float weight = calcWeight(x,y); + float thresh = gthresh; + //for (int u=-3; u<=3; ++u) { + // thresh = (x+u >= 0 && x+u < width) ? max(thresh, weight*float(magnitude(y,x+u))) : thresh; + //} + + float m = float(magnitude(y,x))*weight; + //if (m < thresh) angle(y,x) = 0; + //output(y,x) = (m < thresh) ? 0 : 255; + if (m >= thresh) { + int a = angle(y,x); + buckets.add(y, a, ushort(x)); + buckets.add(y, (a > 0) ? a-1 : 15, ushort(x)); + buckets.add(y, (a < 15) ? a+1 : 0, ushort(x)); + } + } + } + } +}; + +#endif diff --git a/lib/libstereo/src/util.hpp b/lib/libstereo/src/util.hpp index 4c50f082b..92d9c3ee9 100644 --- a/lib/libstereo/src/util.hpp +++ b/lib/libstereo/src/util.hpp @@ -21,6 +21,15 @@ __device__ inline T warpMin(T e) { return e; } +template <typename T> +__device__ inline T warpMax(T e) { + for (int i = WARP_SIZE/2; i > 0; i /= 2) { + const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); + e = max(e, other); + } + return e; +} + #ifdef USE_GPU template <typename FUNCTOR> @@ -30,6 +39,16 @@ __global__ void kernel2D(FUNCTOR f, ushort2 size) { f(thread, stride, size); } +template <typename FUNCTOR, int WARPS> +__global__ void kernel2DWarpSM(FUNCTOR f, ushort2 size) { + const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)}; + const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)}; + + __shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS]; + + f(thread, stride, size, sm[threadIdx.y]); +} + template <typename FUNCTOR> __global__ void kernel1D(FUNCTOR f, ushort size) { const ushort thread = threadIdx.x+blockIdx.x*blockDim.x; @@ -99,6 +118,27 @@ void parallel1DWarp(const FUNCTOR &f, int size, int size2) { #endif } +template <typename FUNCTOR> +void parallel1DWarpSM(const FUNCTOR &f, int size, int size2) { +#if __cplusplus >= 201703L + //static_assert(std::is_invocable<FUNCTOR,int,int,int>::value, "Parallel1D requires a valid functor: void()(int,int,int)"); + // Is this static_assert correct? + static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2>::value, "Parallel1D requires a valid functor: void()(ushort2,ushort2,ushort2)"); +#endif + +#ifdef USE_GPU + cudaSafeCall(cudaGetLastError()); + const dim3 gridSize(1, (size+8-1) / 8); + const dim3 blockSize(32, 8); + ushort2 tsize{ushort(size2), ushort(size)}; + kernel2DWarpSM<FUNCTOR, 8><<<gridSize, blockSize, 0, 0>>>(f,tsize); + cudaSafeCall(cudaGetLastError()); + //cudaSafeCall(cudaDeviceSynchronize()); +#else + +#endif +} + /** * Wrapper to initiate a parallel processing job using a given functor. The * width and height parameters are used to determine the number of threads. -- GitLab