diff --git a/cv-node/CMakeLists.txt b/cv-node/CMakeLists.txt index dab94af564545322531eb4935bb736220e454a19..0f32a0c45abf0206ca75bf041e068fac1bcc9332 100644 --- a/cv-node/CMakeLists.txt +++ b/cv-node/CMakeLists.txt @@ -88,7 +88,12 @@ if (LIBSGM_FOUND) endif (LIBSGM_FOUND) if (CUDA_FOUND) - list(APPEND CVNODESRC "src/algorithms/opencv_cuda_bm.cpp" "src/algorithms/opencv_cuda_bp.cpp" "src/algorithms/rtcensus.cu") + list(APPEND CVNODESRC + "src/algorithms/opencv_cuda_bm.cpp" + "src/algorithms/opencv_cuda_bp.cpp" + "src/algorithms/rtcensus.cu" + "src/algorithms/consistency.cu" + "src/algorithms/sparse_census.cu") endif (CUDA_FOUND) add_executable(cv-node ${CVNODESRC}) diff --git a/cv-node/include/ftl/cuda_algorithms.hpp b/cv-node/include/ftl/cuda_algorithms.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f4c4298ba5efcfdf3377911c092eb77676a25c17 --- /dev/null +++ b/cv-node/include/ftl/cuda_algorithms.hpp @@ -0,0 +1,19 @@ +#ifndef _FTL_CUDA_ALGORITHMS_HPP_ +#define _FTL_CUDA_ALGORITHMS_HPP_ + +#include <ftl/cuda_common.hpp> + +namespace ftl { +namespace cuda { + + void consistency(const TextureObject<float> &dl, const TextureObject<float> &dr, + TextureObject<float> &disp); + + void sparse_census(const TextureObject<uchar4> &l, const TextureObject<uchar4> &r, + TextureObject<uint2> &cl, TextureObject<uint2> &cr); + +} +} + +#endif // _FTL_CUDA_ALGORITHMS_HPP_ + diff --git a/cv-node/include/ftl/cuda_common.hpp b/cv-node/include/ftl/cuda_common.hpp index 99bd131a73eff066f035f05ca219e8ef1aeef032..834e2af362b2fa74ab31c2de23a33c14ac23d45d 100644 --- a/cv-node/include/ftl/cuda_common.hpp +++ b/cv-node/include/ftl/cuda_common.hpp @@ -19,12 +19,14 @@ class TextureObject { TextureObject() : texobj_(0), ptr_(nullptr) {}; TextureObject(const cv::cuda::PtrStepSz<T> &d); TextureObject(T *ptr, int pitch, int width, int height); - TextureObject(int width, int height); + TextureObject(size_t width, size_t height); TextureObject(const TextureObject &t); ~TextureObject(); - int getPitch(); + int pitch() const { return pitch_; } T *devicePtr() { return ptr_; }; + int width() const { return width_; } + int height() const { return height_; } cudaTextureObject_t cudaTexture() const { return texobj_; } __device__ inline T tex2D(int u, int v) { return ::tex2D<T>(texobj_, u, v); } __device__ inline T tex2D(float u, float v) { return ::tex2D<T>(texobj_, u, v); } @@ -41,7 +43,7 @@ class TextureObject { private: cudaTextureObject_t texobj_; - int pitch_; + size_t pitch_; int width_; int height_; T *ptr_; @@ -109,8 +111,8 @@ TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) { } template <typename T> -TextureObject<T>::TextureObject(int width, int height) { - cudaMallocPitch(&ptr_,&pitch_,width*sizeof(T),height); +TextureObject<T>::TextureObject(size_t width, size_t height) { + cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height); cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); diff --git a/cv-node/src/algorithms/consistency.cu b/cv-node/src/algorithms/consistency.cu new file mode 100644 index 0000000000000000000000000000000000000000..3924ace27891b58f1a896aab1ab150e0b356c84d --- /dev/null +++ b/cv-node/src/algorithms/consistency.cu @@ -0,0 +1,49 @@ +#include <ftl/cuda_common.hpp> + +#define CONSISTENCY_THRESHOLD 1.0f + +/* + * Check for consistency between the LR and RL disparity maps, only selecting + * those that are similar. Otherwise it sets the disparity to NAN. + */ +__global__ void consistency_kernel(cudaTextureObject_t d_sub_l, + cudaTextureObject_t d_sub_r, float *disp, int w, int h, int pitch) { + + // TODO This doesn't work at either edge (+-max_disparities) + for (STRIDE_Y(v,h)) { + for (STRIDE_X(u,w)) { + float a = (int)tex2D<float>(d_sub_l, u, v); + if (u-a < 0) { + disp[v*pitch+u] = NAN; // TODO Check + continue; + } + + auto b = tex2D<float>(d_sub_r, u-a, v); + + if (abs(a-b) <= CONSISTENCY_THRESHOLD) disp[v*pitch+u] = abs((a+b)/2); + else disp[v*pitch+u] = NAN; + } + } + +} + +namespace ftl { +namespace cuda { + void consistency(const TextureObject<float> &dl, const TextureObject<float> &dr, + TextureObject<float> &disp) { + dim3 grid(1,1,1); + dim3 threads(128, 1, 1); + grid.x = cv::cuda::device::divUp(disp.width(), 128); + grid.y = cv::cuda::device::divUp(disp.height(), 11); + consistency_kernel<<<grid, threads>>>( + dl.cudaTexture(), + dr.cudaTexture(), + disp.devicePtr(), + disp.width(), + disp.height(), + disp.pitch() / sizeof(float)); + cudaSafeCall( cudaGetLastError() ); + } +} +} + diff --git a/cv-node/src/algorithms/rtcensus.cu b/cv-node/src/algorithms/rtcensus.cu index ece058b09404ace2f5ceca5df1d3116ce08b55a9..e722dbac7c592602ba33d99ccc32e46f1e64d490 100644 --- a/cv-node/src/algorithms/rtcensus.cu +++ b/cv-node/src/algorithms/rtcensus.cu @@ -12,6 +12,7 @@ */ #include <ftl/cuda_common.hpp> +#include <ftl/cuda_algorithms.hpp> using namespace cv::cuda; using namespace cv; @@ -22,35 +23,12 @@ using namespace cv; #define RADIUS2 2 #define ROWSperTHREAD 1 -#define XHI(P1,P2) ((P1 <= P2) ? 0 : 1) - namespace ftl { namespace gpu { // --- SUPPORT ----------------------------------------------------------------- - -/* - * Sparse 16x16 census (so 8x8) creating a 64bit mask - * (14) & (15), based upon (9) - */ -__device__ uint64_t sparse_census(cudaTextureObject_t tex, int u, int v) { - uint64_t r = 0; - - unsigned char t = tex2D<uchar4>(tex, u,v).z; - - for (int m=-7; m<=7; m+=2) { - //auto start_ix = (v + m)*w + u; - for (int n=-7; n<=7; n+=2) { - r <<= 1; - r |= XHI(t, tex2D<uchar4>(tex, u+n, v+m).z); - } - } - - return r; -} - /* * Parabolic interpolation between matched disparities either side. * Results in subpixel disparity. (20). @@ -63,30 +41,6 @@ __device__ float fit_parabola(size_t pi, uint16_t p, uint16_t pl, uint16_t pr) { // --- KERNELS ----------------------------------------------------------------- -/* - * Calculate census mask for left and right images together. - */ -__global__ void census_kernel(cudaTextureObject_t l, cudaTextureObject_t r, - int w, int h, uint64_t *censusL, uint64_t *censusR, - size_t pL, size_t pR) { - - int u = (blockIdx.x * BLOCK_W + threadIdx.x + RADIUS); - int v_start = blockIdx.y * ROWSperTHREAD + RADIUS; - int v_end = v_start + ROWSperTHREAD; - - if (v_end+RADIUS >= h) v_end = h-RADIUS; - if (u+RADIUS >= w) return; - - for (int v=v_start; v<v_end; v++) { - //int ix = (u + v*pL); - uint64_t cenL = sparse_census(l, u, v); - uint64_t cenR = sparse_census(r, u, v); - - censusL[(u + v*pL)] = cenL; - censusR[(u + v*pR)] = cenR; - } -} - /* Convert vector uint2 (32bit x2) into a single uint64_t */ __forceinline__ __device__ uint64_t uint2asull (uint2 a) { uint64_t res; @@ -218,31 +172,6 @@ __global__ void disp_kernel(float *disp_l, float *disp_r, } } -/* - * Check for consistency between the LR and RL disparity maps, only selecting - * those that are similar. Otherwise it sets the disparity to NAN. - */ -__global__ void consistency_kernel(cudaTextureObject_t d_sub_l, - cudaTextureObject_t d_sub_r, float *disp, int w, int h, int pitch) { - - // TODO This doesn't work at either edge (+-max_disparities) - for (STRIDE_Y(v,h)) { - for (STRIDE_X(u,w)) { - float a = (int)tex2D<float>(d_sub_l, u, v); - if (u-a < 0) { - //disp[v*pitch+u] = a; - continue; - } - - auto b = tex2D<float>(d_sub_r, u-a, v); - - if (abs(a-b) <= 1.0) disp[v*pitch+u] = abs((a+b)/2); - else disp[v*pitch+u] = NAN; - } - } - -} - template <typename T> @@ -394,68 +323,37 @@ ftl::cuda::TextureObject<float> prevDisp; ftl::cuda::TextureObject<uchar4> prevImage; void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) { - dim3 grid(1,1,1); - dim3 threads(BLOCK_W, 1, 1); - - grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS, BLOCK_W); - grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS, ROWSperTHREAD); - - // TODO, reduce allocations - uint64_t *censusL; - uint64_t *censusR; - size_t pitchL; - size_t pitchR; - - float *disp_l; - float *disp_r; - size_t pitchDL; - size_t pitchDR; - - float *disp_raw; - size_t pitchD; - - cudaSafeCall( cudaMallocPitch(&censusL, &pitchL, l.cols*sizeof(uint64_t), l.rows) ); - cudaSafeCall( cudaMallocPitch(&censusR, &pitchR, r.cols*sizeof(uint64_t), r.rows) ); - - //cudaMemset(census, 0, sizeof(uint64_t)*l.cols*l.rows*2); - cudaSafeCall( cudaMallocPitch(&disp_l, &pitchDL, sizeof(float)*l.cols, l.rows) ); - cudaSafeCall( cudaMallocPitch(&disp_r, &pitchDR, sizeof(float)*l.cols, l.rows) ); - - cudaSafeCall( cudaMallocPitch(&disp_raw, &pitchD, sizeof(float)*l.cols, l.rows) ); - - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - + // Make all the required texture steps + // TODO Could reduce re-allocations by caching these ftl::cuda::TextureObject<uchar4> texLeft(l); ftl::cuda::TextureObject<uchar4> texRight(r); - - //size_t smem_size = (2 * l.cols * l.rows) * sizeof(uint64_t); - - // Calculate L and R census - census_kernel<<<grid, threads>>>(texLeft.cudaTexture(), texRight.cudaTexture(), l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t)); - cudaSafeCall( cudaGetLastError() ); - - //cudaSafeCall( cudaDeviceSynchronize() ); - - ftl::cuda::TextureObject<uint2> censusTexLeft((uint2*)censusL, pitchL, l.cols, l.rows); - ftl::cuda::TextureObject<uint2> censusTexRight((uint2*)censusR, pitchR, r.cols, r.rows); + ftl::cuda::TextureObject<uint2> censusTexLeft(l.cols, l.rows); + ftl::cuda::TextureObject<uint2> censusTexRight(r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTexLeft(l.cols, l.rows); + ftl::cuda::TextureObject<float> dispTexRight(r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTex(r.cols, r.rows); + // Calculate the census for left and right + ftl::cuda::sparse_census(texLeft, texRight, censusTexLeft, censusTexRight); + + dim3 grid(1,1,1); + dim3 threads(BLOCK_W, 1, 1); grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W); grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD); // Calculate L and R disparities - disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), num_disp); + disp_kernel<<<grid, threads>>>( + dispTexLeft.devicePtr(), dispTexRight.devicePtr(), + dispTexLeft.pitch()/sizeof(float), dispTexRight.pitch()/sizeof(float), + l.cols, l.rows, + censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), + num_disp); cudaSafeCall( cudaGetLastError() ); - - ftl::cuda::TextureObject<float> dispTexLeft(disp_l, pitchDL, l.cols, l.rows); - ftl::cuda::TextureObject<float> dispTexRight(disp_r, pitchDR, r.cols, r.rows); // Check consistency between L and R disparities. - consistency_kernel<<<grid, threads>>>(dispTexLeft.cudaTexture(), dispTexRight.cudaTexture(), disp_raw, l.cols, l.rows, pitchD/sizeof(float)); - cudaSafeCall( cudaGetLastError() ); + consistency(dispTexLeft, dispTexRight, dispTex); - ftl::cuda::TextureObject<float> dispTex(disp_raw, pitchD, r.cols, r.rows); + grid.x = 4; grid.y = l.rows; @@ -480,11 +378,6 @@ void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const dispTexLeft.free(); dispTexRight.free(); dispTex.free(); - - cudaFree(disp_r); - cudaFree(disp_l); - cudaFree(censusL); - cudaFree(censusR); } }; diff --git a/cv-node/src/algorithms/sparse_census.cu b/cv-node/src/algorithms/sparse_census.cu new file mode 100644 index 0000000000000000000000000000000000000000..c912354085a3a73078c3ad54666342305a59be72 --- /dev/null +++ b/cv-node/src/algorithms/sparse_census.cu @@ -0,0 +1,81 @@ +/* + * Author: Nicolas Pope and Sebastian Hahta (2019) + * Implementation of algorithm presented in article(s): + * + * [1] Humenberger, Engelke, Kubinger: A fast stereo matching algorithm suitable + * for embedded real-time systems + * [2] Humenberger, Zinner, Kubinger: Performance Evaluation of Census-Based + * Stereo Matching Algorithm on Embedded and Multi-Core Hardware + * + * Equation numbering uses [1] unless otherwise stated + * + */ + +#include <ftl/cuda_common.hpp> + +#define XHI(P1,P2) ((P1 <= P2) ? 0 : 1) + +/* + * Sparse 16x16 census (so 8x8) creating a 64bit mask + * (14) & (15), based upon (9) + */ +__device__ uint64_t _sparse_census(cudaTextureObject_t tex, int u, int v) { + uint64_t r = 0; + + unsigned char t = tex2D<uchar4>(tex, u,v).z; + + for (int m=-7; m<=7; m+=2) { + //auto start_ix = (v + m)*w + u; + for (int n=-7; n<=7; n+=2) { + r <<= 1; + r |= XHI(t, tex2D<uchar4>(tex, u+n, v+m).z); + } + } + + return r; +} + +/* + * Calculate census mask for left and right images together. + */ +__global__ void census_kernel(cudaTextureObject_t l, cudaTextureObject_t r, + int w, int h, uint64_t *censusL, uint64_t *censusR, + size_t pL, size_t pR) { + + //if (v_end+RADIUS >= h) v_end = h-RADIUS; + //if (u+RADIUS >= w) return; + + for (STRIDE_Y(v,h)) { + for (STRIDE_X(u,w)) { + //int ix = (u + v*pL); + uint64_t cenL = _sparse_census(l, u, v); + uint64_t cenR = _sparse_census(r, u, v); + + censusL[(u + v*pL)] = cenL; + censusR[(u + v*pR)] = cenR; + } + } +} + +namespace ftl { +namespace cuda { + void sparse_census(const TextureObject<uchar4> &l, const TextureObject<uchar4> &r, + TextureObject<uint2> &cl, TextureObject<uint2> &cr) { + dim3 grid(1,1,1); + dim3 threads(128, 1, 1); + grid.x = cv::cuda::device::divUp(l.width(), 128); + grid.y = cv::cuda::device::divUp(l.height(), 11); + + + census_kernel<<<grid, threads>>>( + l.cudaTexture(), r.cudaTexture(), + l.width(), l.height(), + (uint64_t*)cl.devicePtr(), (uint64_t*)cr.devicePtr(), + cl.pitch()/sizeof(uint64_t), + cr.pitch()/sizeof(uint64_t)); + cudaSafeCall( cudaGetLastError() ); + } +} +} + + diff --git a/cv-node/src/algorithms/tex_filter.cu b/cv-node/src/algorithms/tex_filter.cu new file mode 100644 index 0000000000000000000000000000000000000000..b5934c6f3292b9ea897a1913cb92f5843f6f36d8 --- /dev/null +++ b/cv-node/src/algorithms/tex_filter.cu @@ -0,0 +1,54 @@ +#include <ftl/cuda_common.hpp> + +#define FILTER_WINDOW 11 +#define FILTER_WINDOW_R 5 + +__global__ void texture_filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d, + ftl::cuda::TextureObject<float> f, int num_disp, int thresh) { // Thresh = -5000000 + + float disp = tex2D<float>(d,u,v); + int neigh_sq = 0; + int neigh_sum = 0; + + for (STRIDE_Y(v,h)) { + for (STRIDE_X(u,w)) { + for (int m=-FILTER_WINDOW_R; m<=FILTER_WINDOW_R; m++) { + for (int n=-FILTER_WINDOW_R; n<=FILTER_WINDOW_R; n++) { + uchar4 neigh = tex2D<uchar4>(t, u+n, v+m); + neigh_sq += neigh*neigh; + neigh_sum += neigh; + } + } + } + } + + // Texture map filtering + int tm = (neigh_sq / (FILTER_WINDOW*FILTER_WINDOW)) - + ((neigh_sum*neigh_sum) / (FILTER_WINDOW*FILTER_WINDOW)); + + if (tm < thesh) { + f(u,v) = disp; + } else { + f(u,v) = NAN; + } +} + +namespace ftl { +namespace cuda { + void texture_filter(const TextureObject<uchar4> &t, const TextureObject<float> &d, + TextureObject<float> &f, int num_disp, int thresh) { + dim3 grid(1,1,1); + dim3 threads(128, 1, 1); + grid.x = cv::cuda::device::divUp(disp.width(), 128); + grid.y = cv::cuda::device::divUp(disp.height(), 1); + texture_filter_kernel<<<grid, threads>>> + t.cudaTexture(), + d.cudaTexture(), + f, + num_disp, + thresh); + cudaSafeCall( cudaGetLastError() ); + } +} +} +