From a2851649e4b9db64683c3b5c9aeff83266be25a6 Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nwpope@utu.fi> Date: Sun, 24 Mar 2019 18:27:35 +0200 Subject: [PATCH] Refactor cuda textures into object wrapper --- cv-node/include/ftl/cuda_common.hpp | 159 +++++++++++++++++ cv-node/src/algorithms/rtcensus.cu | 268 +++++++++++++--------------- 2 files changed, 287 insertions(+), 140 deletions(-) create mode 100644 cv-node/include/ftl/cuda_common.hpp diff --git a/cv-node/include/ftl/cuda_common.hpp b/cv-node/include/ftl/cuda_common.hpp new file mode 100644 index 000000000..99bd131a7 --- /dev/null +++ b/cv-node/include/ftl/cuda_common.hpp @@ -0,0 +1,159 @@ +#ifndef _FTL_CUDA_COMMON_HPP_ +#define _FTL_CUDA_COMMON_HPP_ + +#if defined HAVE_CUDA + +#include <opencv2/core/cuda.hpp> +#include <opencv2/core/cuda/common.hpp> + +/* Grid stride loop macros */ +#define STRIDE_Y(I,N) int I = blockIdx.y * blockDim.y + threadIdx.y; I < N; I += blockDim.y * gridDim.y +#define STRIDE_X(I,N) int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x + +namespace ftl { +namespace cuda { + +template <typename T> +class TextureObject { + public: + TextureObject() : texobj_(0), ptr_(nullptr) {}; + TextureObject(const cv::cuda::PtrStepSz<T> &d); + TextureObject(T *ptr, int pitch, int width, int height); + TextureObject(int width, int height); + TextureObject(const TextureObject &t); + ~TextureObject(); + + int getPitch(); + T *devicePtr() { return ptr_; }; + cudaTextureObject_t cudaTexture() const { return texobj_; } + __device__ inline T tex2D(int u, int v) { return ::tex2D<T>(texobj_, u, v); } + __device__ inline T tex2D(float u, float v) { return ::tex2D<T>(texobj_, u, v); } + + inline const T &operator()(int u, int v) const { return ptr_[u+v*pitch_]; } + inline T &operator()(int u, int v) { return ptr_[u+v*pitch_]; } + + void free() { + if (texobj_ != 0) cudaSafeCall( cudaDestroyTextureObject (texobj_) ); + if (ptr_ && needsfree_) cudaFree(ptr_); + ptr_ = nullptr; + texobj_ = 0; + } + + private: + cudaTextureObject_t texobj_; + int pitch_; + int width_; + int height_; + T *ptr_; + bool needsfree_; + //bool needsdestroy_; +}; + +/** + * Create a 2D array texture from an OpenCV GpuMat object. + */ +template <typename T> +TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = d.data; + resDesc.res.pitch2D.pitchInBytes = d.step; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = d.cols; + resDesc.res.pitch2D.height = d.rows; + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + cudaTextureObject_t tex = 0; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + texobj_ = tex; + pitch_ = d.step; + ptr_ = d.data; + width_ = d.cols; + height_ = d.rows; + needsfree_ = false; + //needsdestroy_ = true; +} + +/** + * Create a 2D array texture object using a cudaMallocPitch device pointer. + * The texture object returned must be destroyed by the caller. + */ +template <typename T> +TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = ptr; + resDesc.res.pitch2D.pitchInBytes = pitch; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = width; + resDesc.res.pitch2D.height = height; + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + cudaTextureObject_t tex = 0; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + texobj_ = tex; + pitch_ = pitch; + ptr_ = ptr; + width_ = width; + height_ = height; + needsfree_ = false; + //needsdestroy_ = true; +} + +template <typename T> +TextureObject<T>::TextureObject(int width, int height) { + cudaMallocPitch(&ptr_,&pitch_,width*sizeof(T),height); + + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = ptr_; + resDesc.res.pitch2D.pitchInBytes = pitch_; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = width; + resDesc.res.pitch2D.height = height; + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + cudaTextureObject_t tex = 0; + cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); + texobj_ = tex; + width_ = width; + height_ = height; + needsfree_ = true; + //needsdestroy_ = true; +} + +template <typename T> +TextureObject<T>::TextureObject(const TextureObject &p) { + texobj_ = p.texobj_; + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + needsfree_ = p.needsfree_; + //needsdestroy_ = false; +} + +template <typename T> +TextureObject<T>::~TextureObject() { + //if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) ); + //if (needsfree_) cudaFree(ptr_); +} + +} +} + +#endif // HAVE_CUDA + +#endif // FTL_CUDA_COMMON_HPP_ diff --git a/cv-node/src/algorithms/rtcensus.cu b/cv-node/src/algorithms/rtcensus.cu index a76bdbfef..ece058b09 100644 --- a/cv-node/src/algorithms/rtcensus.cu +++ b/cv-node/src/algorithms/rtcensus.cu @@ -11,14 +11,11 @@ * */ -#include <opencv2/core/cuda/common.hpp> +#include <ftl/cuda_common.hpp> using namespace cv::cuda; using namespace cv; -/* Grid stride loop macros */ -#define STRIDE_Y(I,N) int I = blockIdx.y * blockDim.y + threadIdx.y; I < N; I += blockDim.y * gridDim.y -#define STRIDE_X(I,N) int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x #define BLOCK_W 60 #define RADIUS 7 @@ -32,45 +29,7 @@ namespace gpu { // --- SUPPORT ----------------------------------------------------------------- -template <typename T> -cudaTextureObject_t makeTexture2D(const PtrStepSz<T> &d) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = d.data; - resDesc.res.pitch2D.pitchInBytes = d.step; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = d.cols; - resDesc.res.pitch2D.height = d.rows; - - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - - cudaTextureObject_t tex = 0; - cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); - return tex; -} - -template <typename T> -cudaTextureObject_t makeTexture2D(void *ptr, int pitch, int width, int height) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = ptr; - resDesc.res.pitch2D.pitchInBytes = pitch; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = width; - resDesc.res.pitch2D.height = height; - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - - cudaTextureObject_t tex = 0; - cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); - return tex; -} /* * Sparse 16x16 census (so 8x8) creating a 64bit mask @@ -270,7 +229,10 @@ __global__ void consistency_kernel(cudaTextureObject_t d_sub_l, for (STRIDE_Y(v,h)) { for (STRIDE_X(u,w)) { float a = (int)tex2D<float>(d_sub_l, u, v); - if (u-a < 0) continue; + if (u-a < 0) { + //disp[v*pitch+u] = a; + continue; + } auto b = tex2D<float>(d_sub_r, u-a, v); @@ -281,98 +243,76 @@ __global__ void consistency_kernel(cudaTextureObject_t d_sub_l, } -/*#define FILTER_WINDOW 31 -#define FILTER_WINDOW_R 15 -#define FILTER_SIM_THRESH 5 -#define FILTER_DISP_THRESH 2.0f -__global__ void filter_kernel_old(cudaTextureObject_t t, cudaTextureObject_t d, - cudaTextureObject_t prevD, - cudaTextureObject_t prevT, PtrStepSz<float> f, int num_disp) { - size_t u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS; - size_t v = blockIdx.y + RADIUS; - - float disp = tex2D<float>(d,u,v); - - cudaTextureObject_t nTex = (prevT) ? prevT : t; - cudaTextureObject_t nDisp = (prevD) ? prevD : d; - - float pdisp = tex2D<float>(nDisp,u,v); - if (isnan(pdisp)) pdisp = disp; - if (isnan(disp)) disp = pdisp; - uchar4 pixel = tex2D<uchar4>(t, u, v); - uchar4 ppixel = tex2D<uchar4>(nTex, u, v); - float est = 0.0f; //(isnan(disp)) ? tex2D<float>(prev, u, v) : disp; - int nn = 0; //(isnan(disp)) ? 0 : 1; - int neigh_sq = 0; - int neigh_sum = 0; - - if (isnan(pdisp)) { - f(v,u) = disp; - } else if (!isnan(disp) && abs(pdisp-disp) <= FILTER_DISP_THRESH) { - f(v,u) = (disp+pdisp) / 2; - } else { - f(v,u) = disp; - } - return; + +template <typename T> +__host__ __device__ +inline T lerp(T v0, T v1, T t) { + return fma(t, v1, fma(-t, v0, v0)); +} + +#define FILTER_WINDOW 21 +#define FILTER_WINDOW_R 10 +#define EDGE_SENSITIVITY 10.0f + +__device__ float calculate_edge_disp(cudaTextureObject_t t, cudaTextureObject_t d, cudaTextureObject_t pT, cudaTextureObject_t pD, uchar4 pixel, int u, int v) { + float est = 0.0; + int nn = 0; + //float pest = 0.0; + //int pnn = 0; - //if (!isnan(pdisp) && isnan(disp) && colour_error(pixel,ppixel) <= FILTER_SIM_THRESH) { - // disp = pdisp; - //} + //cudaTextureObject_t nTex = (pT) ? pT : t; + //cudaTextureObject_t nDisp = (pD) ? pD : d; for (int m=-FILTER_WINDOW_R; m<=FILTER_WINDOW_R; m++) { for (int n=-FILTER_WINDOW_R; n<=FILTER_WINDOW_R; n++) { uchar4 neigh = tex2D<uchar4>(t, u+n, v+m); - //neigh_sq += neigh*neigh; - //neigh_sum += neigh; - float ndisp = tex2D<float>(d,u+n,v+m); - if (isnan(ndisp)) { - ndisp = tex2D<float>(nDisp,u+n,v+m); - neigh = tex2D<uchar4>(nTex, u+n, v+m); - } + + //uchar4 pneigh = tex2D<uchar4>(nTex, u+n, v+m); + //float pndisp = tex2D<float>(nDisp,u+n,v+m); //if (isnan(tex2D<float>(nDisp,u+n,v+m))) continue; - if (m == 0 && n == 0) continue; + //if (m == 0 && n == 0) continue; - if (!isnan(ndisp) && (colour_error(neigh,pixel) <= FILTER_SIM_THRESH)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) { + if (!isnan(ndisp) && (abs(neigh.z-pixel.z) <= EDGE_SENSITIVITY)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) { est += ndisp; nn++; } + + //if (!isnan(pndisp) && (abs(pneigh.z-pixel.z) <= EDGE_SENSITIVITY)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) { + // pest += pndisp; + // pnn++; + //} } } + + est = (nn > 0) ? est/nn : NAN; + //pest = (pnn > 0) ? pest/pnn : NAN; + + return est; +} - // Texture map filtering - //int tm = (neigh_sq / (FILTER_WINDOW*FILTER_WINDOW)) - ((neigh_sum*neigh_sum) / (FILTER_WINDOW*FILTER_WINDOW)); - //if (tm >= -5000000) { - // nn = 0; - //} - // ) { - - if (nn > 10) { - f(v,u) = (est+disp) / (nn+1); - } else if (!isnan(pdisp) && colour_error(pixel,ppixel) <= FILTER_SIM_THRESH) { - f(v,u) = pdisp; - } else f(v,u) = disp; - //} else { - // f(v,u) = NAN; - //} -}*/ - -__device__ int colour_error(uchar4 v1, uchar4 v2) { - int dx = abs(v1.x-v2.x); - int dz = abs(v1.z-v2.z); - return dx*dx + dz*dz; +__device__ float colour_error(uchar4 v1, uchar4 v2) { + float dx = 0.05*abs(v1.x-v2.x); + float dy = 0.1*abs(v1.y-v2.y); + float dz = 0.85*abs(v1.z-v2.z); + return dx + dz + dy; } +// TODO Use HUE also and perhaps increase window? +// Or use more complex notion of texture? + +/* Just crossed and currently on edge */ __device__ bool is_edge_left(uchar4 *line, int x, int n) { if (x < 1 || x >= n-1) return false; - return (abs(line[x-1].z-line[x].z) > 15 && abs(line[x].z-line[x+1].z) <= 15); + return (colour_error(line[x-1],line[x]) > EDGE_SENSITIVITY && colour_error(line[x],line[x+1]) <= EDGE_SENSITIVITY); } +/* Just crossed but not on edge now */ __device__ bool is_edge_right(uchar4 *line, int x, int n) { if (x < 1 || x >= n-1) return false; - return (abs(line[x-1].z-line[x].z) <= 15 && abs(line[x].z-line[x+1].z) > 15); + return (colour_error(line[x-1],line[x]) <= EDGE_SENSITIVITY && colour_error(line[x],line[x+1]) > EDGE_SENSITIVITY); } __global__ void filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d, @@ -389,23 +329,69 @@ __global__ void filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d, __syncthreads(); for (STRIDE_X(u,f.cols)) { - if (is_edge_left(line, u, f.cols)) { - float edge_disp = tex2D<float>(d, u, v); + if (is_edge_right(line, u, f.cols)) { + float edge_disp = calculate_edge_disp(t,d,prevT,prevD,line[u],u+2,v); // tex2D<float>(d, u, v); f(v,u) = edge_disp; + continue; + + float est = 0.0f; + int nn = 0; + + if (!isnan(edge_disp)) { + est += edge_disp; + nn++; + } + //f(v,u) = edge_disp; + + // TODO, find other edge first to get disparity + // Use middle disparities to: + // estimate curve or linear (choose equation) + // or ignore as noise if impossible + + // TODO For edge disparity, use a window to: + // a) find a missing disparity + // b) make sure disparity has some consensus (above or below mostly) + + // TODO Use past values? + // Another way to fill blanks and gain concensus + + // TODO Maintain a disparity stack to pop back to background? + // Issue of background disparity not being detected. + // Only if hsv also matches previous background + + // TODO Edge prediction (in vertical direction at least) could + // help fill both edge and disparity gaps. Propagate disparity + // along edges - for (int i=1; u+i<f.cols; i++) { - if (is_edge_right(line, u+i, f.cols)) break; + float last_disp = edge_disp; + + int i; + for (i=1; u+i<f.cols; i++) { + if (is_edge_right(line, u+i, f.cols)) { + //float end_disp = calculate_edge_disp(t,d,prevT,prevD,line[u+i-1],u+i-3,v); + //if (!isnan(end_disp)) last_disp = end_disp; + break; + } + float di = tex2D<float>(d,u+i,v); - if (!isnan(di)) edge_disp = di; - f(v,u+i) = edge_disp; + if (!isnan(di)) { + est += di; + nn++; + } + //f(v,u+i) = edge_disp; } - }// else f(v,u) = NAN; + + est = (nn > 0) ? est / nn : NAN; + //for (int j=1; j<i; j++) { + // f(v,u+j) = est; //lerp(edge_disp, last_disp, (float)j / (float)(i-1)); + //} + } else f(v,u) = NAN; } } } -cudaTextureObject_t prevDisp = 0; -cudaTextureObject_t prevImage = 0; +ftl::cuda::TextureObject<float> prevDisp; +ftl::cuda::TextureObject<uchar4> prevImage; void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) { dim3 grid(1,1,1); @@ -441,58 +427,60 @@ void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const memset(&texDesc, 0, sizeof(texDesc)); texDesc.readMode = cudaReadModeElementType; - cudaTextureObject_t texLeft = makeTexture2D<uchar4>(l); - cudaTextureObject_t texRight = makeTexture2D<uchar4>(r); + ftl::cuda::TextureObject<uchar4> texLeft(l); + ftl::cuda::TextureObject<uchar4> texRight(r); //size_t smem_size = (2 * l.cols * l.rows) * sizeof(uint64_t); // Calculate L and R census - census_kernel<<<grid, threads>>>(texLeft, texRight, l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t)); + census_kernel<<<grid, threads>>>(texLeft.cudaTexture(), texRight.cudaTexture(), l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t)); cudaSafeCall( cudaGetLastError() ); //cudaSafeCall( cudaDeviceSynchronize() ); - cudaTextureObject_t censusTexLeft = makeTexture2D<uint2>(censusL, pitchL, l.cols, l.rows); - cudaTextureObject_t censusTexRight = makeTexture2D<uint2>(censusR, pitchR, r.cols, r.rows); + ftl::cuda::TextureObject<uint2> censusTexLeft((uint2*)censusL, pitchL, l.cols, l.rows); + ftl::cuda::TextureObject<uint2> censusTexRight((uint2*)censusR, pitchR, r.cols, r.rows); grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W); grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD); // Calculate L and R disparities - disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft, censusTexRight, num_disp); + disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), num_disp); cudaSafeCall( cudaGetLastError() ); - cudaTextureObject_t dispTexLeft = makeTexture2D<float>(disp_l, pitchDL, l.cols, l.rows); - cudaTextureObject_t dispTexRight = makeTexture2D<float>(disp_r, pitchDR, r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTexLeft(disp_l, pitchDL, l.cols, l.rows); + ftl::cuda::TextureObject<float> dispTexRight(disp_r, pitchDR, r.cols, r.rows); // Check consistency between L and R disparities. - consistency_kernel<<<grid, threads>>>(dispTexLeft, dispTexRight, disp_raw, l.cols, l.rows, pitchD/sizeof(float)); + consistency_kernel<<<grid, threads>>>(dispTexLeft.cudaTexture(), dispTexRight.cudaTexture(), disp_raw, l.cols, l.rows, pitchD/sizeof(float)); cudaSafeCall( cudaGetLastError() ); - cudaTextureObject_t dispTex = makeTexture2D<float>(disp_raw, pitchD, r.cols, r.rows); + ftl::cuda::TextureObject<float> dispTex(disp_raw, pitchD, r.cols, r.rows); grid.x = 4; grid.y = l.rows; threads.x = l.cols; size_t filter_smem = sizeof(uchar4) * l.cols; - filter_kernel<<<grid, threads, filter_smem>>>(texLeft, dispTex, prevDisp, prevImage, disp, num_disp); + filter_kernel<<<grid, threads, filter_smem>>>(texLeft.cudaTexture(), dispTex.cudaTexture(), prevDisp.cudaTexture(), prevImage.cudaTexture(), disp, num_disp); cudaSafeCall( cudaGetLastError() ); - if (prevDisp) cudaSafeCall( cudaDestroyTextureObject (prevDisp) ); - prevDisp = makeTexture2D<float>(disp); - if (prevImage) cudaSafeCall( cudaDestroyTextureObject (prevImage) ); + prevDisp.free(); + prevDisp = disp; + prevImage.free(); prevImage = texLeft; //if (&stream == Stream::Null()) cudaSafeCall( cudaDeviceSynchronize() ); //cudaSafeCall( cudaDestroyTextureObject (texLeft) ); - cudaSafeCall( cudaDestroyTextureObject (texRight) ); - cudaSafeCall( cudaDestroyTextureObject (censusTexLeft) ); - cudaSafeCall( cudaDestroyTextureObject (censusTexRight) ); - cudaSafeCall( cudaDestroyTextureObject (dispTexLeft) ); - cudaSafeCall( cudaDestroyTextureObject (dispTexRight) ); - cudaSafeCall( cudaDestroyTextureObject (dispTex) ); + + texRight.free(); + censusTexLeft.free(); + censusTexRight.free(); + dispTexLeft.free(); + dispTexRight.free(); + dispTex.free(); + cudaFree(disp_r); cudaFree(disp_l); cudaFree(censusL); -- GitLab