Skip to content
Snippets Groups Projects
Commit a2851649 authored by Nicolas Pope's avatar Nicolas Pope
Browse files

Refactor cuda textures into object wrapper

parent 9d1531a0
No related branches found
No related tags found
No related merge requests found
#ifndef _FTL_CUDA_COMMON_HPP_
#define _FTL_CUDA_COMMON_HPP_
#if defined HAVE_CUDA
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda/common.hpp>
/* Grid stride loop macros */
#define STRIDE_Y(I,N) int I = blockIdx.y * blockDim.y + threadIdx.y; I < N; I += blockDim.y * gridDim.y
#define STRIDE_X(I,N) int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x
namespace ftl {
namespace cuda {
template <typename T>
class TextureObject {
public:
TextureObject() : texobj_(0), ptr_(nullptr) {};
TextureObject(const cv::cuda::PtrStepSz<T> &d);
TextureObject(T *ptr, int pitch, int width, int height);
TextureObject(int width, int height);
TextureObject(const TextureObject &t);
~TextureObject();
int getPitch();
T *devicePtr() { return ptr_; };
cudaTextureObject_t cudaTexture() const { return texobj_; }
__device__ inline T tex2D(int u, int v) { return ::tex2D<T>(texobj_, u, v); }
__device__ inline T tex2D(float u, float v) { return ::tex2D<T>(texobj_, u, v); }
inline const T &operator()(int u, int v) const { return ptr_[u+v*pitch_]; }
inline T &operator()(int u, int v) { return ptr_[u+v*pitch_]; }
void free() {
if (texobj_ != 0) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
if (ptr_ && needsfree_) cudaFree(ptr_);
ptr_ = nullptr;
texobj_ = 0;
}
private:
cudaTextureObject_t texobj_;
int pitch_;
int width_;
int height_;
T *ptr_;
bool needsfree_;
//bool needsdestroy_;
};
/**
* Create a 2D array texture from an OpenCV GpuMat object.
*/
template <typename T>
TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d.data;
resDesc.res.pitch2D.pitchInBytes = d.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
resDesc.res.pitch2D.width = d.cols;
resDesc.res.pitch2D.height = d.rows;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
texobj_ = tex;
pitch_ = d.step;
ptr_ = d.data;
width_ = d.cols;
height_ = d.rows;
needsfree_ = false;
//needsdestroy_ = true;
}
/**
* Create a 2D array texture object using a cudaMallocPitch device pointer.
* The texture object returned must be destroyed by the caller.
*/
template <typename T>
TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = ptr;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
texobj_ = tex;
pitch_ = pitch;
ptr_ = ptr;
width_ = width;
height_ = height;
needsfree_ = false;
//needsdestroy_ = true;
}
template <typename T>
TextureObject<T>::TextureObject(int width, int height) {
cudaMallocPitch(&ptr_,&pitch_,width*sizeof(T),height);
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = ptr_;
resDesc.res.pitch2D.pitchInBytes = pitch_;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
texobj_ = tex;
width_ = width;
height_ = height;
needsfree_ = true;
//needsdestroy_ = true;
}
template <typename T>
TextureObject<T>::TextureObject(const TextureObject &p) {
texobj_ = p.texobj_;
ptr_ = p.ptr_;
width_ = p.width_;
height_ = p.height_;
pitch_ = p.pitch_;
needsfree_ = p.needsfree_;
//needsdestroy_ = false;
}
template <typename T>
TextureObject<T>::~TextureObject() {
//if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
//if (needsfree_) cudaFree(ptr_);
}
}
}
#endif // HAVE_CUDA
#endif // FTL_CUDA_COMMON_HPP_
......@@ -11,14 +11,11 @@
*
*/
#include <opencv2/core/cuda/common.hpp>
#include <ftl/cuda_common.hpp>
using namespace cv::cuda;
using namespace cv;
/* Grid stride loop macros */
#define STRIDE_Y(I,N) int I = blockIdx.y * blockDim.y + threadIdx.y; I < N; I += blockDim.y * gridDim.y
#define STRIDE_X(I,N) int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x
#define BLOCK_W 60
#define RADIUS 7
......@@ -32,45 +29,7 @@ namespace gpu {
// --- SUPPORT -----------------------------------------------------------------
template <typename T>
cudaTextureObject_t makeTexture2D(const PtrStepSz<T> &d) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d.data;
resDesc.res.pitch2D.pitchInBytes = d.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
resDesc.res.pitch2D.width = d.cols;
resDesc.res.pitch2D.height = d.rows;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
return tex;
}
template <typename T>
cudaTextureObject_t makeTexture2D(void *ptr, int pitch, int width, int height) {
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = ptr;
resDesc.res.pitch2D.pitchInBytes = pitch;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
resDesc.res.pitch2D.width = width;
resDesc.res.pitch2D.height = height;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
return tex;
}
/*
* Sparse 16x16 census (so 8x8) creating a 64bit mask
......@@ -270,7 +229,10 @@ __global__ void consistency_kernel(cudaTextureObject_t d_sub_l,
for (STRIDE_Y(v,h)) {
for (STRIDE_X(u,w)) {
float a = (int)tex2D<float>(d_sub_l, u, v);
if (u-a < 0) continue;
if (u-a < 0) {
//disp[v*pitch+u] = a;
continue;
}
auto b = tex2D<float>(d_sub_r, u-a, v);
......@@ -281,98 +243,76 @@ __global__ void consistency_kernel(cudaTextureObject_t d_sub_l,
}
/*#define FILTER_WINDOW 31
#define FILTER_WINDOW_R 15
#define FILTER_SIM_THRESH 5
#define FILTER_DISP_THRESH 2.0f
__global__ void filter_kernel_old(cudaTextureObject_t t, cudaTextureObject_t d,
cudaTextureObject_t prevD,
cudaTextureObject_t prevT, PtrStepSz<float> f, int num_disp) {
size_t u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS;
size_t v = blockIdx.y + RADIUS;
float disp = tex2D<float>(d,u,v);
cudaTextureObject_t nTex = (prevT) ? prevT : t;
cudaTextureObject_t nDisp = (prevD) ? prevD : d;
float pdisp = tex2D<float>(nDisp,u,v);
if (isnan(pdisp)) pdisp = disp;
if (isnan(disp)) disp = pdisp;
uchar4 pixel = tex2D<uchar4>(t, u, v);
uchar4 ppixel = tex2D<uchar4>(nTex, u, v);
float est = 0.0f; //(isnan(disp)) ? tex2D<float>(prev, u, v) : disp;
int nn = 0; //(isnan(disp)) ? 0 : 1;
int neigh_sq = 0;
int neigh_sum = 0;
if (isnan(pdisp)) {
f(v,u) = disp;
} else if (!isnan(disp) && abs(pdisp-disp) <= FILTER_DISP_THRESH) {
f(v,u) = (disp+pdisp) / 2;
} else {
f(v,u) = disp;
}
return;
template <typename T>
__host__ __device__
inline T lerp(T v0, T v1, T t) {
return fma(t, v1, fma(-t, v0, v0));
}
#define FILTER_WINDOW 21
#define FILTER_WINDOW_R 10
#define EDGE_SENSITIVITY 10.0f
__device__ float calculate_edge_disp(cudaTextureObject_t t, cudaTextureObject_t d, cudaTextureObject_t pT, cudaTextureObject_t pD, uchar4 pixel, int u, int v) {
float est = 0.0;
int nn = 0;
//float pest = 0.0;
//int pnn = 0;
//if (!isnan(pdisp) && isnan(disp) && colour_error(pixel,ppixel) <= FILTER_SIM_THRESH) {
// disp = pdisp;
//}
//cudaTextureObject_t nTex = (pT) ? pT : t;
//cudaTextureObject_t nDisp = (pD) ? pD : d;
for (int m=-FILTER_WINDOW_R; m<=FILTER_WINDOW_R; m++) {
for (int n=-FILTER_WINDOW_R; n<=FILTER_WINDOW_R; n++) {
uchar4 neigh = tex2D<uchar4>(t, u+n, v+m);
//neigh_sq += neigh*neigh;
//neigh_sum += neigh;
float ndisp = tex2D<float>(d,u+n,v+m);
if (isnan(ndisp)) {
ndisp = tex2D<float>(nDisp,u+n,v+m);
neigh = tex2D<uchar4>(nTex, u+n, v+m);
}
//uchar4 pneigh = tex2D<uchar4>(nTex, u+n, v+m);
//float pndisp = tex2D<float>(nDisp,u+n,v+m);
//if (isnan(tex2D<float>(nDisp,u+n,v+m))) continue;
if (m == 0 && n == 0) continue;
//if (m == 0 && n == 0) continue;
if (!isnan(ndisp) && (colour_error(neigh,pixel) <= FILTER_SIM_THRESH)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) {
if (!isnan(ndisp) && (abs(neigh.z-pixel.z) <= EDGE_SENSITIVITY)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) {
est += ndisp;
nn++;
}
//if (!isnan(pndisp) && (abs(pneigh.z-pixel.z) <= EDGE_SENSITIVITY)) { // && (isnan(disp) || abs(ndisp-disp) < FILTER_DISP_THRESH)) {
// pest += pndisp;
// pnn++;
//}
}
}
est = (nn > 0) ? est/nn : NAN;
//pest = (pnn > 0) ? pest/pnn : NAN;
return est;
}
// Texture map filtering
//int tm = (neigh_sq / (FILTER_WINDOW*FILTER_WINDOW)) - ((neigh_sum*neigh_sum) / (FILTER_WINDOW*FILTER_WINDOW));
//if (tm >= -5000000) {
// nn = 0;
//}
// ) {
if (nn > 10) {
f(v,u) = (est+disp) / (nn+1);
} else if (!isnan(pdisp) && colour_error(pixel,ppixel) <= FILTER_SIM_THRESH) {
f(v,u) = pdisp;
} else f(v,u) = disp;
//} else {
// f(v,u) = NAN;
//}
}*/
__device__ int colour_error(uchar4 v1, uchar4 v2) {
int dx = abs(v1.x-v2.x);
int dz = abs(v1.z-v2.z);
return dx*dx + dz*dz;
__device__ float colour_error(uchar4 v1, uchar4 v2) {
float dx = 0.05*abs(v1.x-v2.x);
float dy = 0.1*abs(v1.y-v2.y);
float dz = 0.85*abs(v1.z-v2.z);
return dx + dz + dy;
}
// TODO Use HUE also and perhaps increase window?
// Or use more complex notion of texture?
/* Just crossed and currently on edge */
__device__ bool is_edge_left(uchar4 *line, int x, int n) {
if (x < 1 || x >= n-1) return false;
return (abs(line[x-1].z-line[x].z) > 15 && abs(line[x].z-line[x+1].z) <= 15);
return (colour_error(line[x-1],line[x]) > EDGE_SENSITIVITY && colour_error(line[x],line[x+1]) <= EDGE_SENSITIVITY);
}
/* Just crossed but not on edge now */
__device__ bool is_edge_right(uchar4 *line, int x, int n) {
if (x < 1 || x >= n-1) return false;
return (abs(line[x-1].z-line[x].z) <= 15 && abs(line[x].z-line[x+1].z) > 15);
return (colour_error(line[x-1],line[x]) <= EDGE_SENSITIVITY && colour_error(line[x],line[x+1]) > EDGE_SENSITIVITY);
}
__global__ void filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d,
......@@ -389,23 +329,69 @@ __global__ void filter_kernel(cudaTextureObject_t t, cudaTextureObject_t d,
__syncthreads();
for (STRIDE_X(u,f.cols)) {
if (is_edge_left(line, u, f.cols)) {
float edge_disp = tex2D<float>(d, u, v);
if (is_edge_right(line, u, f.cols)) {
float edge_disp = calculate_edge_disp(t,d,prevT,prevD,line[u],u+2,v); // tex2D<float>(d, u, v);
f(v,u) = edge_disp;
continue;
float est = 0.0f;
int nn = 0;
if (!isnan(edge_disp)) {
est += edge_disp;
nn++;
}
//f(v,u) = edge_disp;
// TODO, find other edge first to get disparity
// Use middle disparities to:
// estimate curve or linear (choose equation)
// or ignore as noise if impossible
// TODO For edge disparity, use a window to:
// a) find a missing disparity
// b) make sure disparity has some consensus (above or below mostly)
// TODO Use past values?
// Another way to fill blanks and gain concensus
// TODO Maintain a disparity stack to pop back to background?
// Issue of background disparity not being detected.
// Only if hsv also matches previous background
// TODO Edge prediction (in vertical direction at least) could
// help fill both edge and disparity gaps. Propagate disparity
// along edges
for (int i=1; u+i<f.cols; i++) {
if (is_edge_right(line, u+i, f.cols)) break;
float last_disp = edge_disp;
int i;
for (i=1; u+i<f.cols; i++) {
if (is_edge_right(line, u+i, f.cols)) {
//float end_disp = calculate_edge_disp(t,d,prevT,prevD,line[u+i-1],u+i-3,v);
//if (!isnan(end_disp)) last_disp = end_disp;
break;
}
float di = tex2D<float>(d,u+i,v);
if (!isnan(di)) edge_disp = di;
f(v,u+i) = edge_disp;
if (!isnan(di)) {
est += di;
nn++;
}
//f(v,u+i) = edge_disp;
}
}// else f(v,u) = NAN;
est = (nn > 0) ? est / nn : NAN;
//for (int j=1; j<i; j++) {
// f(v,u+j) = est; //lerp(edge_disp, last_disp, (float)j / (float)(i-1));
//}
} else f(v,u) = NAN;
}
}
}
cudaTextureObject_t prevDisp = 0;
cudaTextureObject_t prevImage = 0;
ftl::cuda::TextureObject<float> prevDisp;
ftl::cuda::TextureObject<uchar4> prevImage;
void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const PtrStepSz<float> &disp, size_t num_disp, const int &stream) {
dim3 grid(1,1,1);
......@@ -441,58 +427,60 @@ void rtcensus_call(const PtrStepSz<uchar4> &l, const PtrStepSz<uchar4> &r, const
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t texLeft = makeTexture2D<uchar4>(l);
cudaTextureObject_t texRight = makeTexture2D<uchar4>(r);
ftl::cuda::TextureObject<uchar4> texLeft(l);
ftl::cuda::TextureObject<uchar4> texRight(r);
//size_t smem_size = (2 * l.cols * l.rows) * sizeof(uint64_t);
// Calculate L and R census
census_kernel<<<grid, threads>>>(texLeft, texRight, l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t));
census_kernel<<<grid, threads>>>(texLeft.cudaTexture(), texRight.cudaTexture(), l.cols, l.rows, censusL, censusR, pitchL/sizeof(uint64_t), pitchR/sizeof(uint64_t));
cudaSafeCall( cudaGetLastError() );
//cudaSafeCall( cudaDeviceSynchronize() );
cudaTextureObject_t censusTexLeft = makeTexture2D<uint2>(censusL, pitchL, l.cols, l.rows);
cudaTextureObject_t censusTexRight = makeTexture2D<uint2>(censusR, pitchR, r.cols, r.rows);
ftl::cuda::TextureObject<uint2> censusTexLeft((uint2*)censusL, pitchL, l.cols, l.rows);
ftl::cuda::TextureObject<uint2> censusTexRight((uint2*)censusR, pitchR, r.cols, r.rows);
grid.x = cv::cuda::device::divUp(l.cols - 2 * RADIUS2, BLOCK_W);
grid.y = cv::cuda::device::divUp(l.rows - 2 * RADIUS2, ROWSperTHREAD);
// Calculate L and R disparities
disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft, censusTexRight, num_disp);
disp_kernel<<<grid, threads>>>(disp_l, disp_r, pitchDL/sizeof(float), pitchDR/sizeof(float), l.cols, l.rows, censusTexLeft.cudaTexture(), censusTexRight.cudaTexture(), num_disp);
cudaSafeCall( cudaGetLastError() );
cudaTextureObject_t dispTexLeft = makeTexture2D<float>(disp_l, pitchDL, l.cols, l.rows);
cudaTextureObject_t dispTexRight = makeTexture2D<float>(disp_r, pitchDR, r.cols, r.rows);
ftl::cuda::TextureObject<float> dispTexLeft(disp_l, pitchDL, l.cols, l.rows);
ftl::cuda::TextureObject<float> dispTexRight(disp_r, pitchDR, r.cols, r.rows);
// Check consistency between L and R disparities.
consistency_kernel<<<grid, threads>>>(dispTexLeft, dispTexRight, disp_raw, l.cols, l.rows, pitchD/sizeof(float));
consistency_kernel<<<grid, threads>>>(dispTexLeft.cudaTexture(), dispTexRight.cudaTexture(), disp_raw, l.cols, l.rows, pitchD/sizeof(float));
cudaSafeCall( cudaGetLastError() );
cudaTextureObject_t dispTex = makeTexture2D<float>(disp_raw, pitchD, r.cols, r.rows);
ftl::cuda::TextureObject<float> dispTex(disp_raw, pitchD, r.cols, r.rows);
grid.x = 4;
grid.y = l.rows;
threads.x = l.cols;
size_t filter_smem = sizeof(uchar4) * l.cols;
filter_kernel<<<grid, threads, filter_smem>>>(texLeft, dispTex, prevDisp, prevImage, disp, num_disp);
filter_kernel<<<grid, threads, filter_smem>>>(texLeft.cudaTexture(), dispTex.cudaTexture(), prevDisp.cudaTexture(), prevImage.cudaTexture(), disp, num_disp);
cudaSafeCall( cudaGetLastError() );
if (prevDisp) cudaSafeCall( cudaDestroyTextureObject (prevDisp) );
prevDisp = makeTexture2D<float>(disp);
if (prevImage) cudaSafeCall( cudaDestroyTextureObject (prevImage) );
prevDisp.free();
prevDisp = disp;
prevImage.free();
prevImage = texLeft;
//if (&stream == Stream::Null())
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaDestroyTextureObject (texLeft) );
cudaSafeCall( cudaDestroyTextureObject (texRight) );
cudaSafeCall( cudaDestroyTextureObject (censusTexLeft) );
cudaSafeCall( cudaDestroyTextureObject (censusTexRight) );
cudaSafeCall( cudaDestroyTextureObject (dispTexLeft) );
cudaSafeCall( cudaDestroyTextureObject (dispTexRight) );
cudaSafeCall( cudaDestroyTextureObject (dispTex) );
texRight.free();
censusTexLeft.free();
censusTexRight.free();
dispTexLeft.free();
dispTexRight.free();
dispTex.free();
cudaFree(disp_r);
cudaFree(disp_l);
cudaFree(censusL);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment