diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp index 00cb8554f5b4dceabe029aaf9c4ea78f499fe885..ca11bf526ef0e6e141861a79a15e2bf656106a5c 100644 --- a/applications/gui/src/camera.cpp +++ b/applications/gui/src/camera.cpp @@ -423,14 +423,14 @@ void ftl::gui::Camera::_downloadFrames(ftl::rgbd::Frame *frame) { } else if (channel_ == Channel::ColourNormals && frame->hasChannel(Channel::Depth)) { // We can calculate normals here. ftl::cuda::normals( - frame->createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())), + frame->createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())), frame->createTexture<float>(Channel::Depth), frame->getLeftCamera(), 0 ); frame->create<GpuMat>(Channel::ColourNormals, ftl::rgbd::Format<uchar4>(frame->get<cv::cuda::GpuMat>(Channel::Depth).size())).setTo(cv::Scalar(0,0,0,0), cv::cuda::Stream::Null()); - ftl::cuda::normal_visualise(frame->getTexture<float4>(Channel::Normals), frame->createTexture<uchar4>(Channel::ColourNormals), + ftl::cuda::normal_visualise(frame->getTexture<half4>(Channel::Normals), frame->createTexture<uchar4>(Channel::ColourNormals), make_float3(0.3f,0.3f,0.3f), make_uchar4(200,200,200,255), make_uchar4(50,50,50,255), 0); diff --git a/components/common/cpp/include/ftl/cuda_buffer.hpp b/components/common/cpp/include/ftl/cuda_buffer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..45abe03c733d222deec030873753dfa7c6b7c900 --- /dev/null +++ b/components/common/cpp/include/ftl/cuda_buffer.hpp @@ -0,0 +1,213 @@ +#ifndef _FTL_CUDA_BUFFER_HPP_ +#define _FTL_CUDA_BUFFER_HPP_ + +namespace ftl { +namespace cuda { + +class BufferBase { + public: + __host__ __device__ BufferBase() + : pitch_(0), pitch2_(0), width_(0), height_(0), + ptr_(nullptr), needsfree_(false), + cvType_(-1) {}; + ~BufferBase(); + + BufferBase(const BufferBase &)=delete; + BufferBase &operator=(const BufferBase &)=delete; + + BufferBase(BufferBase &&); + BufferBase &operator=(BufferBase &&); + + inline size_t pitch() const { return pitch_; } + inline size_t pixelPitch() const { return pitch2_; } + inline uchar *devicePtr() const { return ptr_; }; + __host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; } + __host__ __device__ inline int width() const { return width_; } + __host__ __device__ inline int height() const { return height_; } + + void upload(const cv::Mat &, cudaStream_t stream=0); + void download(cv::Mat &, cudaStream_t stream=0) const; + + __host__ void free(); + + inline int cvType() const { return cvType_; } + + protected: + size_t pitch_; + size_t pitch2_; // in T units + int width_; + int height_; + uchar *ptr_; // Device memory pointer + bool needsfree_; // We manage memory, so free it + int cvType_; // Used to validate casting +}; + + +template <typename T> +class Buffer : public BufferBase { + public: + typedef T type; + + __host__ __device__ Buffer() : BufferBase() {}; + explicit Buffer(const cv::cuda::GpuMat &d); + explicit Buffer(const cv::cuda::PtrStepSz<T> &d); + Buffer(T *ptr, int pitch, int width, int height); + Buffer(size_t width, size_t height); + Buffer(const Buffer<T> &t); + __host__ __device__ Buffer(Buffer<T> &&); + ~Buffer(); + + Buffer<T> &operator=(const Buffer<T> &); + __host__ __device__ Buffer<T> &operator=(Buffer<T> &&); + + operator cv::cuda::GpuMat(); + + void create(const cv::Size &); + void create(int w, int h); + + __host__ __device__ T *devicePtr() const { return (T*)(ptr_); }; + __host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; } + + __host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + __host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + + /** + * Cast a base texture object to this type of texture object. If the + * underlying pixel types do not match then a bad_cast exception is thrown. + */ + static Buffer<T> &cast(BufferBase &); +}; + +#ifndef __CUDACC__ +template <typename T> +Buffer<T> &Buffer<T>::cast(BufferBase &b) { + if (b.cvType() != ftl::traits::OpenCVType<T>::value) { + //LOG(ERROR) << "Bad cast of texture object"; + throw std::bad_cast(); + } + return reinterpret_cast<Buffer<T>&>(b); +} + +/** + * Create a 2D array texture from an OpenCV GpuMat object. + */ +template <typename T> +Buffer<T>::Buffer(const cv::cuda::GpuMat &d) { + pitch_ = d.step; + pitch2_ = pitch_ / sizeof(T); + ptr_ = d.data; + width_ = d.cols; + height_ = d.rows; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +#endif // __CUDACC__ + +template <typename T> +Buffer<T>::Buffer(const cv::cuda::PtrStepSz<T> &d) { + pitch_ = d.step; + pitch2_ = pitch_ / sizeof(T); + ptr_ = d.data; + width_ = d.cols; + height_ = d.rows; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +template <typename T> +Buffer<T>::Buffer(T *ptr, int pitch, int width, int height) { + pitch_ = pitch; + pitch2_ = pitch_ / sizeof(T); + ptr_ = ptr; + width_ = width; + height_ = height; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +template <typename T> +Buffer<T>::Buffer(size_t width, size_t height) { + cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height); + width_ = (int)width; + height_ = (int)height; + needsfree_ = true; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +#ifndef __CUDACC__ +template <typename T> +void Buffer<T>::create(const cv::Size &s) { + create(s.width, s.height); +} + +template <typename T> +void Buffer<T>::create(int w, int h) { + if (width_ != w || height_ != h) { + *this = std::move(Buffer<T>(w, h)); + } +} +#endif + +template <typename T> +Buffer<T>::Buffer(const Buffer<T> &p) { + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; + needsfree_ = false; +} + +template <typename T> +Buffer<T>::Buffer(Buffer<T> &&p) { + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + needsfree_ = p.needsfree_; + p.needsfree_ = false; + p.ptr_ = nullptr; + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +template <typename T> +Buffer<T> &Buffer<T>::operator=(const Buffer<T> &p) { + free(); + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; + needsfree_ = false; + return *this; +} + +template <typename T> +Buffer<T> &Buffer<T>::operator=(Buffer<T> &&p) { + free(); + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + needsfree_ = p.needsfree_; + p.needsfree_ = false; + p.ptr_ = nullptr; + cvType_ = ftl::traits::OpenCVType<T>::value; + return *this; +} + +template <typename T> +Buffer<T>::~Buffer() { + free(); +} + +} +} + +#endif diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp index c74c4faa0868c3e352a41e77369daa2ea24bd78e..5c3cc9f484ecbaf615cf44469e3c37b636a4f7e6 100644 --- a/components/common/cpp/include/ftl/cuda_common.hpp +++ b/components/common/cpp/include/ftl/cuda_common.hpp @@ -9,6 +9,9 @@ #include <ftl/cuda_util.hpp> #include <opencv2/core/cuda.hpp> #include <opencv2/core/cuda/common.hpp> +#include <ftl/cuda_half.hpp> +#include <ftl/cuda_texture.hpp> +#include <ftl/cuda_buffer.hpp> #ifndef __CUDACC__ #include <exception> @@ -33,355 +36,6 @@ void setDevice(int); void setDevice(); -template <typename T> -struct Float; - -template <> struct Float<float> { typedef float type; }; -template <> struct Float<int> { typedef float type; }; -template <> struct Float<float4> { typedef float4 type; }; -template <> struct Float<uchar4> { typedef float4 type; }; -template <> struct Float<uint8_t> { typedef float type; }; -template <> struct Float<short2> { typedef float2 type; }; -template <> struct Float<short> { typedef float type; }; - -template <typename T> -struct ScaleValue; - -template <> struct ScaleValue<uchar4> { static constexpr float value = 255.0f; }; -template <> struct ScaleValue<uint8_t> { static constexpr float value = 255.0f; }; -template <> struct ScaleValue<float> { static constexpr float value = 1.0f; }; -template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; }; -template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; }; - -/** - * Represent a CUDA texture object. Instances of this class can be used on both - * host and device. A texture object base cannot be constructed directly, it - * must be constructed via a template TextureObject class. - */ -class TextureObjectBase { - public: - __host__ __device__ TextureObjectBase() - : texobj_(0), pitch_(0), pitch2_(0), width_(0), height_(0), - ptr_(nullptr), needsfree_(false), needsdestroy_(false), - cvType_(-1) {}; - ~TextureObjectBase(); - - // Remove ability to copy object directly, instead must use - // templated derivative TextureObject. - TextureObjectBase(const TextureObjectBase &)=delete; - TextureObjectBase &operator=(const TextureObjectBase &)=delete; - - TextureObjectBase(TextureObjectBase &&); - TextureObjectBase &operator=(TextureObjectBase &&); - - inline size_t pitch() const { return pitch_; } - inline size_t pixelPitch() const { return pitch2_; } - inline uchar *devicePtr() const { return ptr_; }; - __host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; } - __host__ __device__ inline int width() const { return width_; } - __host__ __device__ inline int height() const { return height_; } - __host__ __device__ inline cudaTextureObject_t cudaTexture() const { return texobj_; } - - void upload(const cv::Mat &, cudaStream_t stream=0); - void download(cv::Mat &, cudaStream_t stream=0) const; - - __host__ void free(); - - inline int cvType() const { return cvType_; } - - protected: - cudaTextureObject_t texobj_; - size_t pitch_; - size_t pitch2_; // in T units - int width_; - int height_; - uchar *ptr_; // Device memory pointer - bool needsfree_; // We manage memory, so free it - bool needsdestroy_; // The texture object needs to be destroyed - int cvType_; // Used to validate casting -}; - -/** - * Create and manage CUDA texture objects with a particular pixel data type. - * Note: it is not possible to create texture objects for certain types, - * specificially for 3 channel types. - */ -template <typename T> -class TextureObject : public TextureObjectBase { - public: - typedef T type; - - static_assert((16u % sizeof(T)) == 0, "Channel format must be aligned with 16 bytes"); - - __host__ __device__ TextureObject() : TextureObjectBase() {}; - explicit TextureObject(const cv::cuda::GpuMat &d, bool interpolated=false); - explicit TextureObject(const cv::cuda::PtrStepSz<T> &d); - TextureObject(T *ptr, int pitch, int width, int height); - TextureObject(size_t width, size_t height); - TextureObject(const TextureObject<T> &t); - __host__ __device__ TextureObject(TextureObject<T> &&); - ~TextureObject(); - - TextureObject<T> &operator=(const TextureObject<T> &); - __host__ __device__ TextureObject<T> &operator=(TextureObject<T> &&); - - operator cv::cuda::GpuMat(); - - void create(const cv::Size &); - void create(int w, int h); - - __host__ __device__ T *devicePtr() const { return (T*)(ptr_); }; - __host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; } - - #ifdef __CUDACC__ - __device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); } - __device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); } - __device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; } - #endif - - __host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } - __host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } - - /** - * Cast a base texture object to this type of texture object. If the - * underlying pixel types do not match then a bad_cast exception is thrown. - */ - static TextureObject<T> &cast(TextureObjectBase &); -}; - -#ifndef __CUDACC__ -template <typename T> -TextureObject<T> &TextureObject<T>::cast(TextureObjectBase &b) { - if (b.cvType() != ftl::traits::OpenCVType<T>::value) { - //LOG(ERROR) << "Bad cast of texture object"; - throw std::bad_cast(); - } - return reinterpret_cast<TextureObject<T>&>(b); -} - -/** - * Create a 2D array texture from an OpenCV GpuMat object. - */ -template <typename T> -TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) { - // GpuMat must have correct data type - //CHECK(d.type() == ftl::traits::OpenCVType<T>::value); - - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = d.data; - resDesc.res.pitch2D.pitchInBytes = d.step; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = d.cols; - resDesc.res.pitch2D.height = d.rows; - - cudaTextureDesc texDesc; - // cppcheck-suppress memsetClassFloat - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = (interpolated) ? cudaReadModeNormalizedFloat : cudaReadModeElementType; - if (interpolated) texDesc.filterMode = cudaFilterModeLinear; - - cudaTextureObject_t tex = 0; - cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); - texobj_ = tex; - pitch_ = d.step; - pitch2_ = pitch_ / sizeof(T); - ptr_ = d.data; - width_ = d.cols; - height_ = d.rows; - needsfree_ = false; - cvType_ = ftl::traits::OpenCVType<T>::value; - needsdestroy_ = true; -} - -#endif // __CUDACC__ - -/** - * Create a 2D array texture from an OpenCV GpuMat object. - */ -template <typename T> -TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = d.data; - resDesc.res.pitch2D.pitchInBytes = d.step; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = d.cols; - resDesc.res.pitch2D.height = d.rows; - - cudaTextureDesc texDesc; - // cppcheck-suppress memsetClassFloat - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; - - cudaTextureObject_t tex = 0; - cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); - texobj_ = tex; - pitch_ = d.step; - pitch2_ = pitch_ / sizeof(T); - ptr_ = d.data; - width_ = d.cols; - height_ = d.rows; - needsfree_ = false; - cvType_ = ftl::traits::OpenCVType<T>::value; - needsdestroy_ = true; -} - -/** - * Create a 2D array texture object using a cudaMallocPitch device pointer. - * The texture object returned must be destroyed by the caller. - */ -template <typename T> -TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = ptr; - resDesc.res.pitch2D.pitchInBytes = pitch; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = width; - resDesc.res.pitch2D.height = height; - - cudaTextureDesc texDesc; - // cppcheck-suppress memsetClassFloat - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; - - cudaTextureObject_t tex = 0; - cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); - texobj_ = tex; - pitch_ = pitch; - pitch2_ = pitch_ / sizeof(T); - ptr_ = ptr; - width_ = width; - height_ = height; - needsfree_ = false; - cvType_ = ftl::traits::OpenCVType<T>::value; - needsdestroy_ = true; -} - -template <typename T> -TextureObject<T>::TextureObject(size_t width, size_t height) { - cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height); - cudaTextureObject_t tex = 0; - - // Must be an even - //if (!(sizeof(T) & 0x1)) { - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = ptr_; - resDesc.res.pitch2D.pitchInBytes = pitch_; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); - resDesc.res.pitch2D.width = width; - resDesc.res.pitch2D.height = height; - - cudaTextureDesc texDesc; - // cppcheck-suppress memsetClassFloat - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.readMode = cudaReadModeElementType; - //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; - cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); - //} - - texobj_ = tex; - width_ = (int)width; - height_ = (int)height; - needsfree_ = true; - pitch2_ = pitch_ / sizeof(T); - cvType_ = ftl::traits::OpenCVType<T>::value; - needsdestroy_ = true; -} - -#ifndef __CUDACC__ -template <typename T> -void TextureObject<T>::create(const cv::Size &s) { - create(s.width, s.height); -} - -template <typename T> -void TextureObject<T>::create(int w, int h) { - if (width_ != w || height_ != h) { - *this = std::move(TextureObject<T>(w, h)); - } -} -#endif - -template <typename T> -TextureObject<T>::TextureObject(const TextureObject<T> &p) { - texobj_ = p.texobj_; - ptr_ = p.ptr_; - width_ = p.width_; - height_ = p.height_; - pitch_ = p.pitch_; - pitch2_ = pitch_ / sizeof(T); - cvType_ = ftl::traits::OpenCVType<T>::value; - needsfree_ = false; - needsdestroy_ = false; -} - -template <typename T> -TextureObject<T>::TextureObject(TextureObject<T> &&p) { - texobj_ = p.texobj_; - ptr_ = p.ptr_; - width_ = p.width_; - height_ = p.height_; - pitch_ = p.pitch_; - pitch2_ = pitch_ / sizeof(T); - needsfree_ = p.needsfree_; - needsdestroy_ = p.needsdestroy_; - p.texobj_ = 0; - p.needsfree_ = false; - p.needsdestroy_ = false; - p.ptr_ = nullptr; - cvType_ = ftl::traits::OpenCVType<T>::value; -} - -template <typename T> -TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) { - free(); - texobj_ = p.texobj_; - ptr_ = p.ptr_; - width_ = p.width_; - height_ = p.height_; - pitch_ = p.pitch_; - pitch2_ = pitch_ / sizeof(T); - cvType_ = ftl::traits::OpenCVType<T>::value; - needsfree_ = false; - needsdestroy_ = false; - return *this; -} - -template <typename T> -TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) { - free(); - texobj_ = p.texobj_; - ptr_ = p.ptr_; - width_ = p.width_; - height_ = p.height_; - pitch_ = p.pitch_; - pitch2_ = pitch_ / sizeof(T); - needsfree_ = p.needsfree_; - needsdestroy_ = p.needsdestroy_; - p.texobj_ = 0; - p.needsfree_ = false; - p.needsdestroy_ = false; - p.ptr_ = nullptr; - cvType_ = ftl::traits::OpenCVType<T>::value; - return *this; -} - -template <typename T> -TextureObject<T>::~TextureObject() { - //if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) ); - //if (needsfree_) cudaFree(ptr_); - free(); -} - /** * Read a texture value using coordinates in the range of `b`, but from the * texture `a` which may have a different resolution. diff --git a/components/common/cpp/include/ftl/cuda_half.hpp b/components/common/cpp/include/ftl/cuda_half.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f02b2d5ba561ef928cab165554d923cdd1f990eb --- /dev/null +++ b/components/common/cpp/include/ftl/cuda_half.hpp @@ -0,0 +1,110 @@ +#ifndef _FTL_CUDA_HALF_HPP_ +#define _FTL_CUDA_HALF_HPP_ + +#include <cuda_fp16.h> + +struct half4 { + half2 a; + half2 b; +}; + +static_assert(sizeof(half4) == 8, "Incorrect half4 size"); + +template <> +__host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half4>() { + return cudaCreateChannelDesc<short4>(); +} + +#ifdef __CUDACC__ + +// half4 functions +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ half4 make_half4(half x, half y, half z, half w) { + half4 h; + h.a.x = x; + h.a.y = y; + h.b.x = z; + h.b.y = w; + return h; +} + +inline __device__ half4 make_half4(half2 a, half2 b) { + half4 h; + h.a = a; + h.b = b; + return h; +} + +inline __device__ half4 make_half4(float x, float y, float z, float w) { + half4 h; + h.a = __floats2half2_rn(x,y); + h.b = __floats2half2_rn(z,w); + return h; +} + +inline __device__ half4 make_half4(const float4 &v) { + half4 h; + h.a = __floats2half2_rn(v.x,v.y); + h.b = __floats2half2_rn(v.z,v.w); + return h; +} + +inline __device__ half4 make_half4(const float3 &v, float a) { + half4 h; + h.a = __floats2half2_rn(v.x,v.y); + h.b = __floats2half2_rn(v.z,a); + return h; +} + +inline __device__ half4 make_half4(float a) +{ + return make_half4(a,a,a,a); +} + +inline __device__ float4 make_float4(const half4 &v) +{ + union { + struct { + float2 a; + float2 b; + } x; + float4 f; + }; + x.a = __half22float2(v.a); + x.b = __half22float2(v.b); + return f; +} + +inline __device__ float3 make_float3(const half4 &v) +{ + union { + struct { + float2 a; + float b; + } x; + float3 f; + }; + x.a = __half22float2(v.a); + x.b = __half2float(v.b.x); + return f; +} + + + +// === Texture overloads ======================================================= + +template <> +__device__ inline half4 tex2D<half4>(cudaTextureObject_t o, float x, float y) { + union U { + __device__ inline U(const short4 &s) : t(s) {} + short4 t; + half4 h; + }; + return U(tex2D<short4>(o, x, y)).h; +} + +#endif + + +#endif \ No newline at end of file diff --git a/components/common/cpp/include/ftl/cuda_texture.hpp b/components/common/cpp/include/ftl/cuda_texture.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e103b049750b0a39087fcd4dbd21100ddb7815b5 --- /dev/null +++ b/components/common/cpp/include/ftl/cuda_texture.hpp @@ -0,0 +1,361 @@ +#ifndef _FTL_CUDA_TEXTURE_HPP_ +#define _FTL_CUDA_TEXTURE_HPP_ + +namespace ftl { +namespace cuda { + +template <typename T> +struct Float; + +template <> struct Float<float> { typedef float type; }; +template <> struct Float<int> { typedef float type; }; +template <> struct Float<float4> { typedef float4 type; }; +template <> struct Float<half4> { typedef float4 type; }; +template <> struct Float<uchar4> { typedef float4 type; }; +template <> struct Float<uint8_t> { typedef float type; }; +template <> struct Float<short2> { typedef float2 type; }; +template <> struct Float<short> { typedef float type; }; + +template <typename T> +struct ScaleValue; + +template <> struct ScaleValue<uchar4> { static constexpr float value = 255.0f; }; +template <> struct ScaleValue<uint8_t> { static constexpr float value = 255.0f; }; +template <> struct ScaleValue<float> { static constexpr float value = 1.0f; }; +template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; }; +template <> struct ScaleValue<half4> { static constexpr float value = 1.0f; }; +template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; }; + +/** + * Represent a CUDA texture object. Instances of this class can be used on both + * host and device. A texture object base cannot be constructed directly, it + * must be constructed via a template TextureObject class. + */ +class TextureObjectBase { + public: + __host__ __device__ TextureObjectBase() + : texobj_(0), pitch_(0), pitch2_(0), width_(0), height_(0), + ptr_(nullptr), needsfree_(false), needsdestroy_(false), + cvType_(-1) {}; + ~TextureObjectBase(); + + // Remove ability to copy object directly, instead must use + // templated derivative TextureObject. + TextureObjectBase(const TextureObjectBase &)=delete; + TextureObjectBase &operator=(const TextureObjectBase &)=delete; + + TextureObjectBase(TextureObjectBase &&); + TextureObjectBase &operator=(TextureObjectBase &&); + + inline size_t pitch() const { return pitch_; } + inline size_t pixelPitch() const { return pitch2_; } + inline uchar *devicePtr() const { return ptr_; }; + __host__ __device__ inline uchar *devicePtr(int v) const { return &ptr_[v*pitch_]; } + __host__ __device__ inline int width() const { return width_; } + __host__ __device__ inline int height() const { return height_; } + __host__ __device__ inline cudaTextureObject_t cudaTexture() const { return texobj_; } + + void upload(const cv::Mat &, cudaStream_t stream=0); + void download(cv::Mat &, cudaStream_t stream=0) const; + + __host__ void free(); + + inline int cvType() const { return cvType_; } + + protected: + cudaTextureObject_t texobj_; + size_t pitch_; + size_t pitch2_; // in T units + int width_; + int height_; + uchar *ptr_; // Device memory pointer + bool needsfree_; // We manage memory, so free it + bool needsdestroy_; // The texture object needs to be destroyed + int cvType_; // Used to validate casting +}; + +/** + * Create and manage CUDA texture objects with a particular pixel data type. + * Note: it is not possible to create texture objects for certain types, + * specificially for 3 channel types. + */ +template <typename T> +class TextureObject : public TextureObjectBase { + public: + typedef T type; + + static_assert((16u % sizeof(T)) == 0, "Channel format must be aligned with 16 bytes"); + + __host__ __device__ TextureObject() : TextureObjectBase() {}; + explicit TextureObject(const cv::cuda::GpuMat &d, bool interpolated=false); + explicit TextureObject(const cv::cuda::PtrStepSz<T> &d); + TextureObject(T *ptr, int pitch, int width, int height); + TextureObject(size_t width, size_t height); + TextureObject(const TextureObject<T> &t); + __host__ __device__ TextureObject(TextureObject<T> &&); + ~TextureObject(); + + TextureObject<T> &operator=(const TextureObject<T> &); + __host__ __device__ TextureObject<T> &operator=(TextureObject<T> &&); + + operator cv::cuda::GpuMat(); + + void create(const cv::Size &); + void create(int w, int h); + + __host__ __device__ T *devicePtr() const { return (T*)(ptr_); }; + __host__ __device__ T *devicePtr(int v) const { return &(T*)(ptr_)[v*pitch2_]; } + + #ifdef __CUDACC__ + __device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); } + __device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); } + __device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; } + #endif + + __host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + __host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + + /** + * Cast a base texture object to this type of texture object. If the + * underlying pixel types do not match then a bad_cast exception is thrown. + */ + static TextureObject<T> &cast(TextureObjectBase &); +}; + +#ifndef __CUDACC__ +template <typename T> +TextureObject<T> &TextureObject<T>::cast(TextureObjectBase &b) { + if (b.cvType() != ftl::traits::OpenCVType<T>::value) { + //LOG(ERROR) << "Bad cast of texture object"; + throw std::bad_cast(); + } + return reinterpret_cast<TextureObject<T>&>(b); +} + +/** + * Create a 2D array texture from an OpenCV GpuMat object. + */ +template <typename T> +TextureObject<T>::TextureObject(const cv::cuda::GpuMat &d, bool interpolated) { + // GpuMat must have correct data type + //CHECK(d.type() == ftl::traits::OpenCVType<T>::value); + + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = d.data; + resDesc.res.pitch2D.pitchInBytes = d.step; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = d.cols; + resDesc.res.pitch2D.height = d.rows; + + cudaTextureDesc texDesc; + // cppcheck-suppress memsetClassFloat + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = (interpolated) ? cudaReadModeNormalizedFloat : cudaReadModeElementType; + if (interpolated) texDesc.filterMode = cudaFilterModeLinear; + + cudaTextureObject_t tex = 0; + cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + texobj_ = tex; + pitch_ = d.step; + pitch2_ = pitch_ / sizeof(T); + ptr_ = d.data; + width_ = d.cols; + height_ = d.rows; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; + needsdestroy_ = true; +} + +#endif // __CUDACC__ + +/** + * Create a 2D array texture from an OpenCV GpuMat object. + */ +template <typename T> +TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = d.data; + resDesc.res.pitch2D.pitchInBytes = d.step; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = d.cols; + resDesc.res.pitch2D.height = d.rows; + + cudaTextureDesc texDesc; + // cppcheck-suppress memsetClassFloat + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; + + cudaTextureObject_t tex = 0; + cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + texobj_ = tex; + pitch_ = d.step; + pitch2_ = pitch_ / sizeof(T); + ptr_ = d.data; + width_ = d.cols; + height_ = d.rows; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; + needsdestroy_ = true; +} + +/** + * Create a 2D array texture object using a cudaMallocPitch device pointer. + * The texture object returned must be destroyed by the caller. + */ +template <typename T> +TextureObject<T>::TextureObject(T *ptr, int pitch, int width, int height) { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = ptr; + resDesc.res.pitch2D.pitchInBytes = pitch; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = width; + resDesc.res.pitch2D.height = height; + + cudaTextureDesc texDesc; + // cppcheck-suppress memsetClassFloat + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; + + cudaTextureObject_t tex = 0; + cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + texobj_ = tex; + pitch_ = pitch; + pitch2_ = pitch_ / sizeof(T); + ptr_ = ptr; + width_ = width; + height_ = height; + needsfree_ = false; + cvType_ = ftl::traits::OpenCVType<T>::value; + needsdestroy_ = true; +} + +template <typename T> +TextureObject<T>::TextureObject(size_t width, size_t height) { + cudaMallocPitch((void**)&ptr_,&pitch_,width*sizeof(T),height); + cudaTextureObject_t tex = 0; + + // Must be an even + //if (!(sizeof(T) & 0x1)) { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = ptr_; + resDesc.res.pitch2D.pitchInBytes = pitch_; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>(); + resDesc.res.pitch2D.width = width; + resDesc.res.pitch2D.height = height; + + cudaTextureDesc texDesc; + // cppcheck-suppress memsetClassFloat + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + //if (std::is_same<T,uchar4>::value) texDesc.filterMode = cudaFilterModeLinear; + cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + //} + + texobj_ = tex; + width_ = (int)width; + height_ = (int)height; + needsfree_ = true; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; + needsdestroy_ = true; +} + +#ifndef __CUDACC__ +template <typename T> +void TextureObject<T>::create(const cv::Size &s) { + create(s.width, s.height); +} + +template <typename T> +void TextureObject<T>::create(int w, int h) { + if (width_ != w || height_ != h) { + *this = std::move(TextureObject<T>(w, h)); + } +} +#endif + +template <typename T> +TextureObject<T>::TextureObject(const TextureObject<T> &p) { + texobj_ = p.texobj_; + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; + needsfree_ = false; + needsdestroy_ = false; +} + +template <typename T> +TextureObject<T>::TextureObject(TextureObject<T> &&p) { + texobj_ = p.texobj_; + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + needsfree_ = p.needsfree_; + needsdestroy_ = p.needsdestroy_; + p.texobj_ = 0; + p.needsfree_ = false; + p.needsdestroy_ = false; + p.ptr_ = nullptr; + cvType_ = ftl::traits::OpenCVType<T>::value; +} + +template <typename T> +TextureObject<T> &TextureObject<T>::operator=(const TextureObject<T> &p) { + free(); + texobj_ = p.texobj_; + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + cvType_ = ftl::traits::OpenCVType<T>::value; + needsfree_ = false; + needsdestroy_ = false; + return *this; +} + +template <typename T> +TextureObject<T> &TextureObject<T>::operator=(TextureObject<T> &&p) { + free(); + texobj_ = p.texobj_; + ptr_ = p.ptr_; + width_ = p.width_; + height_ = p.height_; + pitch_ = p.pitch_; + pitch2_ = pitch_ / sizeof(T); + needsfree_ = p.needsfree_; + needsdestroy_ = p.needsdestroy_; + p.texobj_ = 0; + p.needsfree_ = false; + p.needsdestroy_ = false; + p.ptr_ = nullptr; + cvType_ = ftl::traits::OpenCVType<T>::value; + return *this; +} + +template <typename T> +TextureObject<T>::~TextureObject() { + //if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) ); + //if (needsfree_) cudaFree(ptr_); + free(); +} + +} +} + +#endif diff --git a/components/common/cpp/include/ftl/traits.hpp b/components/common/cpp/include/ftl/traits.hpp index 71668359d1b0bfa036c29fb9f6d4f73ee0d2d5a7..8abf07fc01c06e2962721aff95d2a656c136050e 100644 --- a/components/common/cpp/include/ftl/traits.hpp +++ b/components/common/cpp/include/ftl/traits.hpp @@ -3,6 +3,7 @@ #include <opencv2/core.hpp> #include <ftl/cuda_util.hpp> +#include <ftl/cuda_half.hpp> namespace ftl { namespace traits { @@ -37,6 +38,7 @@ template <> struct OpenCVType<float> { static constexpr int value = CV_32FC1; }; template <> struct OpenCVType<float2> { static constexpr int value = CV_32FC2; }; template <> struct OpenCVType<float3> { static constexpr int value = CV_32FC3; }; template <> struct OpenCVType<float4> { static constexpr int value = CV_32FC4; }; +template <> struct OpenCVType<half4> { static constexpr int value = CV_16FC4; }; } } diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp index 1fb7b68e5e0a9c3d07250d40dc65c42e4d2a2de5..779b9feb4b5fdecc81f25e30f81a423b1833bb38 100644 --- a/components/common/cpp/src/cuda_common.cpp +++ b/components/common/cpp/src/cuda_common.cpp @@ -3,6 +3,7 @@ #include <ftl/cuda_common.hpp> using ftl::cuda::TextureObjectBase; +using ftl::cuda::BufferBase; static int dev_to_use = 0; static int dev_count = 0; @@ -116,58 +117,54 @@ void TextureObjectBase::download(cv::Mat &m, cudaStream_t stream) const { cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * m.elemSize(), m.rows, cudaMemcpyDeviceToHost, stream)); } -/*template <> -void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream) { - cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar4), m.rows, cudaMemcpyHostToDevice, stream)); -} -template <> -void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream) { - cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float), m.rows, cudaMemcpyHostToDevice, stream)); -} -template <> -void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream) { - cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float2), m.rows, cudaMemcpyHostToDevice, stream)); +BufferBase::~BufferBase() { + free(); } -template <> -void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream) { - cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float4), m.rows, cudaMemcpyHostToDevice, stream)); -} +BufferBase::BufferBase(BufferBase &&o) { + needsfree_ = o.needsfree_; + ptr_ = o.ptr_; + cvType_ = o.cvType_; + width_ = o.width_; + height_ = o.height_; + pitch_ = o.pitch_; + pitch2_ = o.pitch2_; -template <> -void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream) { - cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar), m.rows, cudaMemcpyHostToDevice, stream)); + o.ptr_ = nullptr; + o.needsfree_ = false; } +BufferBase &BufferBase::operator=(BufferBase &&o) { + free(); -template <> -void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const { - m.create(height(), width(), CV_8UC4); - cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar4), m.rows, cudaMemcpyDeviceToHost, stream)); -} + needsfree_ = o.needsfree_; + ptr_ = o.ptr_; + cvType_ = o.cvType_; + width_ = o.width_; + height_ = o.height_; + pitch_ = o.pitch_; + pitch2_ = o.pitch2_; -template <> -void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const { - m.create(height(), width(), CV_32FC1); - cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float), m.rows, cudaMemcpyDeviceToHost, stream)); + o.ptr_ = nullptr; + o.needsfree_ = false; + return *this; } -template <> -void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const { - m.create(height(), width(), CV_32FC2); - cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float2), m.rows, cudaMemcpyDeviceToHost, stream)); +void BufferBase::free() { + if (needsfree_) { + if (ptr_) cudaFree(ptr_); + ptr_ = nullptr; + cvType_ = -1; + } } -template <> -void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const { - m.create(height(), width(), CV_32FC4); - cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float4), m.rows, cudaMemcpyDeviceToHost, stream)); +void BufferBase::upload(const cv::Mat &m, cudaStream_t stream) { + cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * m.elemSize(), m.rows, cudaMemcpyHostToDevice, stream)); } -template <> -void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const { - m.create(height(), width(), CV_8UC1); - cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar), m.rows, cudaMemcpyDeviceToHost, stream)); -}*/ +void BufferBase::download(cv::Mat &m, cudaStream_t stream) const { + m.create(height(), width(), cvType_); + cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * m.elemSize(), m.rows, cudaMemcpyDeviceToHost, stream)); +} \ No newline at end of file diff --git a/components/operators/include/ftl/operators/mvmls.hpp b/components/operators/include/ftl/operators/mvmls.hpp index 1c250e98574d1bac9182aa950ec7fbda594c5b2e..824d88e3d8888699f931eb018b15a4741aa00db3 100644 --- a/components/operators/include/ftl/operators/mvmls.hpp +++ b/components/operators/include/ftl/operators/mvmls.hpp @@ -18,7 +18,7 @@ class MultiViewMLS : public ftl::operators::Operator { private: std::vector<ftl::cuda::TextureObject<float4>*> centroid_horiz_; std::vector<ftl::cuda::TextureObject<float4>*> centroid_vert_; - std::vector<ftl::cuda::TextureObject<float4>*> normals_horiz_; + std::vector<ftl::cuda::TextureObject<half4>*> normals_horiz_; std::vector<ftl::cuda::TextureObject<float>*> contributions_; }; diff --git a/components/operators/include/ftl/operators/normals.hpp b/components/operators/include/ftl/operators/normals.hpp index 494a0d10d5ab63ed47902b0b7311a71cd01d4a87..a5faaa17645612bce3ab9601638f92058fd4ba57 100644 --- a/components/operators/include/ftl/operators/normals.hpp +++ b/components/operators/include/ftl/operators/normals.hpp @@ -52,7 +52,7 @@ class SmoothNormals : public ftl::operators::Operator { bool apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t stream) override; private: - ftl::cuda::TextureObject<float4> temp_; + ftl::cuda::TextureObject<half4> temp_; }; diff --git a/components/operators/include/ftl/operators/smoothing.hpp b/components/operators/include/ftl/operators/smoothing.hpp index bb99865954f4553fab1b7e6c862bd4828dcd5b35..0dc463d2eb4c22563e1e83d032f3d56474d095cb 100644 --- a/components/operators/include/ftl/operators/smoothing.hpp +++ b/components/operators/include/ftl/operators/smoothing.hpp @@ -123,7 +123,7 @@ class AggreMLS : public ftl::operators::Operator { private: ftl::cuda::TextureObject<float4> centroid_horiz_; ftl::cuda::TextureObject<float4> centroid_vert_; - ftl::cuda::TextureObject<float4> normals_horiz_; + ftl::cuda::TextureObject<half4> normals_horiz_; ftl::rgbd::Frame temp_; }; diff --git a/components/operators/src/correspondence.cu b/components/operators/src/correspondence.cu index edc333c759221e43247af69748e918aa24f0657a..0b5ea46075627051b86ac25e1617258be22335a9 100644 --- a/components/operators/src/correspondence.cu +++ b/components/operators/src/correspondence.cu @@ -346,8 +346,8 @@ __device__ inline int unpackCameraID(short2 p) { * Identify which source has the best support region for a given pixel. */ __global__ void best_sources_kernel( - TextureObject<float4> normals1, - TextureObject<float4> normals2, + TextureObject<half4> normals1, + TextureObject<half4> normals2, TextureObject<uchar4> support1, TextureObject<uchar4> support2, TextureObject<float> depth1, @@ -414,8 +414,8 @@ __global__ void best_sources_kernel( } void ftl::cuda::best_sources( - ftl::cuda::TextureObject<float4> &normals1, - ftl::cuda::TextureObject<float4> &normals2, + ftl::cuda::TextureObject<half4> &normals1, + ftl::cuda::TextureObject<half4> &normals2, ftl::cuda::TextureObject<uchar4> &support1, ftl::cuda::TextureObject<uchar4> &support2, ftl::cuda::TextureObject<float> &depth1, @@ -439,8 +439,8 @@ void ftl::cuda::best_sources( * Identify which source has the best support region for a given pixel. */ __global__ void aggregate_sources_kernel( - TextureObject<float4> n1, - TextureObject<float4> n2, + TextureObject<half4> n1, + TextureObject<half4> n2, TextureObject<float4> c1, TextureObject<float4> c2, TextureObject<float> depth1, @@ -475,7 +475,7 @@ void ftl::cuda::best_sources( if (cent2.x+cent2.y+cent2.z > 0.0f && norm2.x+norm2.y+norm2.z > 0.0f && length(cent2-cent1) < 0.04f) { norm1 += norm2; norm1 /= 2.0f; - n1(x,y) = make_float4(norm1, 0.0f); + n1(x,y) = make_half4(norm1, 0.0f); cent1 += cent2; cent1 /= 2.0f; c1(x,y) = make_float4(cent1, 0.0f); @@ -490,8 +490,8 @@ void ftl::cuda::best_sources( } void ftl::cuda::aggregate_sources( - ftl::cuda::TextureObject<float4> &n1, - ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<half4> &n1, + ftl::cuda::TextureObject<half4> &n2, ftl::cuda::TextureObject<float4> &c1, ftl::cuda::TextureObject<float4> &c2, ftl::cuda::TextureObject<float> &depth1, @@ -608,7 +608,7 @@ void ftl::cuda::vis_best_sources( // ==== Normalise aggregations ================================================= __global__ void normalise_aggregations_kernel( - TextureObject<float4> norms, + TextureObject<half4> norms, TextureObject<float4> cents, TextureObject<float> contribs) { @@ -618,20 +618,20 @@ __global__ void normalise_aggregations_kernel( if (x < norms.width() && y < norms.height()) { const float contrib = contribs.tex2D((int)x,(int)y); - const auto a = norms.tex2D((int)x,(int)y); + const auto a = make_float3(norms.tex2D((int)x,(int)y)); const auto b = cents.tex2D(x,y); //const float4 normal = normals.tex2D((int)x,(int)y); //out(x,y) = (contrib == 0.0f) ? make<B>(a) : make<B>(a / contrib); if (contrib > 0.0f) { - norms(x,y) = a / (contrib+1.0f); + norms(x,y) = make_half4(a / (contrib+1.0f), 1.0f); cents(x,y) = b / (contrib+1.0f); } } } -void ftl::cuda::normalise_aggregations(TextureObject<float4> &norms, TextureObject<float4> ¢s, TextureObject<float> &contribs, cudaStream_t stream) { +void ftl::cuda::normalise_aggregations(TextureObject<half4> &norms, TextureObject<float4> ¢s, TextureObject<float> &contribs, cudaStream_t stream) { const dim3 gridSize((norms.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (norms.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); diff --git a/components/operators/src/mls.cu b/components/operators/src/mls.cu index 5d25fc6c0e8648318a242ff08fcec26648086f07..afe15d5592bc396da1abc5703c9f8c0a96b00d49 100644 --- a/components/operators/src/mls.cu +++ b/components/operators/src/mls.cu @@ -14,8 +14,8 @@ using ftl::cuda::TextureObject; */ template <int SEARCH_RADIUS> __global__ void mls_smooth_kernel( - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, + TextureObject<half4> normals_in, + TextureObject<half4> normals_out, TextureObject<float> depth_in, // Virtual depth map TextureObject<float> depth_out, // Accumulated output float smoothing, @@ -69,12 +69,12 @@ using ftl::cuda::TextureObject; // depth_out(screen.x,screen.y) = X.z; //} depth_out(x,y) = X.z; - normals_out(x,y) = make_float4(nX / length(nX), 0.0f); + normals_out(x,y) = make_half4(nX / length(nX), 0.0f); } void ftl::cuda::mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, float smoothing, @@ -112,8 +112,8 @@ void ftl::cuda::mls_smooth( */ template <int SEARCH_RADIUS> __global__ void colour_mls_smooth_kernel( - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, + TextureObject<half4> normals_in, + TextureObject<half4> normals_out, TextureObject<float> depth_in, // Virtual depth map TextureObject<float> depth_out, // Accumulated output TextureObject<uchar4> colour_in, @@ -179,12 +179,12 @@ void ftl::cuda::mls_smooth( // depth_out(screen.x,screen.y) = X.z; //} depth_out(x,y) = X.z; - normals_out(x,y) = make_float4(nX / length(nX), 0.0f); + normals_out(x,y) = make_half4(nX / length(nX), 0.0f); } void ftl::cuda::colour_mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -233,8 +233,8 @@ __device__ inline int segmentID(int u, int v) { template <bool FILLING, int RADIUS> __global__ void colour_mls_smooth_csr_kernel( TextureObject<uchar4> region, - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, + TextureObject<half4> normals_in, + TextureObject<half4> normals_out, TextureObject<float> depth_in, // Virtual depth map TextureObject<float> depth_out, // Accumulated output TextureObject<uchar4> colour_in, @@ -325,14 +325,14 @@ __device__ inline int segmentID(int u, int v) { // depth_out(screen.x,screen.y) = X.z; //} depth_out(x,y) = X.z; - normals_out(x,y) = make_float4(nX / length(nX), 0.0f); + normals_out(x,y) = make_half4(nX / length(nX), 0.0f); } } void ftl::cuda::colour_mls_smooth_csr( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -362,6 +362,24 @@ void ftl::cuda::colour_mls_smooth_csr( // ===== Cross Aggregate MLS =================================================== +__device__ inline float colourAsFloat(uchar4 c) { + union { + uchar4 col; + float f; + }; + col = c; + return f; +} + +__device__ inline uchar4 floatAsColour(float pf) { + union { + uchar4 col; + float f; + }; + f = pf; + return col; +} + /* * Smooth depth map using Moving Least Squares. This version uses colour * similarity weights to adjust the spatial smoothing factor. It also uses @@ -371,12 +389,16 @@ void ftl::cuda::colour_mls_smooth_csr( */ template <int RADIUS> __global__ void mls_aggr_horiz_kernel( - TextureObject<uchar4> region, - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, - TextureObject<float> depth_in, // Virtual depth map + const uchar4* __restrict__ region, + size_t region_pitch, + const half4* __restrict__ normals_in, + size_t normals_in_pitch, + TextureObject<half4> normals_out, + const float* __restrict__ depth_in, // Virtual depth map + size_t depth_in_pitch, TextureObject<float4> centroid_out, // Accumulated output - TextureObject<uchar4> colour_in, + const uchar4* __restrict__ colour_in, + size_t colour_in_pitch, float smoothing, float colour_smoothing, ftl::rgbd::Camera camera) { @@ -384,59 +406,65 @@ void ftl::cuda::colour_mls_smooth_csr( const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return; + if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) { - float3 aX = make_float3(0.0f,0.0f,0.0f); - float3 nX = make_float3(0.0f,0.0f,0.0f); - float contrib = 0.0f; + float3 aX = make_float3(0.0f,0.0f,0.0f); + float3 nX = make_float3(0.0f,0.0f,0.0f); + float contrib = 0.0f; - float d0 = depth_in.tex2D(x, y); + const float d0 = depth_in[y*depth_in_pitch+x]; + const uchar4 c0 = colour_in[y*colour_in_pitch+x]; - // Note: x and y flipped as output is rotated. - centroid_out(y,x) = make_float4(0.0f); - normals_out(y,x) = normals_in(x,y); + // Note: x and y flipped as output is rotated. + centroid_out(y,x) = make_float4(0.0f,0.0f,0.0f,colourAsFloat(c0)); + normals_out(y,x) = normals_in[y*normals_in_pitch+x]; - if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return; - - float3 X = camera.screenToCam((int)(x),(int)(y),d0); - float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f); + if (d0 > camera.minDepth && d0 < camera.maxDepth) { + + float3 X = camera.screenToCam((int)(x),(int)(y),d0); - // Cross-Support Neighbourhood - uchar4 base = region.tex2D(x,y); + // Cross-Support Neighbourhood + const uchar4 base = region[y*region_pitch+x]; - #pragma unroll - for (int u=-RADIUS; u<=RADIUS; ++u) { - const float d = depth_in.tex2D(x+u, y); + const half4* __restrict__ nptr = normals_in + y*normals_in_pitch + x - RADIUS; + const float* __restrict__ dptr = depth_in + y*depth_in_pitch + x - RADIUS; + const uchar4* __restrict__ cptr = colour_in + y*colour_in_pitch + x - RADIUS; - // If outside of cross support range, set weight to 0 to ignore - float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f; + #pragma unroll + for (int u=-RADIUS; u<=RADIUS; ++u) { + const float d = *(dptr++); - // Point and normal of neighbour - const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y),d); - const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y))); + // If outside of cross support range, set weight to 0 to ignore + float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f; - // Bad or missing normals should be ignored - if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f; + // Point and normal of neighbour + const float3 Xi = camera.screenToCam(x+u,y,d); + const float3 Ni = make_float3(*(nptr++)); - // Gauss approx colour weighting. - const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y) + 0.5f); - w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing); + // Bad or missing normals should be ignored + if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f; - // Gauss approx weighting function using point distance - w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w); + // Gauss approx colour weighting. + const uchar4 c = *(cptr++); + w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing); - aX += Xi*w; - nX += Ni*w; - contrib += w; - } + // Gauss approx weighting function using point distance + w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w); - if (contrib > 0.0f) { - nX /= contrib; // Weighted average normal - aX /= contrib; // Weighted average point (centroid) + aX += Xi*w; + nX += Ni*w; + contrib += w; + } - // Note: x and y flipped since output is rotated 90 degrees. - centroid_out(y,x) = make_float4(aX, 0.0f); - normals_out(y,x) = make_float4(nX / length(nX), 0.0f); + if (contrib > 0.0f) { + nX /= contrib; // Weighted average normal + aX /= contrib; // Weighted average point (centroid) + + // Note: x and y flipped since output is rotated 90 degrees. + centroid_out(y,x) = make_float4(aX, colourAsFloat(c0)); + normals_out(y,x) = make_half4(nX / length(nX), 0.0f); + } + } } } @@ -446,13 +474,14 @@ void ftl::cuda::colour_mls_smooth_csr( */ template <int RADIUS> __global__ void mls_aggr_vert_kernel( - TextureObject<uchar4> region, - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, - TextureObject<float4> centroid_in, // Virtual depth map + const uchar4* __restrict__ region, + size_t region_pitch, + const half4* __restrict__ normals_in, + size_t normals_in_pitch, + TextureObject<half4> normals_out, + const float4* __restrict__ centroid_in, // Virtual depth map + size_t centroid_in_pitch, TextureObject<float4> centroid_out, // Accumulated output - TextureObject<uchar4> colour_in, - TextureObject<float> depth_in, float smoothing, float colour_smoothing, ftl::rgbd::Camera camera) { @@ -460,60 +489,65 @@ template <int RADIUS> const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return; - - float3 aX = make_float3(0.0f,0.0f,0.0f); - float3 nX = make_float3(0.0f,0.0f,0.0f); - float contrib = 0.0f; - - float d0 = depth_in.tex2D(x, y); - if (d0 <= camera.minDepth || d0 >= camera.maxDepth) return; - - float3 X = camera.screenToCam((int)(x),(int)(y),d0); - - centroid_out(x,y) = make_float4(0.0f); - normals_out(x,y) = make_float4(0.0f); - - float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f); + if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) { + float3 aX = make_float3(0.0f,0.0f,0.0f); + float3 nX = make_float3(0.0f,0.0f,0.0f); + float contrib = 0.0f; + + const float4 cin = centroid_in[x*centroid_in_pitch+y]; + const float3 A = make_float3(cin); + const uchar4 c0 = floatAsColour(cin.w); - // Cross-Support Neighbourhood - uchar4 base = region.tex2D(x,y); + centroid_out(x,y) = make_float4(0.0f); + normals_out(x,y) = make_half4(0.0f); + + if (A.z > camera.minDepth && A.z < camera.maxDepth) { + //float3 X = camera.screenToCam((int)(x),(int)(y),d0); + + //const uchar4 c0 = colour_in[y*colour_in_pitch+x]; - #pragma unroll - for (int v=-RADIUS; v<=RADIUS; ++v) { - const float d = depth_in.tex2D(x, y+v); - const float3 Xi = camera.screenToCam(x,y+v,d); + // Cross-Support Neighbourhood + const uchar4 base = region[y*region_pitch+x]; - // Note: x and y flipped, input image is rotated. - float3 Ai = make_float3(centroid_in.tex2D(y+v, x)); + const half4* __restrict__ nptr = normals_in + x*normals_in_pitch + y - RADIUS; + const float4* __restrict__ ceptr = (centroid_in + x*centroid_in_pitch + y - RADIUS); - // If outside the cross support range, set weight to 0 to ignore - float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f; + #pragma unroll + for (int v=-RADIUS; v<=RADIUS; ++v) { + // Note: x and y flipped, input image is rotated. + const float4 cin = *(ceptr++); + const float3 Ai = make_float3(cin); + const uchar4 c = floatAsColour(cin.w); - // Note: x and y flipped, input image is rotated. - const float3 Ni = make_float3(normals_in.tex2D(y+v, x)); + // If outside the cross support range, set weight to 0 to ignore + float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f; - // Bad normals should be ignored. - if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f; + // Note: x and y flipped, input image is rotated. + const float3 Ni = make_float3(*(nptr++)); - // Gauss approx colour weighting. - const float4 c = colour_in.tex2D(float(x) + 0.5f, float(y+v) + 0.5f); - w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing); + // Bad normals should be ignored. + if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f; - // Gauss approx weighting function using point distance - w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w); + // Gauss approx colour weighting. + //const uchar4 c = *(cptr+v*colour_in_pitch); + w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing); - aX += Ai*w; - nX += Ni*w; - contrib += w; - } + // Gauss approx weighting function using point distance + w = ftl::cuda::spatialWeighting(A,Ai,A.z*smoothing*w); - // Normalise the summed points and normals - if (contrib > 0.0f) { - nX /= contrib; // Weighted average normal - aX /= contrib; // Weighted average point (centroid) - centroid_out(x,y) = make_float4(aX, 0.0f); - normals_out(x,y) = make_float4(nX / length(nX), 0.0f); + aX += Ai*w; + nX += Ni*w; + contrib += w; + } + + // Normalise the summed points and normals + if (contrib > 0.0f) { + nX /= contrib; // Weighted average normal + aX /= contrib; // Weighted average point (centroid) + centroid_out(x,y) = make_float4(aX, 0.0f); + normals_out(x,y) = make_half4(nX / length(nX), 0.0f); + } + } } } @@ -522,41 +556,40 @@ template <int RADIUS> * field and move the depth value accordingly using the calculated normal. */ __global__ void mls_adjust_depth_kernel( - TextureObject<float4> normals_in, + TextureObject<half4> normals_in, TextureObject<float4> centroid_in, // Virtual depth map - TextureObject<float> depth_in, TextureObject<float> depth_out, ftl::rgbd::Camera camera) { const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x < 0 || y < 0 || x >= depth_out.width() || y >= depth_out.height()) return; + if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) { + const float3 aX = make_float3(centroid_in(x,y)); + const float3 nX = make_float3(normals_in(x,y)); - float3 aX = make_float3(centroid_in(x,y)); - float3 nX = make_float3(normals_in(x,y)); + //float d0 = depth_in.tex2D(x, y); + depth_out(x,y) = aX.z; - float d0 = depth_in.tex2D(x, y); - depth_out(x,y) = d0; - - if (d0 > camera.minDepth && d0 < camera.maxDepth && aX.z > camera.minDepth && aX.z < camera.maxDepth) { - float3 X = camera.screenToCam((int)(x),(int)(y),d0); + if (aX.z > camera.minDepth && aX.z < camera.maxDepth) { + float3 X = camera.screenToCam((int)(x),(int)(y),aX.z); - // Signed-Distance Field function - float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z); + // Signed-Distance Field function + float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z); - // Calculate new point using SDF function to adjust depth (and position) - X = X - nX * fX; - - depth_out(x,y) = X.z; + // Calculate new point using SDF function to adjust depth (and position) + X = X - nX * fX; + + depth_out(x,y) = X.z; + } } } void ftl::cuda::mls_aggr_horiz( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float4> ¢roid_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -566,17 +599,20 @@ void ftl::cuda::mls_aggr_horiz( const ftl::rgbd::Camera &camera, cudaStream_t stream) { - const dim3 gridSize((normals_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + static constexpr int THREADS_X = 16; + static constexpr int THREADS_Y = 16; + + const dim3 gridSize((normals_in.width() + THREADS_X - 1)/THREADS_X, (normals_in.height() + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); switch(radius) { - case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; - case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, centroid_out, colour_in, smoothing, colour_smoothing, camera); break; + case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; + case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break; default: return; } cudaSafeCall( cudaGetLastError() ); @@ -589,29 +625,30 @@ void ftl::cuda::mls_aggr_horiz( void ftl::cuda::mls_aggr_vert( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float4> ¢roid_in, ftl::cuda::TextureObject<float4> ¢roid_out, - ftl::cuda::TextureObject<uchar4> &colour_in, - ftl::cuda::TextureObject<float> &depth_in, float smoothing, float colour_smoothing, int radius, const ftl::rgbd::Camera &camera, cudaStream_t stream) { - const dim3 gridSize((normals_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (normals_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + static constexpr int THREADS_X = 4; + static constexpr int THREADS_Y = 32; + + const dim3 gridSize((normals_out.width() + THREADS_X - 1)/THREADS_X, (normals_out.height() + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); switch(radius) { - case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; - case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, centroid_in, centroid_out, colour_in, depth_in, smoothing, colour_smoothing, camera); break; + case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; + case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break; default: return; } cudaSafeCall( cudaGetLastError() ); @@ -623,17 +660,19 @@ void ftl::cuda::mls_aggr_vert( } void ftl::cuda::mls_adjust_depth( - ftl::cuda::TextureObject<float4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_in, ftl::cuda::TextureObject<float4> ¢roid_in, - ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, const ftl::rgbd::Camera &camera, cudaStream_t stream) { - const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + static constexpr int THREADS_X = 32; + static constexpr int THREADS_Y = 4; + + const dim3 gridSize((depth_out.width() + THREADS_X - 1)/THREADS_X, (depth_out.height() + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); - mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_in, depth_out, camera); + mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_out, camera); cudaSafeCall( cudaGetLastError() ); @@ -654,8 +693,8 @@ void ftl::cuda::mls_adjust_depth( */ template <int SEARCH_RADIUS> __global__ void adaptive_mls_smooth_kernel( - TextureObject<float4> normals_in, - TextureObject<float4> normals_out, + TextureObject<half4> normals_in, + TextureObject<half4> normals_out, TextureObject<float> depth_in, // Virtual depth map TextureObject<float> depth_out, // Accumulated output TextureObject<float> smoothing, @@ -717,12 +756,12 @@ void ftl::cuda::mls_adjust_depth( // depth_out(screen.x,screen.y) = X.z; //} depth_out(x,y) = X.z; - normals_out(x,y) = make_float4(nX / length(nX), 0.0f); + normals_out(x,y) = make_half4(nX / length(nX), 0.0f); } void ftl::cuda::adaptive_mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<float> &smoothing, diff --git a/components/operators/src/mvmls.cpp b/components/operators/src/mvmls.cpp index ab150ed459862a453f5dce98660885a65b3e0f0f..8a74b7a95f92103e55d5bce371ae38886353c1f2 100644 --- a/components/operators/src/mvmls.cpp +++ b/components/operators/src/mvmls.cpp @@ -48,7 +48,7 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda // Make sure we have enough buffers while (normals_horiz_.size() < in.frames.size()) { - normals_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width)); + normals_horiz_.push_back(new ftl::cuda::TextureObject<half4>(size.height, size.width)); centroid_horiz_.push_back(new ftl::cuda::TextureObject<float4>(size.height, size.width)); centroid_vert_.push_back(new ftl::cuda::TextureObject<float4>(size.width, size.height)); contributions_.push_back(new ftl::cuda::TextureObject<float>(size.width, size.height)); @@ -221,7 +221,7 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda ftl::cuda::mls_aggr_horiz( f.createTexture<uchar4>(Channel::Support2), - f.createTexture<float4>(Channel::Normals), + f.createTexture<half4>(Channel::Normals), *normals_horiz_[i], f.createTexture<float>(Channel::Depth), *centroid_horiz_[i], @@ -236,11 +236,9 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda ftl::cuda::mls_aggr_vert( f.getTexture<uchar4>(Channel::Support2), *normals_horiz_[i], - f.getTexture<float4>(Channel::Normals), + f.getTexture<half4>(Channel::Normals), *centroid_horiz_[i], *centroid_vert_[i], - f.getTexture<uchar4>(Channel::Colour), - f.getTexture<float>(Channel::Depth), thresh, col_smooth, radius, @@ -288,8 +286,8 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda // For the corresponding points, combine normals and centroids ftl::cuda::aggregate_sources( - f1.getTexture<float4>(Channel::Normals), - f2.getTexture<float4>(Channel::Normals), + f1.getTexture<half4>(Channel::Normals), + f2.getTexture<half4>(Channel::Normals), *centroid_vert_[i], *centroid_vert_[j], f1.getTexture<float>(Channel::Depth), @@ -324,9 +322,8 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda }*/ ftl::cuda::mls_adjust_depth( - f.getTexture<float4>(Channel::Normals), + f.getTexture<half4>(Channel::Normals), *centroid_vert_[i], - f.getTexture<float>(Channel::Depth), f.createTexture<float>(Channel::Depth2, ftl::rgbd::Format<float>(size)), f.getLeftCamera(), stream diff --git a/components/operators/src/mvmls_cuda.hpp b/components/operators/src/mvmls_cuda.hpp index 1189ca758a4c2765b1a78b2a9f728303ee2b1384..49dd0c1db15e45606f1fe791879c05cf75dfe1f1 100644 --- a/components/operators/src/mvmls_cuda.hpp +++ b/components/operators/src/mvmls_cuda.hpp @@ -53,8 +53,8 @@ void zero_confidence( cudaStream_t stream);*/ void aggregate_sources( - ftl::cuda::TextureObject<float4> &n1, - ftl::cuda::TextureObject<float4> &n2, + ftl::cuda::TextureObject<half4> &n1, + ftl::cuda::TextureObject<half4> &n2, ftl::cuda::TextureObject<float4> &c1, ftl::cuda::TextureObject<float4> &c2, ftl::cuda::TextureObject<float> &depth1, @@ -66,8 +66,8 @@ void aggregate_sources( cudaStream_t stream); void best_sources( - ftl::cuda::TextureObject<float4> &normals1, - ftl::cuda::TextureObject<float4> &normals2, + ftl::cuda::TextureObject<half4> &normals1, + ftl::cuda::TextureObject<half4> &normals2, ftl::cuda::TextureObject<uchar4> &support1, ftl::cuda::TextureObject<uchar4> &suppor2, ftl::cuda::TextureObject<float> &depth1, @@ -88,7 +88,7 @@ void vis_best_sources( cudaStream_t stream); void normalise_aggregations( - ftl::cuda::TextureObject<float4> &norms, + ftl::cuda::TextureObject<half4> &norms, ftl::cuda::TextureObject<float4> ¢s, ftl::cuda::TextureObject<float> &contribs, cudaStream_t stream); diff --git a/components/operators/src/normals.cpp b/components/operators/src/normals.cpp index 3a4e7f5dbe5b1b02ca16ba5e7aa9d21c0b63dd5c..aefd04623e7ef9a6f89c6cbaa7d3b3984542612e 100644 --- a/components/operators/src/normals.cpp +++ b/components/operators/src/normals.cpp @@ -27,7 +27,7 @@ bool Normals::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t st } ftl::cuda::normals( - out.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + out.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), in.getLeftCamera(), stream ); @@ -93,7 +93,7 @@ bool SmoothNormals::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStrea temp_.create(depth.size()); ftl::cuda::normals( - out.createTexture<float4>(Channel::Normals, Format<float4>(depth.size())), + out.createTexture<half4>(Channel::Normals, Format<half4>(depth.size())), temp_, in.createTexture<float>(Channel::Depth), radius, smoothing, diff --git a/components/operators/src/smoothing.cpp b/components/operators/src/smoothing.cpp index c558637085972bdb12f84697131e9617f0e20a9c..49ca3f4b9e76b8f7bebbd28b6b4dfa8fa1b64bb6 100644 --- a/components/operators/src/smoothing.cpp +++ b/components/operators/src/smoothing.cpp @@ -159,8 +159,8 @@ bool SimpleMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t // FIXME: Assume in and out are the same frame. for (int i=0; i<iters; ++i) { ftl::cuda::mls_smooth( - in.createTexture<float4>(Channel::Normals), - temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.createTexture<half4>(Channel::Normals), + temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), thresh, @@ -204,8 +204,8 @@ bool ColourMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t for (int i=0; i<iters; ++i) { if (!crosssup) { ftl::cuda::colour_mls_smooth( - in.createTexture<float4>(Channel::Normals), - temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.createTexture<half4>(Channel::Normals), + temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<uchar4>(Channel::Colour), @@ -218,8 +218,8 @@ bool ColourMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t } else { ftl::cuda::colour_mls_smooth_csr( in.createTexture<uchar4>(Channel::Support1), - in.createTexture<float4>(Channel::Normals), - temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.createTexture<half4>(Channel::Normals), + temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<uchar4>(Channel::Colour), @@ -278,7 +278,7 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s if (aggre) { ftl::cuda::mls_aggr_horiz( in.createTexture<uchar4>(Channel::Support1), - in.createTexture<float4>(Channel::Normals), + in.createTexture<half4>(Channel::Normals), normals_horiz_, in.createTexture<float>(Channel::Depth), centroid_horiz_, @@ -293,11 +293,9 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s ftl::cuda::mls_aggr_vert( in.createTexture<uchar4>(Channel::Support1), normals_horiz_, - in.createTexture<float4>(Channel::Normals), + in.createTexture<half4>(Channel::Normals), centroid_horiz_, centroid_vert_, - in.createTexture<uchar4>(Channel::Colour), - in.createTexture<float>(Channel::Depth), thresh, col_smooth, radius, @@ -306,9 +304,8 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s ); ftl::cuda::mls_adjust_depth( - in.createTexture<float4>(Channel::Normals), + in.createTexture<half4>(Channel::Normals), centroid_vert_, - in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(size)), in.getLeftCamera(), stream @@ -321,8 +318,8 @@ bool AggreMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t s } else { ftl::cuda::colour_mls_smooth_csr( in.createTexture<uchar4>(Channel::Support1), - in.createTexture<float4>(Channel::Normals), - temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.createTexture<half4>(Channel::Normals), + temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<uchar4>(Channel::Colour), @@ -363,8 +360,8 @@ bool AdaptiveMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_ // FIXME: Assume in and out are the same frame. for (int i=0; i<iters; ++i) { ftl::cuda::adaptive_mls_smooth( - in.createTexture<float4>(Channel::Normals), - temp_.createTexture<float4>(Channel::Normals, ftl::rgbd::Format<float4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.createTexture<half4>(Channel::Normals), + temp_.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Depth), temp_.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), in.createTexture<float>(Channel::Smoothing), diff --git a/components/operators/src/smoothing_cuda.hpp b/components/operators/src/smoothing_cuda.hpp index 6681c1a2a0383fdf69c7a62ab0b78346fede5f3a..06c6d713f81ab41e14695bcad21943be7d71e3d4 100644 --- a/components/operators/src/smoothing_cuda.hpp +++ b/components/operators/src/smoothing_cuda.hpp @@ -8,8 +8,8 @@ namespace ftl { namespace cuda { void mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, float smoothing, @@ -18,8 +18,8 @@ void mls_smooth( cudaStream_t stream); void colour_mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -31,8 +31,8 @@ void colour_mls_smooth( void colour_mls_smooth_csr( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -43,17 +43,16 @@ void colour_mls_smooth_csr( cudaStream_t stream); void mls_adjust_depth( - ftl::cuda::TextureObject<float4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_in, ftl::cuda::TextureObject<float4> ¢roid_in, - ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, const ftl::rgbd::Camera &camera, cudaStream_t stream); void mls_aggr_horiz( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float4> ¢roid_out, ftl::cuda::TextureObject<uchar4> &colour_in, @@ -65,12 +64,10 @@ void mls_aggr_horiz( void mls_aggr_vert( ftl::cuda::TextureObject<uchar4> ®ion, - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float4> ¢roid_in, ftl::cuda::TextureObject<float4> ¢roid_out, - ftl::cuda::TextureObject<uchar4> &colour_in, - ftl::cuda::TextureObject<float> &depth_in, float smoothing, float colour_smoothing, int radius, @@ -78,8 +75,8 @@ void mls_aggr_vert( cudaStream_t stream); void adaptive_mls_smooth( - ftl::cuda::TextureObject<float4> &normals_in, - ftl::cuda::TextureObject<float4> &normals_out, + ftl::cuda::TextureObject<half4> &normals_in, + ftl::cuda::TextureObject<half4> &normals_out, ftl::cuda::TextureObject<float> &depth_in, ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<float> &smoothing, diff --git a/components/renderers/cpp/include/ftl/cuda/normals.hpp b/components/renderers/cpp/include/ftl/cuda/normals.hpp index da07582d3155485f32c13d34b38c65d88ed4da73..0ac3d3713b1cafc7710094e1f74231831ab19959 100644 --- a/components/renderers/cpp/include/ftl/cuda/normals.hpp +++ b/components/renderers/cpp/include/ftl/cuda/normals.hpp @@ -8,7 +8,7 @@ namespace ftl { namespace cuda { -void normals(ftl::cuda::TextureObject<float4> &output, +/*void normals(ftl::cuda::TextureObject<float4> &output, ftl::cuda::TextureObject<float4> &temp, ftl::cuda::TextureObject<float4> &input, int radius, @@ -22,17 +22,17 @@ void normals(ftl::cuda::TextureObject<float4> &output, int radius, float smoothing, const ftl::rgbd::Camera &camera, - const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream); + const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream);*/ -void normals(ftl::cuda::TextureObject<float4> &output, - ftl::cuda::TextureObject<float4> &temp, +void normals(ftl::cuda::TextureObject<half4> &output, + ftl::cuda::TextureObject<half4> &temp, ftl::cuda::TextureObject<float> &input, int radius, float smoothing, const ftl::rgbd::Camera &camera, const float3x3 &pose_inv, const float3x3 &pose, cudaStream_t stream); -void normals(ftl::cuda::TextureObject<float4> &output, +void normals(ftl::cuda::TextureObject<half4> &output, ftl::cuda::TextureObject<float> &input, // Integer depth values const ftl::rgbd::Camera &camera, cudaStream_t stream); @@ -42,22 +42,22 @@ void normals_dot(ftl::cuda::TextureObject<float> &output, const ftl::rgbd::Camera &camera, cudaStream_t stream); -void normal_visualise(ftl::cuda::TextureObject<float4> &norm, +void normal_visualise(ftl::cuda::TextureObject<half4> &norm, ftl::cuda::TextureObject<uchar4> &output, const float3 &light, const uchar4 &diffuse, const uchar4 &ambient, cudaStream_t stream); -void cool_blue(ftl::cuda::TextureObject<float4> &norm, +void cool_blue(ftl::cuda::TextureObject<half4> &norm, ftl::cuda::TextureObject<uchar4> &output, const uchar4 &colouring, const float3x3 &pose, cudaStream_t stream); -void normal_filter(ftl::cuda::TextureObject<float4> &norm, +/*void normal_filter(ftl::cuda::TextureObject<half4> &norm, ftl::cuda::TextureObject<float4> &points, const ftl::rgbd::Camera &camera, const float4x4 &pose, - float thresh, cudaStream_t stream); + float thresh, cudaStream_t stream);*/ -void transform_normals(ftl::cuda::TextureObject<float4> &norm, +void transform_normals(ftl::cuda::TextureObject<half4> &norm, const float3x3 &pose, cudaStream_t stream); diff --git a/components/renderers/cpp/include/ftl/cuda/weighting.hpp b/components/renderers/cpp/include/ftl/cuda/weighting.hpp index 61db736d316494b9c20262eff080aacec710ec26..fb3429dd5a58da419f8c2188386c22b91c340169 100644 --- a/components/renderers/cpp/include/ftl/cuda/weighting.hpp +++ b/components/renderers/cpp/include/ftl/cuda/weighting.hpp @@ -44,6 +44,17 @@ __device__ inline float colourDistance(uchar4 a, uchar4 b) { return ch*ch*ch*ch; } +/* + * Alternative colour distance measure + */ + __device__ inline float colourWeighting2(uchar4 a, uchar4 b, float h) { + const float c = float(max(abs(int(a.x)-int(b.x)), max(abs(int(a.y)-int(b.y)), abs(int(a.z)-int(b.z))))); + if (c >= h) return 0.0f; + float ch = c / h; + ch = 1.0f - ch*ch; + return ch*ch*ch*ch; +} + /* * Colour weighting as suggested in: * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014. @@ -71,6 +82,11 @@ __device__ inline float colourDistance(uchar4 a, uchar4 b) { return ch*ch*ch*ch; } + __device__ inline float simpleColourWeighting(uchar4 a, uchar4 b, float h) { + const float w = max(fabsf(float(a.x)-float(b.x)), max(fabsf(float(a.y)-float(b.y)), fabsf(float(a.z)-float(b.z)))); + return 1.0f - min(1.0f, w/h); +} + } } diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp index 38e244d34c2f1c7428c03d2aaae637dfbc603e3e..c346b1809182aa0815e7140a54efbda6397c385b 100644 --- a/components/renderers/cpp/src/CUDARender.cpp +++ b/components/renderers/cpp/src/CUDARender.cpp @@ -199,7 +199,7 @@ void CUDARender::__reprojectChannel(ftl::rgbd::Frame &output, ftl::codecs::Chann f.createTexture<float>(Channel::Depth), output.getTexture<float>(Channel::Depth), f.createTexture<short>(Channel::Weights), - (output.hasChannel(Channel::Normals)) ? &output.createTexture<float4>(Channel::Normals) : nullptr, + (output.hasChannel(Channel::Normals)) ? &output.createTexture<half4>(Channel::Normals) : nullptr, temp_.createTexture<typename AccumSelector<T>::type>(AccumSelector<T>::channel), temp_.getTexture<int>(Channel::Contribution), params_, @@ -394,8 +394,8 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre //filters_->filter(out, src, stream); // Generate normals for final virtual image - ftl::cuda::normals(out.createTexture<float4>(Channel::Normals, Format<float4>(params_.camera.width, params_.camera.height)), - temp_.createTexture<float4>(Channel::Normals), + ftl::cuda::normals(out.createTexture<half4>(Channel::Normals, Format<half4>(params_.camera.width, params_.camera.height)), + temp_.createTexture<half4>(Channel::Normals), out.createTexture<float>(Channel::Depth), value("normal_radius", 1), value("normal_smoothing", 0.02f), params_.camera, pose_.getFloat3x3(), poseInverse_.getFloat3x3(), stream_); @@ -495,7 +495,7 @@ void CUDARender::_allocateChannels(ftl::rgbd::Frame &out) { temp_.create<GpuMat>(Channel::Contribution, Format<int>(camera.width, camera.height)); temp_.create<GpuMat>(Channel::Depth, Format<int>(camera.width, camera.height)); temp_.create<GpuMat>(Channel::Depth2, Format<int>(camera.width, camera.height)); - temp_.create<GpuMat>(Channel::Normals, Format<float4>(camera.width, camera.height)); + temp_.create<GpuMat>(Channel::Normals, Format<half4>(camera.width, camera.height)); temp_.create<GpuMat>(Channel::Weights, Format<float>(camera.width, camera.height)); temp_.createTexture<int>(Channel::Depth); } @@ -558,7 +558,7 @@ void CUDARender::_postprocessColours(ftl::rgbd::Frame &out) { auto col = parseCUDAColour(value("cool_effect_colour", std::string("#2222ff"))); ftl::cuda::cool_blue( - out.getTexture<float4>(Channel::Normals), + out.getTexture<half4>(Channel::Normals), out.getTexture<uchar4>(Channel::Colour), col, pose, stream_ @@ -599,7 +599,7 @@ void CUDARender::_renderNormals(ftl::rgbd::Frame &out) { // Visualise normals to RGBA out.create<GpuMat>(Channel::ColourNormals, Format<uchar4>(params_.camera.width, params_.camera.height)).setTo(cv::Scalar(0,0,0,0), cvstream); - ftl::cuda::normal_visualise(out.getTexture<float4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals), + ftl::cuda::normal_visualise(out.getTexture<half4>(Channel::Normals), out.createTexture<uchar4>(Channel::ColourNormals), light_pos_, light_diffuse_, light_ambient_, stream_); diff --git a/components/renderers/cpp/src/normals.cu b/components/renderers/cpp/src/normals.cu index eb2a8fc892562ac275b0d6dd1ab0229b02c34b12..81ee7638b2df2cb3193dacdcfb679564818dc4bb 100644 --- a/components/renderers/cpp/src/normals.cu +++ b/components/renderers/cpp/src/normals.cu @@ -4,7 +4,7 @@ #define T_PER_BLOCK 16 #define MINF __int_as_float(0xff800000) -__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, +/*__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, ftl::cuda::TextureObject<float4> input) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; @@ -29,13 +29,13 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, } } } -} +}*/ __device__ inline bool isValid(const ftl::rgbd::Camera &camera, const float3 &d) { return d.z >= camera.minDepth && d.z <= camera.maxDepth; } -__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, +/*__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, ftl::cuda::TextureObject<int> input, ftl::rgbd::Camera camera, float3x3 pose) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; @@ -61,16 +61,16 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, } } } -} +}*/ -__global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, +__global__ void computeNormals_kernel(ftl::cuda::TextureObject<half4> output, ftl::cuda::TextureObject<float> input, ftl::rgbd::Camera camera) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; if(x >= input.width() || y >= input.height()) return; - output(x,y) = make_float4(0, 0, 0, 0); + output(x,y) = make_half4(0, 0, 0, 0); if(x > 0 && x < input.width()-1 && y > 0 && y < input.height()-1) { const float3 CC = camera.screenToCam(x+0, y+0, input.tex2D((int)x+0, (int)y+0)); @@ -85,13 +85,13 @@ __global__ void computeNormals_kernel(ftl::cuda::TextureObject<float4> output, const float l = length(n); if(l > 0.0f) { - output(x,y) = make_float4((n/-l), 1.0f); + output(x,y) = make_half4((n/-l), 1.0f); } } } } -template <int RADIUS> +/*template <int RADIUS> __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, ftl::cuda::TextureObject<float4> output, ftl::cuda::TextureObject<float4> points, @@ -134,9 +134,9 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, nsum /= length(nsum); output(x,y) = (contrib > 0.0f) ? make_float4(nsum, dot(nsum, ray)) : make_float4(0.0f); -} +}*/ -template <int RADIUS> +/*template <int RADIUS> __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, ftl::cuda::TextureObject<float4> output, ftl::cuda::TextureObject<int> depth, @@ -179,9 +179,9 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, nsum /= length(nsum); output(x,y) = (contrib > 0.0f) ? make_float4(pose*nsum, 1.0f) : make_float4(0.0f); -} +}*/ -template <> +/*template <> __global__ void smooth_normals_kernel<0>(ftl::cuda::TextureObject<float4> norms, ftl::cuda::TextureObject<float4> output, ftl::cuda::TextureObject<int> depth, @@ -206,11 +206,11 @@ __global__ void smooth_normals_kernel<0>(ftl::cuda::TextureObject<float4> norms, const float4 n = norms.tex2D((int)x,(int)y); output(x,y) = n; -} +}*/ template <int RADIUS> -__global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, - ftl::cuda::TextureObject<float4> output, +__global__ void smooth_normals_kernel(ftl::cuda::TextureObject<half4> norms, + ftl::cuda::TextureObject<half4> output, ftl::cuda::TextureObject<float> depth, ftl::rgbd::Camera camera, float3x3 pose, float smoothing) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; @@ -222,7 +222,7 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, float3 nsum = make_float3(0.0f); float contrib = 0.0f; - output(x,y) = make_float4(0.0f,0.0f,0.0f,0.0f); + output(x,y) = make_half4(0.0f,0.0f,0.0f,0.0f); if (p0.z < camera.minDepth || p0.z > camera.maxDepth) return; @@ -234,7 +234,7 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, //const float s = 1.0f; //if (s > 0.0f) { - const float4 n = norms.tex2D((int)x+u,(int)y+v); + const float4 n = make_float4(norms.tex2D((int)x+u,(int)y+v)); if (n.w > 0.0f) { nsum += make_float3(n) * s; contrib += s; @@ -250,10 +250,10 @@ __global__ void smooth_normals_kernel(ftl::cuda::TextureObject<float4> norms, nsum /= contrib; nsum /= length(nsum); - output(x,y) = (contrib > 0.0f) ? make_float4(pose*nsum, 1.0f) : make_float4(0.0f); + output(x,y) = (contrib > 0.0f) ? make_half4(pose*nsum, 1.0f) : make_half4(0.0f); } -void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, +/*void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, ftl::cuda::TextureObject<float4> &temp, ftl::cuda::TextureObject<float4> &input, int radius, @@ -309,10 +309,10 @@ void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, cudaSafeCall(cudaDeviceSynchronize()); //cutilCheckMsg(__FUNCTION__); #endif -} +}*/ -void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, - ftl::cuda::TextureObject<float4> &temp, +void ftl::cuda::normals(ftl::cuda::TextureObject<half4> &output, + ftl::cuda::TextureObject<half4> &temp, ftl::cuda::TextureObject<float> &input, int radius, float smoothing, @@ -339,7 +339,7 @@ void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, #endif } -void ftl::cuda::normals(ftl::cuda::TextureObject<float4> &output, +void ftl::cuda::normals(ftl::cuda::TextureObject<half4> &output, ftl::cuda::TextureObject<float> &input, const ftl::rgbd::Camera &camera, cudaStream_t stream) { @@ -409,7 +409,7 @@ void ftl::cuda::normals_dot(ftl::cuda::TextureObject<float> &output, //============================================================================== -__global__ void vis_normals_kernel(ftl::cuda::TextureObject<float4> norm, +__global__ void vis_normals_kernel(ftl::cuda::TextureObject<half4> norm, ftl::cuda::TextureObject<uchar4> output, float3 direction, uchar4 diffuse, uchar4 ambient) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; @@ -432,7 +432,7 @@ __global__ void vis_normals_kernel(ftl::cuda::TextureObject<float4> norm, min(255.0f, diffuse.z*d + ambient.z), 255); } -void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<float4> &norm, +void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<half4> &norm, ftl::cuda::TextureObject<uchar4> &output, const float3 &light, const uchar4 &diffuse, const uchar4 &ambient, cudaStream_t stream) { @@ -451,7 +451,7 @@ void ftl::cuda::normal_visualise(ftl::cuda::TextureObject<float4> &norm, //============================================================================== -__global__ void cool_blue_kernel(ftl::cuda::TextureObject<float4> norm, +__global__ void cool_blue_kernel(ftl::cuda::TextureObject<half4> norm, ftl::cuda::TextureObject<uchar4> output, uchar4 colouring, float3x3 pose) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; @@ -476,7 +476,7 @@ __global__ void cool_blue_kernel(ftl::cuda::TextureObject<float4> norm, min(255.0f, colouring.z*d + original.z), 255); } -void ftl::cuda::cool_blue(ftl::cuda::TextureObject<float4> &norm, +void ftl::cuda::cool_blue(ftl::cuda::TextureObject<half4> &norm, ftl::cuda::TextureObject<uchar4> &output, const uchar4 &colouring, const float3x3 &pose, cudaStream_t stream) { @@ -495,7 +495,7 @@ void ftl::cuda::cool_blue(ftl::cuda::TextureObject<float4> &norm, //============================================================================== -__global__ void filter_normals_kernel(ftl::cuda::TextureObject<float4> norm, +/*__global__ void filter_normals_kernel(ftl::cuda::TextureObject<float4> norm, ftl::cuda::TextureObject<float4> output, ftl::rgbd::Camera camera, float4x4 pose, float thresh) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; @@ -535,11 +535,11 @@ void ftl::cuda::normal_filter(ftl::cuda::TextureObject<float4> &norm, cudaSafeCall(cudaDeviceSynchronize()); //cutilCheckMsg(__FUNCTION__); #endif -} +}*/ //============================================================================== -__global__ void transform_normals_kernel(ftl::cuda::TextureObject<float4> norm, +__global__ void transform_normals_kernel(ftl::cuda::TextureObject<half4> norm, float3x3 pose) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; @@ -548,10 +548,10 @@ __global__ void transform_normals_kernel(ftl::cuda::TextureObject<float4> norm, float3 normal = pose * make_float3(norm.tex2D((int)x,(int)y)); normal /= length(normal); - norm(x,y) = make_float4(normal, 0.0f); + norm(x,y) = make_half4(normal, 0.0f); } -void ftl::cuda::transform_normals(ftl::cuda::TextureObject<float4> &norm, +void ftl::cuda::transform_normals(ftl::cuda::TextureObject<half4> &norm, const float3x3 &pose, cudaStream_t stream) { diff --git a/components/renderers/cpp/src/reprojection.cu b/components/renderers/cpp/src/reprojection.cu index 2012642a80729dec2c7dd88de9003e409dd6560c..ce0745003c5415cdf07e3941c0e092b60f4b65cc 100644 --- a/components/renderers/cpp/src/reprojection.cu +++ b/components/renderers/cpp/src/reprojection.cu @@ -130,7 +130,7 @@ __device__ inline auto getInput(TextureObject<A> &in, const float2 &screen, floa return in.tex2D(screen.x*inSX, screen.y*inSY); } -__device__ float weightByNormal(TextureObject<float4> &normals, int x, int y, const float3x3 &transformR, const float2 &screenPos, const ftl::rgbd::Camera &camera) { +__device__ float weightByNormal(TextureObject<half4> &normals, int x, int y, const float3x3 &transformR, const float2 &screenPos, const ftl::rgbd::Camera &camera) { // Calculate the dot product of surface normal and camera ray const float3 n = transformR * make_float3(normals.tex2D(x, y)); float3 ray = camera.screenToCam(screenPos.x, screenPos.y, 1.0f); @@ -156,7 +156,7 @@ __global__ void reprojection_kernel( TextureObject<float> depth_src, TextureObject<float> depth_in, // Virtual depth map TextureObject<short> weights, - TextureObject<float4> normals, + TextureObject<half4> normals, TextureObject<B> out, // Accumulated output TextureObject<int> contrib, Parameters params, @@ -258,7 +258,7 @@ void ftl::cuda::reproject( TextureObject<float> &depth_src, // Original 3D points TextureObject<float> &depth_in, // Virtual depth map TextureObject<short> &weights, - TextureObject<float4> *normals, + TextureObject<half4> *normals, TextureObject<B> &out, // Accumulated output TextureObject<int> &contrib, const Parameters ¶ms, @@ -340,7 +340,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &depth_src, // Original 3D points ftl::cuda::TextureObject<float> &depth_in, // Virtual depth map ftl::cuda::TextureObject<short> &weights, - ftl::cuda::TextureObject<float4> *normals, + ftl::cuda::TextureObject<half4> *normals, ftl::cuda::TextureObject<float4> &out, // Accumulated output ftl::cuda::TextureObject<int> &contrib, const ftl::render::Parameters ¶ms, @@ -353,7 +353,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &depth_src, // Original 3D points ftl::cuda::TextureObject<float> &depth_in, // Virtual depth map ftl::cuda::TextureObject<short> &weights, - ftl::cuda::TextureObject<float4> *normals, + ftl::cuda::TextureObject<half4> *normals, ftl::cuda::TextureObject<float> &out, // Accumulated output ftl::cuda::TextureObject<int> &contrib, const ftl::render::Parameters ¶ms, @@ -366,7 +366,7 @@ template void ftl::cuda::reproject( ftl::cuda::TextureObject<float> &depth_src, // Original 3D points ftl::cuda::TextureObject<float> &depth_in, // Virtual depth map ftl::cuda::TextureObject<short> &weights, - ftl::cuda::TextureObject<float4> *normals, + ftl::cuda::TextureObject<half4> *normals, ftl::cuda::TextureObject<float4> &out, // Accumulated output ftl::cuda::TextureObject<int> &contrib, const ftl::render::Parameters ¶ms, diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp index 285f2e651394913d8b10dba603a04b74090fe52a..da47b6fd3b2ef8ae0809728d27d876338207350f 100644 --- a/components/renderers/cpp/src/splatter_cuda.hpp +++ b/components/renderers/cpp/src/splatter_cuda.hpp @@ -48,7 +48,7 @@ namespace cuda { float alpha, cudaStream_t stream); - void dibr_merge( + /*void dibr_merge( ftl::cuda::TextureObject<float4> &points, ftl::cuda::TextureObject<float4> &normals, ftl::cuda::TextureObject<int> &depth, @@ -60,7 +60,7 @@ namespace cuda { ftl::cuda::TextureObject<float4> &points, ftl::cuda::TextureObject<int> &depth, ftl::render::Parameters params, - cudaStream_t stream); + cudaStream_t stream);*/ void dibr_merge( ftl::cuda::TextureObject<float> &depth, @@ -77,15 +77,15 @@ namespace cuda { ftl::render::Parameters params, cudaStream_t stream); - template <typename T> + /*template <typename T> void splat( - ftl::cuda::TextureObject<float4> &normals, + ftl::cuda::TextureObject<half4> &normals, ftl::cuda::TextureObject<float> &density, ftl::cuda::TextureObject<T> &colour_in, ftl::cuda::TextureObject<int> &depth_in, // Virtual depth map ftl::cuda::TextureObject<float> &depth_out, ftl::cuda::TextureObject<T> &colour_out, - const ftl::render::Parameters ¶ms, cudaStream_t stream); + const ftl::render::Parameters ¶ms, cudaStream_t stream);*/ template <typename A, typename B> void dibr_attribute( @@ -102,7 +102,7 @@ namespace cuda { ftl::cuda::TextureObject<float> &depth_src, // Original 3D points ftl::cuda::TextureObject<float> &depth_in, // Virtual depth map ftl::cuda::TextureObject<short> &weights, - ftl::cuda::TextureObject<float4> *normals, + ftl::cuda::TextureObject<half4> *normals, ftl::cuda::TextureObject<B> &out, // Accumulated output ftl::cuda::TextureObject<int> &contrib, const ftl::render::Parameters ¶ms,