From ecb62fbe10526dafb48e69d57d36d4923748fade Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nwpope@utu.fi> Date: Sun, 1 Nov 2020 21:49:54 +0200 Subject: [PATCH] Use cudatl --- components/renderers/cpp/CMakeLists.txt | 2 +- components/renderers/cpp/src/screen.cu | 6 ++--- .../renderers/cpp/src/triangle_render.cu | 2 +- lib/cudatl/include/cudatl/fixed.hpp | 17 ++++++++++++ lib/cudatl/include/cudatl/halfwarp.hpp | 21 ++++++++++----- lib/cudatl/include/cudatl/host_utility.hpp | 6 +++-- lib/cudatl/include/cudatl/impl/fixed.hpp | 11 ++++++++ lib/cudatl/include/cudatl/memory.hpp | 23 +++++++++------- lib/cudatl/include/cudatl/warp.hpp | 27 ++++++++++++------- 9 files changed, 82 insertions(+), 33 deletions(-) create mode 100644 lib/cudatl/include/cudatl/fixed.hpp create mode 100644 lib/cudatl/include/cudatl/impl/fixed.hpp diff --git a/components/renderers/cpp/CMakeLists.txt b/components/renderers/cpp/CMakeLists.txt index 5346849e2..fe93b78bb 100644 --- a/components/renderers/cpp/CMakeLists.txt +++ b/components/renderers/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ target_include_directories(ftlrender PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/ext/nanogui/include> $<INSTALL_INTERFACE:include> PRIVATE src) -target_link_libraries(ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) +target_link_libraries(ftlrender ftlrgbd ftlcommon cudatl Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) target_precompile_headers(ftlrender REUSE_FROM ftldata) diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu index 1fda32759..3b3e4d3bf 100644 --- a/components/renderers/cpp/src/screen.cu +++ b/components/renderers/cpp/src/screen.cu @@ -2,7 +2,7 @@ #include "splatter_cuda.hpp" #include <ftl/rgbd/camera.hpp> #include <ftl/cuda_common.hpp> -#include <ftl/cuda/fixed.hpp> +#include <cudatl/fixed.hpp> using ftl::rgbd::Camera; using ftl::cuda::TextureObject; @@ -66,7 +66,7 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters screenPos = make_float3(30000,30000,0); screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); - depth_out[y*pitch2+x] = float2fixed<10>(screenPos.z); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(screenPos.z); } } @@ -136,7 +136,7 @@ void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &de screenPos = make_uint2(30000,30000); screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); - depth_out[y*pitch2+x] = float2fixed<10>(camPos.z); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(camPos.z); } } diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu index ff20436a9..4c8770bdb 100644 --- a/components/renderers/cpp/src/triangle_render.cu +++ b/components/renderers/cpp/src/triangle_render.cu @@ -253,7 +253,7 @@ __global__ void reverse_check_kernel( if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) { float d2 = depth_original[oy*opitch4+ox]; if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*0.001f)) break; - d += 0.001f; + d += 0.002f; } else break; } diff --git a/lib/cudatl/include/cudatl/fixed.hpp b/lib/cudatl/include/cudatl/fixed.hpp new file mode 100644 index 000000000..3009f180c --- /dev/null +++ b/lib/cudatl/include/cudatl/fixed.hpp @@ -0,0 +1,17 @@ +#ifndef _CUDATL_FIXED_HPP_ +#define _CUDATL_FIXED_HPP_ + +namespace cudatl +{ + +template <int FRAC> +__device__ inline float fixed2float(short v); + +template <int FRAC> +__device__ inline short float2fixed(float v); + +} + +#include <cudatl/impl/fixed.hpp> + +#endif diff --git a/lib/cudatl/include/cudatl/halfwarp.hpp b/lib/cudatl/include/cudatl/halfwarp.hpp index 3085381ae..c615fe735 100644 --- a/lib/cudatl/include/cudatl/halfwarp.hpp +++ b/lib/cudatl/include/cudatl/halfwarp.hpp @@ -3,15 +3,18 @@ #include <cuda_runtime.h> -namespace cudatl { +namespace cudatl +{ static constexpr int HALF_WARP_SIZE = 16; static constexpr unsigned int HALF_MASK1 = 0xFFFF0000; static constexpr unsigned int HALF_MASK2 = 0x0000FFFF; template <typename T> -__device__ inline T halfWarpMin(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMin(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -19,8 +22,10 @@ __device__ inline T halfWarpMin(T e) { } template <typename T> -__device__ inline T halfWarpMax(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMax(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -28,8 +33,10 @@ __device__ inline T halfWarpMax(T e) { } template <typename T> -__device__ inline T halfWarpSum(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpSum(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } diff --git a/lib/cudatl/include/cudatl/host_utility.hpp b/lib/cudatl/include/cudatl/host_utility.hpp index 39f066c35..16019260f 100644 --- a/lib/cudatl/include/cudatl/host_utility.hpp +++ b/lib/cudatl/include/cudatl/host_utility.hpp @@ -4,9 +4,11 @@ #include <cuda_runtime.hpp> #include <string> -namespace cudatl { +namespace cudatl +{ -inline safeCall(cudaError_t e) { +inline safeCall(cudaError_t e) +{ if (e != cudaSuccess) throw new std::exception(std::string("Cuda Error "+std::to_string(int(e)))); } diff --git a/lib/cudatl/include/cudatl/impl/fixed.hpp b/lib/cudatl/include/cudatl/impl/fixed.hpp new file mode 100644 index 000000000..289eafbdf --- /dev/null +++ b/lib/cudatl/include/cudatl/impl/fixed.hpp @@ -0,0 +1,11 @@ +template <int FRAC> +__device__ inline float cudatl::fixed2float(short v) +{ + return float(v) / float(1 << FRAC); +} + +template <int FRAC> +__device__ inline short cudatl::float2fixed(float v) +{ + return short(v * float(1 << FRAC)); +} \ No newline at end of file diff --git a/lib/cudatl/include/cudatl/memory.hpp b/lib/cudatl/include/cudatl/memory.hpp index 3b249dc04..a46402a07 100644 --- a/lib/cudatl/include/cudatl/memory.hpp +++ b/lib/cudatl/include/cudatl/memory.hpp @@ -3,25 +3,27 @@ #include <cudatl/host_utility.hpp> -namespace cudatl { +namespace cudatl +{ template <typename T> -T *allocate(size_t size) { -#ifdef USE_GPU +T *allocate(size_t size) +{ T *ptr; cudatl::safeCall(cudaMalloc(&ptr, size*sizeof(T))); return ptr; -#else - return new T[size]; -#endif } template <typename T> -T *allocate(size_t width, size_t height, uint &pitch) { - if (width == 1 || height == 1) { +T *allocate(size_t width, size_t height, uint &pitch) +{ + if (width == 1 || height == 1) + { pitch = width; return allocateMemory<T>((width > height) ? width : height); - } else { + } + else + { T *ptr; size_t ptmp; cudatl::safeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height)); @@ -31,7 +33,8 @@ T *allocate(size_t width, size_t height, uint &pitch) { } template <typename T> -void free(T *ptr) { +void free(T *ptr) +{ cudatl::safeCall(cudaFree(ptr)); } diff --git a/lib/cudatl/include/cudatl/warp.hpp b/lib/cudatl/include/cudatl/warp.hpp index 1a41181e6..6a8dd898d 100644 --- a/lib/cudatl/include/cudatl/warp.hpp +++ b/lib/cudatl/include/cudatl/warp.hpp @@ -5,14 +5,17 @@ #define __cuda__ __host__ __device__ -namespace cudatl { +namespace cudatl +{ static constexpr int WARP_SIZE = 32; static constexpr unsigned int FULL_MASK = 0xFFFFFFFF; template <typename T> -__device__ inline T warpMin(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMin(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -20,8 +23,10 @@ __device__ inline T warpMin(T e) { } template <typename T> -__device__ inline T warpMax(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMax(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -29,8 +34,10 @@ __device__ inline T warpMax(T e) { } template <typename T> -__device__ inline T warpSum(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpSum(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } @@ -43,9 +50,11 @@ __device__ inline T warpSum(T e) { * TODO: This could be more efficient, perhaps with _shfl_XXX */ template <typename T> -inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) { +inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) +{ const int thread = tix%32; - for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) { + for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) + { __syncwarp(); const uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; __syncwarp(); -- GitLab