diff --git a/components/renderers/cpp/CMakeLists.txt b/components/renderers/cpp/CMakeLists.txt index 5346849e22b23a31471ca6b3a1b52dc8eb37209d..fe93b78bb84000a5a499eda8dd136a5f7c153fae 100644 --- a/components/renderers/cpp/CMakeLists.txt +++ b/components/renderers/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ target_include_directories(ftlrender PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/ext/nanogui/include> $<INSTALL_INTERFACE:include> PRIVATE src) -target_link_libraries(ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) +target_link_libraries(ftlrender ftlrgbd ftlcommon cudatl Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) target_precompile_headers(ftlrender REUSE_FROM ftldata) diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu index 1fda3275953bccf54be24cc4b68d7e046d30f6cf..3b3e4d3bf575f3d027b3edc36114cbd2eb3ae3c4 100644 --- a/components/renderers/cpp/src/screen.cu +++ b/components/renderers/cpp/src/screen.cu @@ -2,7 +2,7 @@ #include "splatter_cuda.hpp" #include <ftl/rgbd/camera.hpp> #include <ftl/cuda_common.hpp> -#include <ftl/cuda/fixed.hpp> +#include <cudatl/fixed.hpp> using ftl::rgbd::Camera; using ftl::cuda::TextureObject; @@ -66,7 +66,7 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters screenPos = make_float3(30000,30000,0); screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); - depth_out[y*pitch2+x] = float2fixed<10>(screenPos.z); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(screenPos.z); } } @@ -136,7 +136,7 @@ void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &de screenPos = make_uint2(30000,30000); screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); - depth_out[y*pitch2+x] = float2fixed<10>(camPos.z); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(camPos.z); } } diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu index ff20436a95fe10f27fe9b881146a7ffe6f3e4d91..4c8770bdbaeabbfa8e2288c1d15a3f7a3aa0e27c 100644 --- a/components/renderers/cpp/src/triangle_render.cu +++ b/components/renderers/cpp/src/triangle_render.cu @@ -253,7 +253,7 @@ __global__ void reverse_check_kernel( if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) { float d2 = depth_original[oy*opitch4+ox]; if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*0.001f)) break; - d += 0.001f; + d += 0.002f; } else break; } diff --git a/lib/cudatl/include/cudatl/fixed.hpp b/lib/cudatl/include/cudatl/fixed.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3009f180c3bfb674a7a06cb56a05b61d7cfb3ecb --- /dev/null +++ b/lib/cudatl/include/cudatl/fixed.hpp @@ -0,0 +1,17 @@ +#ifndef _CUDATL_FIXED_HPP_ +#define _CUDATL_FIXED_HPP_ + +namespace cudatl +{ + +template <int FRAC> +__device__ inline float fixed2float(short v); + +template <int FRAC> +__device__ inline short float2fixed(float v); + +} + +#include <cudatl/impl/fixed.hpp> + +#endif diff --git a/lib/cudatl/include/cudatl/halfwarp.hpp b/lib/cudatl/include/cudatl/halfwarp.hpp index 3085381ae4d6b8ca33f2e690232e7204828f6832..c615fe73507b21a41f9bc7b2acf46955eb17c74b 100644 --- a/lib/cudatl/include/cudatl/halfwarp.hpp +++ b/lib/cudatl/include/cudatl/halfwarp.hpp @@ -3,15 +3,18 @@ #include <cuda_runtime.h> -namespace cudatl { +namespace cudatl +{ static constexpr int HALF_WARP_SIZE = 16; static constexpr unsigned int HALF_MASK1 = 0xFFFF0000; static constexpr unsigned int HALF_MASK2 = 0x0000FFFF; template <typename T> -__device__ inline T halfWarpMin(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMin(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -19,8 +22,10 @@ __device__ inline T halfWarpMin(T e) { } template <typename T> -__device__ inline T halfWarpMax(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMax(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -28,8 +33,10 @@ __device__ inline T halfWarpMax(T e) { } template <typename T> -__device__ inline T halfWarpSum(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpSum(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } diff --git a/lib/cudatl/include/cudatl/host_utility.hpp b/lib/cudatl/include/cudatl/host_utility.hpp index 39f066c3573f10db41a0d34b0bcbc0a93d93a971..16019260f89fe1cc643544e99a7a5fd8fb49c69d 100644 --- a/lib/cudatl/include/cudatl/host_utility.hpp +++ b/lib/cudatl/include/cudatl/host_utility.hpp @@ -4,9 +4,11 @@ #include <cuda_runtime.hpp> #include <string> -namespace cudatl { +namespace cudatl +{ -inline safeCall(cudaError_t e) { +inline safeCall(cudaError_t e) +{ if (e != cudaSuccess) throw new std::exception(std::string("Cuda Error "+std::to_string(int(e)))); } diff --git a/lib/cudatl/include/cudatl/impl/fixed.hpp b/lib/cudatl/include/cudatl/impl/fixed.hpp new file mode 100644 index 0000000000000000000000000000000000000000..289eafbdf72e4fe057e3bc4d2934bb157706f0a4 --- /dev/null +++ b/lib/cudatl/include/cudatl/impl/fixed.hpp @@ -0,0 +1,11 @@ +template <int FRAC> +__device__ inline float cudatl::fixed2float(short v) +{ + return float(v) / float(1 << FRAC); +} + +template <int FRAC> +__device__ inline short cudatl::float2fixed(float v) +{ + return short(v * float(1 << FRAC)); +} \ No newline at end of file diff --git a/lib/cudatl/include/cudatl/memory.hpp b/lib/cudatl/include/cudatl/memory.hpp index 3b249dc043e1ad09365a8d8662b3280dc352710e..a46402a07de4ede20c43a46792dc524fd098c232 100644 --- a/lib/cudatl/include/cudatl/memory.hpp +++ b/lib/cudatl/include/cudatl/memory.hpp @@ -3,25 +3,27 @@ #include <cudatl/host_utility.hpp> -namespace cudatl { +namespace cudatl +{ template <typename T> -T *allocate(size_t size) { -#ifdef USE_GPU +T *allocate(size_t size) +{ T *ptr; cudatl::safeCall(cudaMalloc(&ptr, size*sizeof(T))); return ptr; -#else - return new T[size]; -#endif } template <typename T> -T *allocate(size_t width, size_t height, uint &pitch) { - if (width == 1 || height == 1) { +T *allocate(size_t width, size_t height, uint &pitch) +{ + if (width == 1 || height == 1) + { pitch = width; return allocateMemory<T>((width > height) ? width : height); - } else { + } + else + { T *ptr; size_t ptmp; cudatl::safeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height)); @@ -31,7 +33,8 @@ T *allocate(size_t width, size_t height, uint &pitch) { } template <typename T> -void free(T *ptr) { +void free(T *ptr) +{ cudatl::safeCall(cudaFree(ptr)); } diff --git a/lib/cudatl/include/cudatl/warp.hpp b/lib/cudatl/include/cudatl/warp.hpp index 1a41181e6ec5ab2478dcfa7abcfedc8079d0b0b9..6a8dd898d77058d1f16ffc95fc0350efefd927b5 100644 --- a/lib/cudatl/include/cudatl/warp.hpp +++ b/lib/cudatl/include/cudatl/warp.hpp @@ -5,14 +5,17 @@ #define __cuda__ __host__ __device__ -namespace cudatl { +namespace cudatl +{ static constexpr int WARP_SIZE = 32; static constexpr unsigned int FULL_MASK = 0xFFFFFFFF; template <typename T> -__device__ inline T warpMin(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMin(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -20,8 +23,10 @@ __device__ inline T warpMin(T e) { } template <typename T> -__device__ inline T warpMax(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMax(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -29,8 +34,10 @@ __device__ inline T warpMax(T e) { } template <typename T> -__device__ inline T warpSum(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpSum(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } @@ -43,9 +50,11 @@ __device__ inline T warpSum(T e) { * TODO: This could be more efficient, perhaps with _shfl_XXX */ template <typename T> -inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) { +inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) +{ const int thread = tix%32; - for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) { + for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) + { __syncwarp(); const uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; __syncwarp();