From ecb62fbe10526dafb48e69d57d36d4923748fade Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Sun, 1 Nov 2020 21:49:54 +0200
Subject: [PATCH] Use cudatl

---
 components/renderers/cpp/CMakeLists.txt       |  2 +-
 components/renderers/cpp/src/screen.cu        |  6 ++---
 .../renderers/cpp/src/triangle_render.cu      |  2 +-
 lib/cudatl/include/cudatl/fixed.hpp           | 17 ++++++++++++
 lib/cudatl/include/cudatl/halfwarp.hpp        | 21 ++++++++++-----
 lib/cudatl/include/cudatl/host_utility.hpp    |  6 +++--
 lib/cudatl/include/cudatl/impl/fixed.hpp      | 11 ++++++++
 lib/cudatl/include/cudatl/memory.hpp          | 23 +++++++++-------
 lib/cudatl/include/cudatl/warp.hpp            | 27 ++++++++++++-------
 9 files changed, 82 insertions(+), 33 deletions(-)
 create mode 100644 lib/cudatl/include/cudatl/fixed.hpp
 create mode 100644 lib/cudatl/include/cudatl/impl/fixed.hpp

diff --git a/components/renderers/cpp/CMakeLists.txt b/components/renderers/cpp/CMakeLists.txt
index 5346849e2..fe93b78bb 100644
--- a/components/renderers/cpp/CMakeLists.txt
+++ b/components/renderers/cpp/CMakeLists.txt
@@ -28,7 +28,7 @@ target_include_directories(ftlrender PUBLIC
 	$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/ext/nanogui/include>
 	$<INSTALL_INTERFACE:include>
 	PRIVATE src)
-target_link_libraries(ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS})
+target_link_libraries(ftlrender ftlrgbd ftlcommon cudatl Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS})
 
 target_precompile_headers(ftlrender REUSE_FROM ftldata)
 
diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu
index 1fda32759..3b3e4d3bf 100644
--- a/components/renderers/cpp/src/screen.cu
+++ b/components/renderers/cpp/src/screen.cu
@@ -2,7 +2,7 @@
 #include "splatter_cuda.hpp"
 #include <ftl/rgbd/camera.hpp>
 #include <ftl/cuda_common.hpp>
-#include <ftl/cuda/fixed.hpp>
+#include <cudatl/fixed.hpp>
 
 using ftl::rgbd::Camera;
 using ftl::cuda::TextureObject;
@@ -66,7 +66,7 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters
 			screenPos = make_float3(30000,30000,0);
 
 		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
-		depth_out[y*pitch2+x] = float2fixed<10>(screenPos.z);
+		depth_out[y*pitch2+x] = cudatl::float2fixed<10>(screenPos.z);
 	}
 }
 
@@ -136,7 +136,7 @@ void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &de
 			screenPos = make_uint2(30000,30000);
 
 		screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y);
-		depth_out[y*pitch2+x] = float2fixed<10>(camPos.z);
+		depth_out[y*pitch2+x] = cudatl::float2fixed<10>(camPos.z);
 	}
 }
 
diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu
index ff20436a9..4c8770bdb 100644
--- a/components/renderers/cpp/src/triangle_render.cu
+++ b/components/renderers/cpp/src/triangle_render.cu
@@ -253,7 +253,7 @@ __global__ void reverse_check_kernel(
 		if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) {
 			float d2 = depth_original[oy*opitch4+ox];
 			if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*0.001f)) break;
-			d += 0.001f;
+			d += 0.002f;
 		} else break;
 	}
 
diff --git a/lib/cudatl/include/cudatl/fixed.hpp b/lib/cudatl/include/cudatl/fixed.hpp
new file mode 100644
index 000000000..3009f180c
--- /dev/null
+++ b/lib/cudatl/include/cudatl/fixed.hpp
@@ -0,0 +1,17 @@
+#ifndef _CUDATL_FIXED_HPP_
+#define _CUDATL_FIXED_HPP_
+
+namespace cudatl
+{
+
+template <int FRAC>
+__device__ inline float fixed2float(short v);
+
+template <int FRAC>
+__device__ inline short float2fixed(float v);
+
+}
+
+#include <cudatl/impl/fixed.hpp>
+
+#endif
diff --git a/lib/cudatl/include/cudatl/halfwarp.hpp b/lib/cudatl/include/cudatl/halfwarp.hpp
index 3085381ae..c615fe735 100644
--- a/lib/cudatl/include/cudatl/halfwarp.hpp
+++ b/lib/cudatl/include/cudatl/halfwarp.hpp
@@ -3,15 +3,18 @@
 
 #include <cuda_runtime.h>
 
-namespace cudatl {
+namespace cudatl
+{
 
 static constexpr int HALF_WARP_SIZE = 16;
 static constexpr unsigned int HALF_MASK1 = 0xFFFF0000;
 static constexpr unsigned int HALF_MASK2 = 0x0000FFFF;
 
 template <typename T>
-__device__ inline T halfWarpMin(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpMin(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = min(e, other);
 	}
@@ -19,8 +22,10 @@ __device__ inline T halfWarpMin(T e) {
 }
 
 template <typename T>
-__device__ inline T halfWarpMax(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpMax(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = max(e, other);
 	}
@@ -28,8 +33,10 @@ __device__ inline T halfWarpMax(T e) {
 }
 
 template <typename T>
-__device__ inline T halfWarpSum(T e) {
-	for (int i = WARP_SIZE/4; i > 0; i /= 2) {
+__device__ inline T halfWarpSum(T e)
+{
+	for (int i = WARP_SIZE/4; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e += other;
 	}
diff --git a/lib/cudatl/include/cudatl/host_utility.hpp b/lib/cudatl/include/cudatl/host_utility.hpp
index 39f066c35..16019260f 100644
--- a/lib/cudatl/include/cudatl/host_utility.hpp
+++ b/lib/cudatl/include/cudatl/host_utility.hpp
@@ -4,9 +4,11 @@
 #include <cuda_runtime.hpp>
 #include <string>
 
-namespace cudatl {
+namespace cudatl
+{
 
-inline safeCall(cudaError_t e) {
+inline safeCall(cudaError_t e)
+{
 	if (e != cudaSuccess) throw new std::exception(std::string("Cuda Error "+std::to_string(int(e))));
 }
 
diff --git a/lib/cudatl/include/cudatl/impl/fixed.hpp b/lib/cudatl/include/cudatl/impl/fixed.hpp
new file mode 100644
index 000000000..289eafbdf
--- /dev/null
+++ b/lib/cudatl/include/cudatl/impl/fixed.hpp
@@ -0,0 +1,11 @@
+template <int FRAC>
+__device__ inline float cudatl::fixed2float(short v)
+{
+    return float(v) / float(1 << FRAC);
+}
+
+template <int FRAC>
+__device__ inline short cudatl::float2fixed(float v)
+{
+    return short(v * float(1 << FRAC));
+}
\ No newline at end of file
diff --git a/lib/cudatl/include/cudatl/memory.hpp b/lib/cudatl/include/cudatl/memory.hpp
index 3b249dc04..a46402a07 100644
--- a/lib/cudatl/include/cudatl/memory.hpp
+++ b/lib/cudatl/include/cudatl/memory.hpp
@@ -3,25 +3,27 @@
 
 #include <cudatl/host_utility.hpp>
 
-namespace cudatl {
+namespace cudatl
+{
 
 template <typename T>
-T *allocate(size_t size) {
-#ifdef USE_GPU
+T *allocate(size_t size)
+{
 	T *ptr;
 	cudatl::safeCall(cudaMalloc(&ptr, size*sizeof(T)));
 	return ptr;
-#else
-	return new T[size];
-#endif
 }
 
 template <typename T>
-T *allocate(size_t width, size_t height, uint &pitch) {
-	if (width == 1 || height == 1) {
+T *allocate(size_t width, size_t height, uint &pitch)
+{
+	if (width == 1 || height == 1)
+	{
 		pitch = width;
 		return allocateMemory<T>((width > height) ? width : height);
-	} else {
+	}
+	else
+	{
 		T *ptr;
 		size_t ptmp;
 		cudatl::safeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height));
@@ -31,7 +33,8 @@ T *allocate(size_t width, size_t height, uint &pitch) {
 }
 
 template <typename T>
-void free(T *ptr) {
+void free(T *ptr)
+{
 	cudatl::safeCall(cudaFree(ptr));
 }
 
diff --git a/lib/cudatl/include/cudatl/warp.hpp b/lib/cudatl/include/cudatl/warp.hpp
index 1a41181e6..6a8dd898d 100644
--- a/lib/cudatl/include/cudatl/warp.hpp
+++ b/lib/cudatl/include/cudatl/warp.hpp
@@ -5,14 +5,17 @@
 
 #define __cuda__ __host__ __device__
 
-namespace cudatl {
+namespace cudatl
+{
 
 static constexpr int WARP_SIZE = 32;
 static constexpr unsigned int FULL_MASK = 0xFFFFFFFF;
 
 template <typename T>
-__device__ inline T warpMin(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpMin(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = min(e, other);
 	}
@@ -20,8 +23,10 @@ __device__ inline T warpMin(T e) {
 }
 
 template <typename T>
-__device__ inline T warpMax(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpMax(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e = max(e, other);
 	}
@@ -29,8 +34,10 @@ __device__ inline T warpMax(T e) {
 }
 
 template <typename T>
-__device__ inline T warpSum(T e) {
-	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+__device__ inline T warpSum(T e)
+{
+	for (int i = WARP_SIZE/2; i > 0; i /= 2)
+	{
 		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
 		e += other;
 	}
@@ -43,9 +50,11 @@ __device__ inline T warpSum(T e) {
  * TODO: This could be more efficient, perhaps with _shfl_XXX
  */
 template <typename T>
-inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) {
+inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold)
+{
 	const int thread = tix%32;
-	for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) {
+	for (uint offset = 1; offset < WARP_SIZE; offset <<= 1)
+	{
 		__syncwarp();
 		const uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
 		__syncwarp();
-- 
GitLab