diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt index 58d9312bd9ce8d5c1247b81e2a5525f7aa904002..82e2d43d84ddb6a33db19e168d2dff102cf70f0d 100644 --- a/applications/reconstruct/CMakeLists.txt +++ b/applications/reconstruct/CMakeLists.txt @@ -1,9 +1,13 @@ # Need to include staged files and libs -include_directories(${PROJECT_SOURCE_DIR}/applications/reconstruct/include) +#include_directories(${PROJECT_SOURCE_DIR}/reconstruct/include) #include_directories(${PROJECT_BINARY_DIR}) set(REPSRC src/main.cpp + src/scene_rep_hash_sdf.cu + src/ray_cast_sdf.cu + src/camera_util.cu + src/ray_cast_sdf.cpp src/registration.cpp ) @@ -11,6 +15,15 @@ add_executable(ftl-reconstruct ${REPSRC}) add_dependencies(ftl-reconstruct ftlnet) add_dependencies(ftl-reconstruct ftlcommon) +target_include_directories(ftl-reconstruct PUBLIC + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:include> + PRIVATE src) + +if (CUDA_FOUND) +set_property(TARGET ftl-reconstruct PROPERTY CUDA_SEPARABLE_COMPILATION ON) +endif() + #target_include_directories(cv-node PUBLIC ${PROJECT_SOURCE_DIR}/include) target_link_libraries(ftl-reconstruct ftlcommon ftlrgbd Threads::Threads ${OpenCV_LIBS} glog::glog ftlnet ftlrender) diff --git a/applications/reconstruct/include/ftl/cuda_matrix_util.hpp b/applications/reconstruct/include/ftl/cuda_matrix_util.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e8d803e6fdf7b7c62d52d456ddce288bf554b233 --- /dev/null +++ b/applications/reconstruct/include/ftl/cuda_matrix_util.hpp @@ -0,0 +1,1771 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/cuda_SimpleMatrixUtil.h + +#pragma once + +#ifndef _CUDA_SIMPLE_MATRIX_UTIL_ +#define _CUDA_SIMPLE_MATRIX_UTIL_ + +#ifndef __CUDACC__ +inline float __int_as_float(int a) { + union {int a; float b;} u; + u.a = a; + return u.b; +} +#endif + +#define MINF __int_as_float(0xff800000) +#define INF __int_as_float(0x7f800000) + +#include <iostream> +#include <ftl/cuda_util.hpp> + +////////////////////////////// +// float2x2 +////////////////////////////// + +class float2x2 +{ +public: + + inline __device__ __host__ float2x2() + { + } + + inline __device__ __host__ float2x2(const float values[4]) + { + m11 = values[0]; m12 = values[1]; + m21 = values[2]; m22 = values[3]; + } + + inline __device__ __host__ float2x2(const float2x2& other) + { + m11 = other.m11; m12 = other.m12; + m21 = other.m21; m22 = other.m22; + } + + inline __device__ __host__ void setZero() + { + m11 = 0.0f; m12 = 0.0f; + m21 = 0.0f; m22 = 0.0f; + } + + static inline __device__ __host__ float2x2 getIdentity() + { + float2x2 res; + res.setZero(); + res.m11 = res.m22 = 1.0f; + return res; + } + + inline __device__ __host__ float2x2& operator=(const float2x2 &other) + { + m11 = other.m11; m12 = other.m12; + m21 = other.m21; m22 = other.m22; + return *this; + } + + inline __device__ __host__ float2x2 getInverse() + { + float2x2 res; + res.m11 = m22; res.m12 = -m12; + res.m21 = -m21; res.m22 = m11; + + return res*(1.0f/det()); + } + + inline __device__ __host__ float det() + { + return m11*m22-m21*m12; + } + + inline __device__ __host__ float2 operator*(const float2& v) const + { + return make_float2(m11*v.x + m12*v.y, m21*v.x + m22*v.y); + } + + //! matrix scalar multiplication + inline __device__ __host__ float2x2 operator*(const float t) const + { + float2x2 res; + res.m11 = m11 * t; res.m12 = m12 * t; + res.m21 = m21 * t; res.m22 = m22 * t; + return res; + } + + //! 
matrix matrix multiplication + inline __device__ __host__ float2x2 operator*(const float2x2& other) const + { + float2x2 res; + res.m11 = m11 * other.m11 + m12 * other.m21; + res.m12 = m11 * other.m12 + m12 * other.m22; + res.m21 = m21 * other.m11 + m22 * other.m21; + res.m22 = m21 * other.m12 + m22 * other.m22; + return res; + } + + //! matrix matrix addition + inline __device__ __host__ float2x2 operator+(const float2x2& other) const + { + float2x2 res; + res.m11 = m11 + other.m11; + res.m12 = m12 + other.m12; + res.m21 = m21 + other.m21; + res.m22 = m22 + other.m22; + return res; + } + + inline __device__ __host__ float& operator()(int i, int j) + { + return entries2[i][j]; + } + + inline __device__ __host__ float operator()(int i, int j) const + { + return entries2[i][j]; + } + + inline __device__ __host__ const float* ptr() const { + return entries; + } + inline __device__ __host__ float* ptr() { + return entries; + } + + union + { + struct + { + float m11; float m12; + float m21; float m22; + }; + + float entries[4]; + float entries2[2][2]; + }; +}; + +////////////////////////////// +// float2x3 +////////////////////////////// + +class float2x3 +{ +public: + + inline __device__ __host__ float2x3() + { + } + + inline __device__ __host__ float2x3(const float values[6]) + { + m11 = values[0]; m12 = values[1]; m13 = values[2]; + m21 = values[3]; m22 = values[4]; m23 = values[5]; + } + + inline __device__ __host__ float2x3(const float2x3& other) + { + m11 = other.m11; m12 = other.m12; m13 = other.m13; + m21 = other.m21; m22 = other.m22; m23 = other.m23; + } + + inline __device__ __host__ float2x3& operator=(const float2x3 &other) + { + m11 = other.m11; m12 = other.m12; m13 = other.m13; + m21 = other.m21; m22 = other.m22; m23 = other.m23; + return *this; + } + + inline __device__ __host__ float2 operator*(const float3 &v) const + { + return make_float2(m11*v.x + m12*v.y + m13*v.z, m21*v.x + m22*v.y + m23*v.z); + } + + //! matrix scalar multiplication + inline __device__ __host__ float2x3 operator*(const float t) const + { + float2x3 res; + res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t; + res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t; + return res; + } + + //! 
matrix scalar division
+	inline __device__ __host__ float2x3 operator/(const float t) const
+	{
+		float2x3 res;
+		res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t;
+		res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t;
+		return res;
+	}
+
+	inline __device__ __host__ float& operator()(int i, int j)
+	{
+		return entries2[i][j];
+	}
+
+	inline __device__ __host__ float operator()(int i, int j) const
+	{
+		return entries2[i][j];
+	}
+
+	inline __device__ __host__ const float* ptr() const {
+		return entries;
+	}
+	inline __device__ __host__ float* ptr() {
+		return entries;
+	}
+
+	union
+	{
+		struct
+		{
+			float m11; float m12; float m13;
+			float m21; float m22; float m23;
+		};
+
+		float entries[6];
+		float entries2[2][3];
+	};
+};
+
+//////////////////////////////
+// float3x2
+//////////////////////////////
+
+class float3x2
+{
+public:
+
+	inline __device__ __host__ float3x2()
+	{
+	}
+
+	inline __device__ __host__ float3x2(const float values[6])
+	{
+		m11 = values[0]; m12 = values[1];
+		m21 = values[2]; m22 = values[3];
+		m31 = values[4]; m32 = values[5];
+	}
+
+	inline __device__ __host__ float3x2& operator=(const float3x2& other)
+	{
+		m11 = other.m11; m12 = other.m12;
+		m21 = other.m21; m22 = other.m22;
+		m31 = other.m31; m32 = other.m32;
+		return *this;
+	}
+
+	inline __device__ __host__ float3 operator*(const float2& v) const
+	{
+		return make_float3(m11*v.x + m12*v.y, m21*v.x + m22*v.y, m31*v.x + m32*v.y);
+	}
+
+	inline __device__ __host__ float3x2 operator*(const float t) const
+	{
+		float3x2 res;
+		res.m11 = m11 * t; res.m12 = m12 * t;
+		res.m21 = m21 * t; res.m22 = m22 * t;
+		res.m31 = m31 * t; res.m32 = m32 * t;
+		return res;
+	}
+
+	inline __device__ __host__ float& operator()(int i, int j)
+	{
+		return entries2[i][j];
+	}
+
+	inline __device__ __host__ float operator()(int i, int j) const
+	{
+		return entries2[i][j];
+	}
+
+	inline __device__ __host__ float2x3 getTranspose()
+	{
+		float2x3 res;
+		res.m11 = m11; res.m12 = m21; res.m13 = m31;
+		res.m21 = m12; res.m22 = m22; res.m23 = m32;
+		return res;
+	}
+
+	inline __device__ __host__ const float* ptr() const {
+		return entries;
+	}
+	inline __device__ __host__ float* ptr() {
+		return entries;
+	}
+
+	union
+	{
+		struct
+		{
+			float m11; float m12;
+			float m21; float m22;
+			float m31; float m32;
+		};
+
+		float entries[6];
+		float entries2[3][2];
+	};
+};
+
+inline __device__ __host__ float2x2 matMul(const float2x3& m0, const float3x2& m1)
+{
+	float2x2 res;
+	res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31;
+	res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32;
+	res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31;
+	res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32;
+	return res;
+}
+
+class float3x3 {
+public:
+	inline __device__ __host__ float3x3() {
+
+	}
+	inline __device__ __host__ float3x3(const float values[9]) {
+		m11 = values[0]; m12 = values[1]; m13 = values[2];
+		m21 = values[3]; m22 = values[4]; m23 = values[5];
+		m31 = values[6]; m32 = values[7]; m33 = values[8];
+	}
+
+	inline __device__ __host__ float3x3(const float3x3& other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33;
+	}
+
+	explicit inline __device__ __host__ float3x3(const float2x2& other) {
+		m11 = other.m11; m12 = other.m12; m13 = 0.0f;
+		m21 = other.m21; m22 = other.m22; m23 = 0.0f;
+		m31 = 0.0f; m32 = 0.0f; m33 = 0.0f;
+	}
+
+	inline __device__ __host__ float3x3& operator=(const float3x3 &other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33;
+		return *this;
+	}
+
+	inline __device__ __host__ float& operator()(int i, int j) {
+		return entries2[i][j];
+	}
+
+	inline __device__ __host__ float operator()(int i, int j) const {
+		return entries2[i][j];
+	}
+
+
+	static inline __device__ __host__ void swap(float& v0, float& v1) {
+		float tmp = v0;
+		v0 = v1;
+		v1 = tmp;
+	}
+
+	inline __device__ __host__ void transpose() {
+		swap(m12, m21);
+		swap(m13, m31);
+		swap(m23, m32);
+	}
+	inline __device__ __host__ float3x3 getTranspose() const {
+		float3x3 ret = *this;
+		ret.transpose();
+		return ret;
+	}
+
+	//! inverts the matrix
+	inline __device__ __host__ void invert() {
+		*this = getInverse();
+	}
+
+	//! computes the inverse of the matrix; the result is returned
+	inline __device__ __host__ float3x3 getInverse() const {
+		float3x3 res;
+		res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7];
+		res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7];
+		res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4];
+
+		res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6];
+		res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6];
+		res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3];
+
+		res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6];
+		res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6];
+		res.entries[8] = entries[0]*entries[4] - entries[1]*entries[3];
+		float nom = 1.0f/det();
+		return res * nom;
+	}
+
+	inline __device__ __host__ void setZero(float value = 0.0f) {
+		m11 = m12 = m13 = value;
+		m21 = m22 = m23 = value;
+		m31 = m32 = m33 = value;
+	}
+
+	inline __device__ __host__ float det() const {
+		return
+			+ m11*m22*m33
+			+ m12*m23*m31
+			+ m13*m21*m32
+			- m31*m22*m13
+			- m32*m23*m11
+			- m33*m21*m12;
+	}
+
+	inline __device__ __host__ float3 getRow(unsigned int i) {
+		return make_float3(entries[3*i+0], entries[3*i+1], entries[3*i+2]);
+	}
+
+	inline __device__ __host__ void setRow(unsigned int i, float3& r) {
+		entries[3*i+0] = r.x;
+		entries[3*i+1] = r.y;
+		entries[3*i+2] = r.z;
+	}
+
+	inline __device__ __host__ void normalizeRows()
+	{
+		//#pragma unroll 3
+		for(unsigned int i = 0; i<3; i++)
+		{
+			float3 r = getRow(i);
+			float l = length(r);
+			r.x /= l;
+			r.y /= l;
+			r.z /= l;
+			setRow(i, r);
+		}
+	}
+
+	//! computes the product of two matrices (result stored in this)
+	inline __device__ __host__ void mult(const float3x3 &other) {
+		float3x3 res;
+		res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
+		res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
+		res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33;
+
+		res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
+		res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
+		res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33;
+
+		res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
+		res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
+		res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33;
+		*this = res;
+	}
+
+	//! computes the sum of two matrices (result stored in this)
+	inline __device__ __host__ void add(const float3x3 &other) {
+		m11 += other.m11; m12 += other.m12; m13 += other.m13;
+		m21 += other.m21; m22 += other.m22; m23 += other.m23;
+		m31 += other.m31; m32 += other.m32; m33 += other.m33;
+	}
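+
+	//! Editor's sketch, not part of the upstream VoxelHashing header: a
+	//! hypothetical self-check that exercises operator*, getInverse() and
+	//! the float3 product declared below; safe to remove.
+	inline __device__ __host__ bool invertRoundTripSketch(float eps = 1e-4f) const {
+		float3x3 I = (*this) * getInverse();		// ~identity when *this is invertible
+		float3 v = I * make_float3(1.0f, 2.0f, 3.0f);	// ~(1,2,3)
+		return fabsf(v.x - 1.0f) < eps && fabsf(v.y - 2.0f) < eps && fabsf(v.z - 3.0f) < eps;
+	}
+
+	//! 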
standard matrix matrix multiplication + inline __device__ __host__ float3x3 operator*(const float3x3 &other) const { + float3x3 res; + res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31; + res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32; + res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33; + + res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31; + res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32; + res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33; + + res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31; + res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32; + res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33; + return res; + } + + //! standard matrix matrix multiplication + inline __device__ __host__ float3x2 operator*(const float3x2 &other) const { + float3x2 res; + res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31; + res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32; + + res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31; + res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32; + + res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31; + res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32; + return res; + } + + inline __device__ __host__ float3 operator*(const float3 &v) const { + return make_float3( + m11*v.x + m12*v.y + m13*v.z, + m21*v.x + m22*v.y + m23*v.z, + m31*v.x + m32*v.y + m33*v.z + ); + } + + inline __device__ __host__ float3x3 operator*(const float t) const { + float3x3 res; + res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t; + res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t; + res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t; + return res; + } + + + inline __device__ __host__ float3x3 operator+(const float3x3 &other) const { + float3x3 res; + res.m11 = m11 + other.m11; res.m12 = m12 + other.m12; res.m13 = m13 + other.m13; + res.m21 = m21 + other.m21; res.m22 = m22 + other.m22; res.m23 = m23 + other.m23; + res.m31 = m31 + other.m31; res.m32 = m32 + other.m32; res.m33 = m33 + other.m33; + return res; + } + + inline __device__ __host__ float3x3 operator-(const float3x3 &other) const { + float3x3 res; + res.m11 = m11 - other.m11; res.m12 = m12 - other.m12; res.m13 = m13 - other.m13; + res.m21 = m21 - other.m21; res.m22 = m22 - other.m22; res.m23 = m23 - other.m23; + res.m31 = m31 - other.m31; res.m32 = m32 - other.m32; res.m33 = m33 - other.m33; + return res; + } + + static inline __device__ __host__ float3x3 getIdentity() { + float3x3 res; + res.setZero(); + res.m11 = res.m22 = res.m33 = 1.0f; + return res; + } + + static inline __device__ __host__ float3x3 getZeroMatrix() { + float3x3 res; + res.setZero(); + return res; + } + + static inline __device__ __host__ float3x3 getDiagonalMatrix(float diag = 1.0f) { + float3x3 res; + res.m11 = diag; res.m12 = 0.0f; res.m13 = 0.0f; + res.m21 = 0.0f; res.m22 = diag; res.m23 = 0.0f; + res.m31 = 0.0f; res.m32 = 0.0f; res.m33 = diag; + return res; + } + + static inline __device__ __host__ float3x3 tensorProduct(const float3 &v, const float3 &vt) { + float3x3 res; + res.m11 = v.x * vt.x; res.m12 = v.x * vt.y; res.m13 = v.x * vt.z; + res.m21 = v.y * vt.x; res.m22 = v.y * vt.y; res.m23 = v.y * vt.z; + res.m31 = v.z * vt.x; res.m32 = v.z * vt.y; res.m33 = v.z * vt.z; + return res; + } + + inline __device__ __host__ const float* ptr() const { + return entries; + } + inline __device__ __host__ float* ptr() { + return entries; + } + + union { + struct { + float 
m11; float m12; float m13; + float m21; float m22; float m23; + float m31; float m32; float m33; + }; + float entries[9]; + float entries2[3][3]; + }; +}; + + +inline __device__ __host__ float2x3 matMul(const float2x3& m0, const float3x3& m1) +{ + float2x3 res; + res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31; + res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32; + res.m13 = m0.m11*m1.m13+m0.m12*m1.m23+m0.m13*m1.m33; + + res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31; + res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32; + res.m23 = m0.m21*m1.m13+m0.m22*m1.m23+m0.m23*m1.m33; + return res; +} + +// (1x2) row matrix as float2 +inline __device__ __host__ float3 matMul(const float2& m0, const float2x3& m1) +{ + float3 res; + res.x = m0.x*m1.m11+m0.y*m1.m21; + res.y = m0.x*m1.m12+m0.y*m1.m22; + res.z = m0.x*m1.m13+m0.y*m1.m23; + + return res; +} + + +class float3x4 { +public: + inline __device__ __host__ float3x4() { + + } + inline __device__ __host__ float3x4(const float values[12]) { + m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3]; + m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7]; + m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11]; + } + + inline __device__ __host__ float3x4(const float3x4& other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34; + } + + inline __device__ __host__ float3x4(const float3x3& other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f; + } + + inline __device__ __host__ float3x4 operator=(const float3x4 &other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34; + return *this; + } + + inline __device__ __host__ float3x4& operator=(const float3x3 &other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f; + return *this; + } + + //! assumes the last line of the matrix implicitly to be (0,0,0,1) + inline __device__ __host__ float4 operator*(const float4 &v) const { + return make_float4( + m11*v.x + m12*v.y + m13*v.z + m14*v.w, + m21*v.x + m22*v.y + m23*v.z + m24*v.w, + m31*v.x + m32*v.y + m33*v.z + m34*v.w, + v.w + ); + } + + //! assumes an implicit 1 in w component of the input vector + inline __device__ __host__ float3 operator*(const float3 &v) const { + return make_float3( + m11*v.x + m12*v.y + m13*v.z + m14, + m21*v.x + m22*v.y + m23*v.z + m24, + m31*v.x + m32*v.y + m33*v.z + m34 + ); + } + + //! matrix scalar multiplication + inline __device__ __host__ float3x4 operator*(const float t) const { + float3x4 res; + res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t; res.m14 = m14 * t; + res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t; res.m24 = m24 * t; + res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t; res.m34 = m34 * t; + return res; + } + inline __device__ __host__ float3x4& operator*=(const float t) { + *this = *this * t; + return *this; + } + + //! 
matrix scalar division + inline __device__ __host__ float3x4 operator/(const float t) const { + float3x4 res; + res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t; res.m14 = m14 / t; + res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t; res.m24 = m24 / t; + res.m31 = m31 / t; res.m32 = m32 / t; res.m33 = m33 / t; res.m34 = m34 / t; + return res; + } + inline __device__ __host__ float3x4& operator/=(const float t) { + *this = *this / t; + return *this; + } + + //! assumes the last line of the matrix implicitly to be (0,0,0,1) + inline __device__ __host__ float3x4 operator*(const float3x4 &other) const { + float3x4 res; + res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31; + res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32; + res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33; + res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14; + + res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31; + res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32; + res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33; + res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24; + + res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31; + res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32; + res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33; + res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34; + + //res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41; + //res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42; + //res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43; + //res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44; + + return res; + } + + //! assumes the last line of the matrix implicitly to be (0,0,0,1); and a (0,0,0) translation of other + inline __device__ __host__ float3x4 operator*(const float3x3 &other) const { + float3x4 res; + res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31; + res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32; + res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33; + res.m14 = m14; + + res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31; + res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32; + res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33; + res.m24 = m24; + + res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31; + res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32; + res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33; + res.m34 = m34; + + return res; + } + + + + inline __device__ __host__ float& operator()(int i, int j) { + return entries2[i][j]; + } + + inline __device__ __host__ float operator()(int i, int j) const { + return entries2[i][j]; + } + + //! returns the translation part of the matrix + inline __device__ __host__ float3 getTranslation() { + return make_float3(m14, m24, m34); + } + + //! sets only the translation part of the matrix (other values remain unchanged) + inline __device__ __host__ void setTranslation(const float3 &t) { + m14 = t.x; + m24 = t.y; + m34 = t.z; + } + + //! returns the 3x3 part of the matrix + inline __device__ __host__ float3x3 getFloat3x3() { + float3x3 ret; + ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; + ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; + ret.m31 = m31; ret.m32 = m32; ret.m33 = m33; + return ret; + } + + //! 
sets the 3x3 part of the matrix (other values remain unchanged)
+	inline __device__ __host__ void setFloat3x3(const float3x3 &other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33;
+	}
+
+	//! inverts the matrix
+	inline __device__ __host__ void inverse() {
+		*this = getInverse();
+	}
+
+	//! computes the inverse of the matrix
+	inline __device__ __host__ float3x4 getInverse() {
+		float3x3 A = getFloat3x3();
+		A.invert();
+		float3 t = getTranslation();
+		t = A*t;
+
+		float3x4 ret;
+		ret.setFloat3x3(A);
+		ret.setTranslation(make_float3(-t.x, -t.y, -t.z));	//float3 doesn't have unary '-'... thank you cuda
+		return ret;
+	}
+
+	//! prints the matrix; only host
+	__host__ void print() {
+		std::cout <<
+			m11 << " " << m12 << " " << m13 << " " << m14 << std::endl <<
+			m21 << " " << m22 << " " << m23 << " " << m24 << std::endl <<
+			m31 << " " << m32 << " " << m33 << " " << m34 << std::endl <<
+			std::endl;
+	}
+
+	inline __device__ __host__ const float* ptr() const {
+		return entries;
+	}
+	inline __device__ __host__ float* ptr() {
+		return entries;
+	}
+
+	union {
+		struct {
+			float m11; float m12; float m13; float m14;
+			float m21; float m22; float m23; float m24;
+			float m31; float m32; float m33; float m34;
+		};
+		float entries[12];
+		float entries2[3][4];
+	};
+};
+
+
+
+class float4x4 {
+public:
+	inline __device__ __host__ float4x4() {
+
+	}
+	inline __device__ __host__ float4x4(const float values[16]) {
+		m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3];
+		m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7];
+		m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11];
+		m41 = values[12]; m42 = values[13]; m43 = values[14]; m44 = values[15];
+	}
+
+	inline __device__ __host__ float4x4(const float4x4& other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
+		m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
+	}
+
+	//implicitly assumes last line to (0,0,0,1)
+	inline __device__ __host__ float4x4(const float3x4& other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
+		m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
+	}
+
+	inline __device__ __host__ float4x4(const float3x3& other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
+		m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
+	}
+
+	inline __device__ __host__ float4x4& operator=(const float4x4 &other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
+		m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
+		return *this;
+	}
+
+	inline __device__ __host__ float4x4& operator=(const float3x4 &other) {
+		m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
+		m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
+		m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
+		m41 = 0.0f; m42 = 0.0f; 
m43 = 0.0f; m44 = 1.0f; + return *this; + } + + inline __device__ __host__ float4x4& operator=(const float3x3 &other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f; + m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f; + return *this; + } + + + //! not tested + inline __device__ __host__ float4x4 operator*(const float4x4 &other) const { + float4x4 res; + res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31 + m14*other.m41; + res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32 + m14*other.m42; + res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33 + m14*other.m43; + res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14*other.m44; + + res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31 + m24*other.m41; + res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32 + m24*other.m42; + res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33 + m24*other.m43; + res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24*other.m44; + + res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31 + m34*other.m41; + res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32 + m34*other.m42; + res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33 + m34*other.m43; + res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34*other.m44; + + res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41; + res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42; + res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43; + res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44; + + return res; + } + + // untested + inline __device__ __host__ float4 operator*(const float4& v) const + { + return make_float4( + m11*v.x + m12*v.y + m13*v.z + m14*v.w, + m21*v.x + m22*v.y + m23*v.z + m24*v.w, + m31*v.x + m32*v.y + m33*v.z + m34*v.w, + m41*v.x + m42*v.y + m43*v.z + m44*v.w + ); + } + + // untested + //implicitly assumes w to be 1 + inline __device__ __host__ float3 operator*(const float3& v) const + { + return make_float3( + m11*v.x + m12*v.y + m13*v.z + m14*1.0f, + m21*v.x + m22*v.y + m23*v.z + m24*1.0f, + m31*v.x + m32*v.y + m33*v.z + m34*1.0f + ); + } + + inline __device__ __host__ float& operator()(int i, int j) { + return entries2[i][j]; + } + + inline __device__ __host__ float operator()(int i, int j) const { + return entries2[i][j]; + } + + + static inline __device__ __host__ void swap(float& v0, float& v1) { + float tmp = v0; + v0 = v1; + v1 = tmp; + } + + inline __device__ __host__ void transpose() { + swap(m12, m21); + swap(m13, m31); + swap(m23, m32); + swap(m41, m14); + swap(m42, m24); + swap(m43, m34); + } + inline __device__ __host__ float4x4 getTranspose() const { + float4x4 ret = *this; + ret.transpose(); + return ret; + } + + + inline __device__ __host__ void invert() { + *this = getInverse(); + } + + //! 
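(editor's note: the member below is an illustrative addition)
+	//! Editor's sketch, not part of the upstream VoxelHashing header: maps a
+	//! point through this transform and back through its inverse; it assumes
+	//! the matrix is invertible, and the helper name is hypothetical.
+	inline __device__ __host__ float3 roundTripPointSketch(const float3 &p) const {
+		return getInverse() * ((*this) * p);	// ~p for well-conditioned matrices
+	}
+
+	//! 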
return the inverse matrix; but does not change the current matrix + inline __device__ __host__ float4x4 getInverse() const { + float inv[16]; + + inv[0] = entries[5] * entries[10] * entries[15] - + entries[5] * entries[11] * entries[14] - + entries[9] * entries[6] * entries[15] + + entries[9] * entries[7] * entries[14] + + entries[13] * entries[6] * entries[11] - + entries[13] * entries[7] * entries[10]; + + inv[4] = -entries[4] * entries[10] * entries[15] + + entries[4] * entries[11] * entries[14] + + entries[8] * entries[6] * entries[15] - + entries[8] * entries[7] * entries[14] - + entries[12] * entries[6] * entries[11] + + entries[12] * entries[7] * entries[10]; + + inv[8] = entries[4] * entries[9] * entries[15] - + entries[4] * entries[11] * entries[13] - + entries[8] * entries[5] * entries[15] + + entries[8] * entries[7] * entries[13] + + entries[12] * entries[5] * entries[11] - + entries[12] * entries[7] * entries[9]; + + inv[12] = -entries[4] * entries[9] * entries[14] + + entries[4] * entries[10] * entries[13] + + entries[8] * entries[5] * entries[14] - + entries[8] * entries[6] * entries[13] - + entries[12] * entries[5] * entries[10] + + entries[12] * entries[6] * entries[9]; + + inv[1] = -entries[1] * entries[10] * entries[15] + + entries[1] * entries[11] * entries[14] + + entries[9] * entries[2] * entries[15] - + entries[9] * entries[3] * entries[14] - + entries[13] * entries[2] * entries[11] + + entries[13] * entries[3] * entries[10]; + + inv[5] = entries[0] * entries[10] * entries[15] - + entries[0] * entries[11] * entries[14] - + entries[8] * entries[2] * entries[15] + + entries[8] * entries[3] * entries[14] + + entries[12] * entries[2] * entries[11] - + entries[12] * entries[3] * entries[10]; + + inv[9] = -entries[0] * entries[9] * entries[15] + + entries[0] * entries[11] * entries[13] + + entries[8] * entries[1] * entries[15] - + entries[8] * entries[3] * entries[13] - + entries[12] * entries[1] * entries[11] + + entries[12] * entries[3] * entries[9]; + + inv[13] = entries[0] * entries[9] * entries[14] - + entries[0] * entries[10] * entries[13] - + entries[8] * entries[1] * entries[14] + + entries[8] * entries[2] * entries[13] + + entries[12] * entries[1] * entries[10] - + entries[12] * entries[2] * entries[9]; + + inv[2] = entries[1] * entries[6] * entries[15] - + entries[1] * entries[7] * entries[14] - + entries[5] * entries[2] * entries[15] + + entries[5] * entries[3] * entries[14] + + entries[13] * entries[2] * entries[7] - + entries[13] * entries[3] * entries[6]; + + inv[6] = -entries[0] * entries[6] * entries[15] + + entries[0] * entries[7] * entries[14] + + entries[4] * entries[2] * entries[15] - + entries[4] * entries[3] * entries[14] - + entries[12] * entries[2] * entries[7] + + entries[12] * entries[3] * entries[6]; + + inv[10] = entries[0] * entries[5] * entries[15] - + entries[0] * entries[7] * entries[13] - + entries[4] * entries[1] * entries[15] + + entries[4] * entries[3] * entries[13] + + entries[12] * entries[1] * entries[7] - + entries[12] * entries[3] * entries[5]; + + inv[14] = -entries[0] * entries[5] * entries[14] + + entries[0] * entries[6] * entries[13] + + entries[4] * entries[1] * entries[14] - + entries[4] * entries[2] * entries[13] - + entries[12] * entries[1] * entries[6] + + entries[12] * entries[2] * entries[5]; + + inv[3] = -entries[1] * entries[6] * entries[11] + + entries[1] * entries[7] * entries[10] + + entries[5] * entries[2] * entries[11] - + entries[5] * entries[3] * entries[10] - + entries[9] * entries[2] * entries[7] + + entries[9] 
* entries[3] * entries[6]; + + inv[7] = entries[0] * entries[6] * entries[11] - + entries[0] * entries[7] * entries[10] - + entries[4] * entries[2] * entries[11] + + entries[4] * entries[3] * entries[10] + + entries[8] * entries[2] * entries[7] - + entries[8] * entries[3] * entries[6]; + + inv[11] = -entries[0] * entries[5] * entries[11] + + entries[0] * entries[7] * entries[9] + + entries[4] * entries[1] * entries[11] - + entries[4] * entries[3] * entries[9] - + entries[8] * entries[1] * entries[7] + + entries[8] * entries[3] * entries[5]; + + inv[15] = entries[0] * entries[5] * entries[10] - + entries[0] * entries[6] * entries[9] - + entries[4] * entries[1] * entries[10] + + entries[4] * entries[2] * entries[9] + + entries[8] * entries[1] * entries[6] - + entries[8] * entries[2] * entries[5]; + + float matrixDet = entries[0] * inv[0] + entries[1] * inv[4] + entries[2] * inv[8] + entries[3] * inv[12]; + + float matrixDetr = 1.0f / matrixDet; + + float4x4 res; + for (unsigned int i = 0; i < 16; i++) { + res.entries[i] = inv[i] * matrixDetr; + } + return res; + + } + + + + + + //! returns the 3x3 part of the matrix + inline __device__ __host__ float3x3 getFloat3x3() { + float3x3 ret; + ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; + ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; + ret.m31 = m31; ret.m32 = m32; ret.m33 = m33; + return ret; + } + + //! sets the 3x3 part of the matrix (other values remain unchanged) + inline __device__ __host__ void setFloat3x3(const float3x3 &other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; + m21 = other.m21; m22 = other.m22; m23 = other.m23; + m31 = other.m31; m32 = other.m32; m33 = other.m33; + } + + //! sets the 4x4 part of the matrix to identity + inline __device__ __host__ void setIdentity() + { + m11 = 1.0f; m12 = 0.0f; m13 = 0.0f; m14 = 0.0f; + m21 = 0.0f; m22 = 1.0f; m23 = 0.0f; m24 = 0.0f; + m31 = 0.0f; m32 = 0.0f; m33 = 1.0f; m34 = 0.0f; + m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f; + } + + //! sets the 4x4 part of the matrix to identity + inline __device__ __host__ void setValue(float v) + { + m11 = v; m12 = v; m13 = v; m14 = v; + m21 = v; m22 = v; m23 = v; m24 = v; + m31 = v; m32 = v; m33 = v; m34 = v; + m41 = v; m42 = v; m43 = v; m44 = v; + } + + //! returns the 3x4 part of the matrix + inline __device__ __host__ float3x4 getFloat3x4() { + float3x4 ret; + ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; ret.m14 = m14; + ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; ret.m24 = m24; + ret.m31 = m31; ret.m32 = m32; ret.m33 = m33; ret.m34 = m34; + return ret; + } + + //! 
sets the 3x4 part of the matrix (other values remain unchanged) + inline __device__ __host__ void setFloat3x4(const float3x4 &other) { + m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14; + m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24; + m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34; + } + + + + + inline __device__ __host__ const float* ptr() const { + return entries; + } + inline __device__ __host__ float* ptr() { + return entries; + } + + union { + struct { + float m11; float m12; float m13; float m14; + float m21; float m22; float m23; float m24; + float m31; float m32; float m33; float m34; + float m41; float m42; float m43; float m44; + }; + float entries[16]; + float entries2[4][4]; + }; +}; + + + +////////////////////////////// +// matNxM +////////////////////////////// + +template<unsigned int N, unsigned int M> +class matNxM +{ + public: + + ////////////////////////////// + // Initialization + ////////////////////////////// + inline __device__ __host__ matNxM() + { + } + + inline __device__ __host__ matNxM(float* values) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i]; + } + + inline __device__ __host__ matNxM(const float* values) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i]; + } + + inline __device__ __host__ matNxM(const matNxM& other) + { + (*this) = other; + } + + inline __device__ __host__ matNxM<N,M>& operator=(const matNxM<N,M>& other) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] = other.entries[i]; + return *this; + } + + inline __device__ __host__ void setZero() + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] = 0.0f; + } + + inline __device__ __host__ void setIdentity() + { + setZero(); + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<min(N, M); i++) entries2D[i][i] = 1.0f; + } + + static inline __device__ __host__ matNxM<N, M> getIdentity() + { + matNxM<N, M> R; R.setIdentity(); + return R; + } + + ////////////////////////////// + // Conversion + ////////////////////////////// + + // declare generic constructors for compile time checking of matrix size + template<class B> + explicit inline __device__ __host__ matNxM(const B& other); + + template<class B> + explicit inline __device__ __host__ matNxM(const B& other0, const B& other1); + + // declare generic casts for compile time checking of matrix size + inline __device__ __host__ operator float(); + inline __device__ __host__ operator float2(); + inline __device__ __host__ operator float3(); + inline __device__ __host__ operator float4(); + + inline __device__ __host__ operator float2x2(); + inline __device__ __host__ operator float3x3(); + inline __device__ __host__ operator float4x4(); + + ////////////////////////////// + // Matrix - Matrix Multiplication + ////////////////////////////// + template<unsigned int NOther, unsigned int MOther> + inline __device__ __host__ matNxM<N,MOther> operator*(const matNxM<NOther,MOther>& other) const + { + cudaAssert(M == NOther); + matNxM<N,MOther> res; + + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<MOther; j++) + { + float sum = 0.0f; + __CONDITIONAL_UNROLL__ + for(unsigned int k = 0; k<M; k++) + { + sum += (*this)(i, k)*other(k, j); + } + + res(i, j) = sum; + } + } + + return res; + } + + ////////////////////////////// + // Matrix - Inversion + ////////////////////////////// + + inline __device__ 
__host__ float det() const; + inline __device__ __host__ matNxM<N, M> getInverse() const; + + ////////////////////////////// + // Matrix - Transpose + ////////////////////////////// + inline __device__ __host__ matNxM<M,N> getTranspose() const + { + matNxM<M,N> res; + + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<M; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<N; j++) + { + res(i, j) = (*this)(j, i); + } + } + + return res; + } + + inline __device__ void printCUDA() const + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<M; j++) + { + printf("%f ", (*this)(i, j)); + } + printf("\n"); + } + } + + inline __device__ bool checkMINF() const + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<M; j++) + { + if((*this)(i, j) == MINF) return true; + } + } + + return false; + } + + inline __device__ bool checkINF() const + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<M; j++) + { + if((*this)(i, j) == INF) return true; + } + } + + return false; + } + + inline __device__ bool checkQNAN() const + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<M; j++) + { + if((*this)(i, j) != (*this)(i, j)) return true; + } + } + + return false; + } + + ////////////////////////////// + // Matrix - Matrix Addition + ////////////////////////////// + inline __device__ __host__ matNxM<N,M> operator+(const matNxM<N,M>& other) const + { + matNxM<N,M> res = (*this); + res+=other; + return res; + } + + inline __device__ __host__ matNxM<N,M>& operator+=(const matNxM<N,M>& other) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] += other.entries[i]; + return (*this); + } + + ////////////////////////////// + // Matrix - Negation + ////////////////////////////// + inline __device__ __host__ matNxM<N,M> operator-() const + { + matNxM<N,M> res = (*this)*(-1.0f); + return res; + } + + ////////////////////////////// + // Matrix - Matrix Subtraction + ////////////////////////////// + inline __device__ __host__ matNxM<N,M> operator-(const matNxM<N,M>& other) const + { + matNxM<N,M> res = (*this); + res-=other; + return res; + } + + inline __device__ __host__ matNxM<N,M>& operator-=(const matNxM<N,M>& other) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] -= other.entries[i]; + return (*this); + } + + ////////////////////////////// + // Matrix - Scalar Multiplication + ////////////////////////////// + inline __device__ __host__ matNxM<N,M> operator*(const float t) const + { + matNxM<N,M> res = (*this); + res*=t; + return res; + } + + inline __device__ __host__ matNxM<N, M>& operator*=(const float t) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] *= t; + return (*this); + } + + ////////////////////////////// + // Matrix - Scalar Division + ////////////////////////////// + inline __device__ __host__ matNxM<N, M> operator/(const float t) const + { + matNxM<N, M> res = (*this); + res/=t; + return res; + } + + inline __device__ __host__ matNxM<N, M>& operator/=(const float t) + { + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<N*M; i++) entries[i] /= t; + return (*this); + } + + ////////////////////////// + // Element Access + ////////////////////////// + inline __device__ __host__ unsigned int nRows() + { + return N; + } + + inline 
__device__ __host__ unsigned int nCols() + { + return M; + } + + inline __device__ __host__ float& operator()(unsigned int i, unsigned int j) + { + cudaAssert(i<N && j<M); + return entries2D[i][j]; + } + + inline __device__ __host__ float operator()(unsigned int i, unsigned int j) const + { + cudaAssert(i<N && j<M); + return entries2D[i][j]; + } + + inline __device__ __host__ float& operator()(unsigned int i) + { + cudaAssert(i<N*M); + return entries[i]; + } + + inline __device__ __host__ float operator()(unsigned int i) const + { + cudaAssert(i<N*M); + return entries[i]; + } + + template<unsigned int NOther, unsigned int MOther> + inline __device__ __host__ void getBlock(unsigned int xStart, unsigned int yStart, matNxM<NOther, MOther>& res) const + { + cudaAssert(xStart+NOther <= N && yStart+MOther <= M); + + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<NOther; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<MOther; j++) + { + res(i, j) = (*this)(xStart+i, yStart+j); + } + } + } + + template<unsigned int NOther, unsigned int MOther> + inline __device__ __host__ void setBlock(matNxM<NOther, MOther>& input, unsigned int xStart, unsigned int yStart) + { + cudaAssert(xStart+NOther <= N && yStart+MOther <= M); + + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<NOther; i++) + { + __CONDITIONAL_UNROLL__ + for(unsigned int j = 0; j<MOther; j++) + { + (*this)(xStart+i, yStart+j) = input(i, j); + } + } + } + + inline __device__ __host__ const float* ptr() const { + return entries; + } + inline __device__ __host__ float* ptr() { + return entries; + } + + // Operators + + inline __device__ __host__ float norm1DSquared() const + { + cudaAssert(M==1 || N==1); + + float sum = 0.0f; + for(unsigned int i = 0; i<max(N, M); i++) sum += entries[i]*entries[i]; + + return sum; + } + + inline __device__ __host__ float norm1D() const + { + return sqrt(norm1DSquared()); + } + + private: + + union + { + float entries[N*M]; + float entries2D[N][M]; + }; +}; + +////////////////////////////// +// Scalar - Matrix Multiplication +////////////////////////////// +template<unsigned int N, unsigned int M> +inline __device__ __host__ matNxM<N,M> operator*(const float t, const matNxM<N, M>& mat) +{ + matNxM<N,M> res = mat; + res*=t; + return res; +} + +////////////////////////////// +// Matrix Inversion +////////////////////////////// + +template<> +inline __device__ __host__ float matNxM<3, 3>::det() const +{ + const float& m11 = entries2D[0][0]; + const float& m12 = entries2D[0][1]; + const float& m13 = entries2D[0][2]; + + const float& m21 = entries2D[1][0]; + const float& m22 = entries2D[1][1]; + const float& m23 = entries2D[1][2]; + + const float& m31 = entries2D[2][0]; + const float& m32 = entries2D[2][1]; + const float& m33 = entries2D[2][2]; + + return m11*m22*m33 + m12*m23*m31 + m13*m21*m32 - m31*m22*m13 - m32*m23*m11 - m33*m21*m12; +} + +template<> +inline __device__ __host__ matNxM<3, 3> matNxM<3, 3>::getInverse() const +{ + matNxM<3, 3> res; + res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7]; + res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7]; + res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4]; + + res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6]; + res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6]; + res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3]; + + res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6]; + res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6]; + res.entries[8] = 
entries[0]*entries[4] - entries[1]*entries[3]; + return res*(1.0f/det()); +} + +template<> +inline __device__ __host__ float matNxM<2, 2>::det() const +{ + return (*this)(0, 0)*(*this)(1, 1)-(*this)(1, 0)*(*this)(0, 1); +} + +template<> +inline __device__ __host__ matNxM<2, 2> matNxM<2, 2>::getInverse() const +{ + matNxM<2, 2> res; + res(0, 0) = (*this)(1, 1); res(0, 1) = -(*this)(0, 1); + res(1, 0) = -(*this)(1, 0); res(1, 1) = (*this)(0, 0); + + return res*(1.0f/det()); +} + +////////////////////////////// +// Conversion +////////////////////////////// + +// To Matrix from floatNxN +template<> +template<> +inline __device__ __host__ matNxM<1, 1>::matNxM(const float& other) +{ + entries[0] = other; +} + +// To Matrix from floatNxN +template<> +template<> +inline __device__ __host__ matNxM<2, 2>::matNxM(const float2x2& other) +{ + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<4; i++) entries[i] = other.entries[i]; +} + +template<> +template<> +inline __device__ __host__ matNxM<3, 3>::matNxM(const float3x3& other) +{ + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<9; i++) entries[i] = other.entries[i]; +} + +template<> +template<> +inline __device__ __host__ matNxM<4, 4>::matNxM(const float4x4& other) +{ + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<16; i++) entries[i] = other.entries[i]; +} + +template<> +template<> +inline __device__ __host__ matNxM<3, 2>::matNxM(const float3& col0, const float3& col1) +{ + entries2D[0][0] = col0.x; entries2D[0][1] = col1.x; + entries2D[1][0] = col0.y; entries2D[1][1] = col1.y; + entries2D[2][0] = col0.z; entries2D[2][1] = col1.z; +} + +// To floatNxM from Matrix +template<> +inline __device__ __host__ matNxM<4, 4>::operator float4x4() +{ + float4x4 res; + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<16; i++) res.entries[i] = entries[i]; + return res; +} + +template<> +inline __device__ __host__ matNxM<3, 3>::operator float3x3() +{ + float3x3 res; + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<9; i++) res.entries[i] = entries[i]; + return res; +} + +template<> +inline __device__ __host__ matNxM<2, 2>::operator float2x2() +{ + float2x2 res; + __CONDITIONAL_UNROLL__ + for(unsigned int i = 0; i<4; i++) res.entries[i] = entries[i]; + return res; +} + +// To Matrix from floatN +template<> +template<> +inline __device__ __host__ matNxM<2, 1>::matNxM(const float2& other) +{ + entries[0] = other.x; + entries[1] = other.y; +} + +template<> +template<> +inline __device__ __host__ matNxM<3, 1>::matNxM(const float3& other) +{ + entries[0] = other.x; + entries[1] = other.y; + entries[2] = other.z; +} + +template<> +template<> +inline __device__ __host__ matNxM<4, 1>::matNxM(const float4& other) +{ + entries[0] = other.x; + entries[1] = other.y; + entries[2] = other.z; + entries[3] = other.w; +} + +// To floatN from Matrix +template<> +inline __device__ __host__ matNxM<1, 1>::operator float() +{ + return entries[0]; +} + +template<> +inline __device__ __host__ matNxM<2, 1>::operator float2() +{ + return make_float2(entries[0], entries[1]); +} + +template<> +inline __device__ __host__ matNxM<3, 1>::operator float3() +{ + return make_float3(entries[0], entries[1], entries[2]); +} + +template<> +inline __device__ __host__ matNxM<4, 1>::operator float4() +{ + return make_float4(entries[0], entries[1], entries[2], entries[3]); +} + +////////////////////////////// +// Typedefs +////////////////////////////// + +typedef matNxM<9, 3> mat9x3; +typedef matNxM<3, 9> mat3x9; + +typedef matNxM<9, 1> mat9x1; +typedef matNxM<1, 9> mat1x9; + +typedef 
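matNxM<6, 6> mat6x6;
+
+// Editor's sketch, not part of the upstream header: a minimal use of the
+// matNxM template, evaluating the quadratic form x^T A^-1 x for a float3;
+// the function name is hypothetical and the function is safe to remove.
+inline __device__ __host__ float quadraticFormSketch(const float3 &v)
+{
+	matNxM<3, 1> x(v);			// column vector built from a float3
+	matNxM<3, 3> A; A.setIdentity();	// stand-in for an invertible 3x3 matrix
+	matNxM<1, 1> r = x.getTranspose() * (A.getInverse() * x);
+	return (float)r;			// a 1x1 matrix converts to float
+}
+
+typedef 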
matNxM<6, 6> mat6x6; + +typedef matNxM<6, 1> mat6x1; +typedef matNxM<1, 6> mat1x6; + +typedef matNxM<3, 6> mat3x6; +typedef matNxM<6, 3> mat6x3; + +typedef matNxM<4, 4> mat4x4; + +typedef matNxM<4, 1> mat4x1; +typedef matNxM<1, 4> mat1x4; + +typedef matNxM<3, 3> mat3x3; + +typedef matNxM<2, 3> mat2x3; +typedef matNxM<3, 2> mat3x2; + +typedef matNxM<2, 2> mat2x2; + +typedef matNxM<1, 2> mat1x2; +typedef matNxM<2, 1> mat2x1; + +typedef matNxM<1, 3> mat1x3; +typedef matNxM<3, 1> mat3x1; + +typedef matNxM<1, 1> mat1x1; + +typedef matNxM<4, 4> mat4f; + + +#endif \ No newline at end of file diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp new file mode 100644 index 0000000000000000000000000000000000000000..eeb6f26c239cea0ab3673b8d6a8795aa09d92f06 --- /dev/null +++ b/applications/reconstruct/include/ftl/cuda_operators.hpp @@ -0,0 +1,770 @@ +/* + * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual property and + * proprietary rights in and to this software and related documentation and + * any modifications thereto. Any use, reproduction, disclosure, or distribution + * of this software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + * + */ + +/* + This file implements common mathematical operations on vector types + (float3, float4 etc.) since these are not provided as standard by CUDA. + + The syntax is modelled on the Cg standard library. +*/ + +#ifndef _FTL_CUDA_OPERATORS_HPP_ +#define _FTL_CUDA_OPERATORS_HPP_ + +#include <cuda_runtime.h> + + +//////////////////////////////////////////////////////////////////////////////// +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef __CUDACC__ +#include <math.h> + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +// float functions +//////////////////////////////////////////////////////////////////////////////// + +// lerp +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} + +inline __device__ __host__ int sign(float x) { + int t = x<0 ? -1 : 0; + return x > 0 ? 
1 : t;
+}
+
+// int2 functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ int2 make_int2(float2 f)
+{
+	int2 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); return t;
+}
+
+inline __host__ __device__ uint2 make_uint2(int2 i)
+{
+	uint2 t; t.x = static_cast<uint>(i.x); t.y = static_cast<uint>(i.y); return t;
+}
+
+// negate
+inline __host__ __device__ int2 operator-(int2 &a)
+{
+	return make_int2(-a.x, -a.y);
+}
+
+// addition
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+	return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+	a.x += b.x; a.y += b.y;
+}
+
+// subtract
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+	return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+	a.x -= b.x; a.y -= b.y;
+}
+
+// multiply
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+	return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ int2 operator*(int2 a, int s)
+{
+	return make_int2(a.x * s, a.y * s);
+}
+inline __host__ __device__ int2 operator*(int s, int2 a)
+{
+	return make_int2(a.x * s, a.y * s);
+}
+inline __host__ __device__ void operator*=(int2 &a, int s)
+{
+	a.x *= s; a.y *= s;
+}
+
+// float2 functions
+////////////////////////////////////////////////////////////////////////////////
+
+// negate
+inline __host__ __device__ float2 operator-(float2 &a)
+{
+	return make_float2(-a.x, -a.y);
+}
+
+// addition
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+	return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+	a.x += b.x; a.y += b.y;
+}
+
+// subtract
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+	return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+	a.x -= b.x; a.y -= b.y;
+}
+
+// multiply
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+	return make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ float2 operator*(float2 a, float s)
+{
+	return make_float2(a.x * s, a.y * s);
+}
+inline __host__ __device__ float2 operator*(float s, float2 a)
+{
+	return make_float2(a.x * s, a.y * s);
+}
+inline __host__ __device__ void operator*=(float2 &a, float s)
+{
+	a.x *= s; a.y *= s;
+}
+
+// divide
+inline __host__ __device__ float2 operator/(float2 a, float2 b)
+{
+	return make_float2(a.x / b.x, a.y / b.y);
+}
+inline __host__ __device__ float2 operator/(float2 a, float s)
+{
+	float inv = 1.0f / s;
+	return a * inv;
+}
+inline __host__ __device__ float2 operator/(float s, float2 a)
+{
+	return make_float2(s / a.x, s / a.y);
+}
+inline __host__ __device__ void operator/=(float2 &a, float s)
+{
+	float inv = 1.0f / s;
+	a *= inv;
+}
+
+// lerp
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
+{
+	return a + t*(b-a);
+}
+
+// clamp
+inline __device__ __host__ float2 clamp(float2 v, float a, float b)
+{
+	return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
+{
+	return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+
+// dot product
+inline __host__ __device__ float dot(float2 a, float2 b)
+{
+	return a.x * b.x + a.y * b.y;
+}
+
+// length
+inline __host__ __device__ float length(float2 v)
+{
+	return sqrtf(dot(v, v));
+}
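+
+// Editor's sketch, not part of the original NVIDIA header: projects v onto
+// dir using only the float2 helpers defined above; the function name is
+// hypothetical and the function is safe to remove.
+inline __host__ __device__ float2 projectOntoSketch(float2 v, float2 dir)
+{
+	float2 n = dir / length(dir);	// unit direction (vector / scalar divide)
+	return n * dot(v, n);		// component of v along dir
+}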
+
+// normalize
+inline __host__ __device__ float2 normalize(float2 v)
+{
+	float invLen = rsqrtf(dot(v, v));
+	return v * invLen;
+}
+
+// floor
+inline __host__ __device__ float2 floor(const float2 v)
+{
+	return make_float2(floor(v.x), floor(v.y));
+}
+
+// reflect
+inline __host__ __device__ float2 reflect(float2 i, float2 n)
+{
+	return i - 2.0f * n * dot(n,i);
+}
+
+// absolute value
+inline __host__ __device__ float2 fabs(float2 v)
+{
+	return make_float2(fabs(v.x), fabs(v.y));
+}
+
+inline __device__ __host__ int2 sign(float2 f) {
+	return make_int2(sign(f.x), sign(f.y));
+}
+
+// float3 functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 make_float3(int3 i)
+{
+	float3 t; t.x = static_cast<float>(i.x); t.y = static_cast<float>(i.y); t.z = static_cast<float>(i.z); return t;
+}
+
+inline __host__ __device__ float3 make_float3(float4 f)
+{
+	return make_float3(f.x,f.y,f.z);
+}
+
+inline __host__ __device__ float3 make_float3(uchar3 c)
+{
+	return make_float3(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z));
+}
+
+inline __host__ __device__ uchar3 make_uchar3(float3 f)
+{
+	return make_uchar3(static_cast<unsigned char>(f.x), static_cast<unsigned char>(f.y), static_cast<unsigned char>(f.z));
+}
+
+inline __host__ __device__ float3 make_float3(float f)
+{
+	return make_float3(f,f,f);
+}
+
+// negate
+inline __host__ __device__ float3 operator-(const float3 &a)
+{
+	return make_float3(-a.x, -a.y, -a.z);
+}
+
+// min
+static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
+{
+	return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
+}
+
+// max
+static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
+{
+	return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
+}
+
+// addition
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+	return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+	a.x += b.x; a.y += b.y; a.z += b.z;
+}
+
+// subtract
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+	return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+	a.x -= b.x; a.y -= b.y; a.z -= b.z;
+}
+
+// multiply
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+	return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ float3 operator*(float3 a, float s)
+{
+	return make_float3(a.x * s, a.y * s, a.z * s);
+}
+inline __host__ __device__ float3 operator*(float s, float3 a)
+{
+	return make_float3(a.x * s, a.y * s, a.z * s);
+}
+inline __host__ __device__ void operator*=(float3 &a, float s)
+{
+	a.x *= s; a.y *= s; a.z *= s;
+}
+
+// divide
+inline __host__ __device__ float3 operator/(float3 a, float3 b)
+{
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+inline __host__ __device__ float3 operator/(float3 a, float s)
+{
+	float inv = 1.0f / s;
+	return a * inv;
+}
+inline __host__ __device__ float3 operator/(float s, float3 a)
+{
+	return make_float3(s / a.x, s / a.y, s / a.z);
+}
+inline __host__ __device__ void operator/=(float3 &a, float s)
+{
+	float inv = 1.0f / s;
+	a *= inv;
+}
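+
+// Editor's sketch, not part of the original NVIDIA header: clamps a point
+// into an axis-aligned box using the float3 fminf/fmaxf helpers above; the
+// function name is hypothetical and the function is safe to remove.
+inline __host__ __device__ float3 clampPointToBoxSketch(float3 p, float3 boxMin, float3 boxMax)
+{
+	return fminf(fmaxf(p, boxMin), boxMax);	// componentwise clamp
+}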
+// lerp +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + +// dot product +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +// cross product +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +// length +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} + +// normalize +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +// floor +inline __host__ __device__ float3 floor(const float3 v) +{ + return make_float3(floor(v.x), floor(v.y), floor(v.z)); +} + +// reflect +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +// absolute value +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} + +inline __device__ __host__ int3 sign(float3 f) { + return make_int3(sign(f.x), sign(f.y), sign(f.z)); +} + +// float4 functions +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float4 make_float4(float a) +{ + return make_float4(a,a,a,a); +} + +inline __host__ __device__ float4 make_float4(float3 f, float a) +{ + return make_float4(f.x,f.y,f.z,a); +} + +// negate +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} + +// min +static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +// max +static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +// addition +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; +} + +// subtract +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; +} + +// multiply +inline __host__ __device__ float4 operator*(float4 a, float s) +{ + return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); +} +inline __host__ __device__ float4 operator*(float s, float4 a) +{ + return make_float4(a.x * s, a.y * s, a.z * s, a.w * s); +} +inline __host__ __device__ void operator*=(float4 &a, float s) +{ + a.x *= s; a.y *= s; a.z *= s; a.w *= s; +} + +// divide +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ float4 operator/(float4 a, float s) +{ + float inv = 1.0f / s; + return a * inv; +} +inline __host__ __device__ float4 operator/(float s, float4 a) +{ + return make_float4(s / a.x, s / a.y, s / a.z, s / a.w); +} +inline __host__ __device__ void operator/=(float4 &a, float s) +{ + float inv = 1.0f / s; + a *= inv; +} + +// lerp +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +// clamp +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} + +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +// dot product +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +// length +inline __host__ __device__ float length(float4 r) +{ + return sqrtf(dot(r, r)); +} + +// normalize +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +// floor +inline __host__ __device__ float4 floor(const float4 v) +{ + return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w)); +} + +// absolute value +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +// int3 functions +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ int3 make_int3(float3 f) +{ + int3 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); t.z = static_cast<int>(f.z); return t; +} + +inline __host__ __device__ int3 make_int3(uint3 i) +{ + int3 t; t.x = static_cast<int>(i.x); t.y = static_cast<int>(i.y); t.z = static_cast<int>(i.z); return t; +} + +inline __host__ __device__ int3 make_int3(int i) +{ + int3 t; t.x = i; t.y = i; t.z = i; return t; +} + +// negate +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} + +// min +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} + +// max +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} + +// addition +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} + +// subtract +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} + +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +// multiply +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ int3 operator*(int3 a, int s) +{ + return make_int3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ int3 operator*(int s, int3 a) +{ + return make_int3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ void operator*=(int3 &a, int s) +{ + a.x *= s; a.y *= s; a.z *= s; +} + +// divide +inline __host__ __device__ int3 operator/(int3 a, int3 b) +{ + return make_int3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ int3 operator/(int3 a, int s) +{ + return make_int3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ int3 operator/(int s, int3 a) +{ + return make_int3(s / a.x, s / a.y, s / a.z); +} +inline __host__ __device__ void operator/=(int3 &a, int s) +{ + a.x /= s; a.y /= s; a.z /= s; +}
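+// Note: with the scalar on the left, operator/ divides component-wise as s / v, e.g. +// 2.0f / make_float2(1.0f, 4.0f) == make_float2(2.0f, 0.5f) and 8 / make_int3(2, 4, 8) == make_int3(4, 2, 1).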
+// clamp +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + + +// uint3 functions +//////////////////////////////////////////////////////////////////////////////// + +// min +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} + +// max +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} + +// addition +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; a.y += b.y; a.z += b.z; +} + +// subtract +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} + +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; a.y -= b.y; a.z -= b.z; +} + +// multiply +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ uint3 operator*(uint3 a, uint s) +{ + return make_uint3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ uint3 operator*(uint s, uint3 a) +{ + return make_uint3(a.x * s, a.y * s, a.z * s); +} +inline __host__ __device__ void operator*=(uint3 &a, uint s) +{ + a.x *= s; a.y *= s; a.z *= s; +} + +// divide +inline __host__ __device__ uint3 operator/(uint3 a, uint3 b) +{ + return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ uint3 operator/(uint3 a, uint s) +{ + return make_uint3(a.x / s, a.y / s, a.z / s); +} +inline __host__ __device__ uint3 operator/(uint s, uint3 a) +{ + return make_uint3(s / a.x, s / a.y, s / a.z); +} +inline __host__ __device__ void operator/=(uint3 &a, uint s) +{ + a.x /= s; a.y /= s; a.z /= s; +} + +// clamp +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + +#endif // _FTL_CUDA_OPERATORS_HPP_ diff --git a/applications/reconstruct/include/ftl/cuda_util.hpp b/applications/reconstruct/include/ftl/cuda_util.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bf018f07919fe52c71a1104a6bf029ce52f17b8e --- /dev/null +++ b/applications/reconstruct/include/ftl/cuda_util.hpp @@ -0,0 +1,22 @@ +#pragma once + +#ifndef _CUDA_UTIL_ +#define _CUDA_UTIL_ + +#undef max +#undef min + +#include <cuda_runtime.h> +#include <ftl/cuda_operators.hpp> + +// Enable run time assertion checking in kernel code +#define cudaAssert(condition) if (!(condition)) { printf("ASSERT: %s %s\n", #condition, __FILE__); } +//#define cudaAssert(condition) + +#if defined(__CUDA_ARCH__) +#define __CONDITIONAL_UNROLL__ _Pragma("unroll") +#else +#define __CONDITIONAL_UNROLL__ +#endif + +#endif diff --git a/applications/reconstruct/include/ftl/depth_camera.hpp b/applications/reconstruct/include/ftl/depth_camera.hpp new file mode 100644 index
0000000000000000000000000000000000000000..e521af34fae2c5cf93aba8ebe3d5bd726d14d8c1 --- /dev/null +++ b/applications/reconstruct/include/ftl/depth_camera.hpp @@ -0,0 +1,193 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/DepthCameraUtil.h + +#pragma once + +//#include <cutil_inline.h> +//#include <cutil_math.h> +#include <vector_types.h> +#include <cuda_runtime.h> + +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/cuda_common.hpp> + +#include <ftl/depth_camera_params.hpp> + + +extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params); +extern __constant__ DepthCameraParams c_depthCameraParams; + + +struct DepthCameraData { + + /////////////// + // Host part // + /////////////// + + __device__ __host__ + DepthCameraData() { + /*d_depthData = NULL; + d_colorData = NULL; + d_depthArray = NULL; + d_colorArray = NULL;*/ + + depth_mat_ = nullptr; + colour_mat_ = nullptr; + depth_tex_ = nullptr; + colour_tex_ = nullptr; + } + + __host__ + void alloc(const DepthCameraParams& params) { //! todo resizing??? + /*cudaSafeCall(cudaMalloc(&d_depthData, sizeof(float) * params.m_imageWidth * params.m_imageHeight)); + cudaSafeCall(cudaMalloc(&d_colorData, sizeof(float4) * params.m_imageWidth * params.m_imageHeight)); + + h_depthChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaSafeCall(cudaMallocArray(&d_depthArray, &h_depthChannelDesc, params.m_imageWidth, params.m_imageHeight)); + h_colorChannelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); + cudaSafeCall(cudaMallocArray(&d_colorArray, &h_colorChannelDesc, params.m_imageWidth, params.m_imageHeight));*/ + + std::cout << "Create texture objects: " << params.m_imageWidth << "," << params.m_imageHeight << std::endl; + depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1); + colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4); + depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_); + colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_); + depth_obj_ = depth_tex_->cudaTexture(); + colour_obj_ = colour_tex_->cudaTexture(); + } + + __host__ + void updateParams(const DepthCameraParams& params) { + updateConstantDepthCameraParams(params); + } + + __host__ + void updateData(const cv::Mat &depth, const cv::Mat &rgb) { + depth_mat_->upload(depth); + colour_mat_->upload(rgb); + } + + __host__ + void free() { + /*if (d_depthData) cudaSafeCall(cudaFree(d_depthData)); + if (d_colorData) cudaSafeCall(cudaFree(d_colorData)); + if (d_depthArray) cudaSafeCall(cudaFreeArray(d_depthArray)); + if (d_colorArray) cudaSafeCall(cudaFreeArray(d_colorArray));*/ + + /*d_depthData = NULL; + d_colorData = NULL; + d_depthArray = NULL; + d_colorArray = NULL;*/ + + if (depth_mat_) delete depth_mat_; + if (colour_mat_) delete colour_mat_; + delete depth_tex_; + delete colour_tex_; + } + + + ///////////////// + // Device part // + ///////////////// + + static inline const DepthCameraParams& params() { + return c_depthCameraParams; + } + + /////////////////////////////////////////////////////////////// + // Camera to Screen + /////////////////////////////////////////////////////////////// + + __device__ + static inline float2 cameraToKinectScreenFloat(const float3& pos) { + //return make_float2(pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx, c_depthCameraParams.my - pos.y*c_depthCameraParams.fy/pos.z); + return 
make_float2( + pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx, + pos.y*c_depthCameraParams.fy/pos.z + c_depthCameraParams.my); + } + + __device__ + static inline int2 cameraToKinectScreenInt(const float3& pos) { + float2 pImage = cameraToKinectScreenFloat(pos); + return make_int2(pImage + make_float2(0.5f, 0.5f)); + } + + __device__ + static inline uint2 cameraToKinectScreen(const float3& pos) { + int2 p = cameraToKinectScreenInt(pos); + return make_uint2(p.x, p.y); + } + + __device__ + static inline float cameraToKinectProjZ(float z) { + return (z - c_depthCameraParams.m_sensorDepthWorldMin)/(c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin); + } + + __device__ + static inline float3 cameraToKinectProj(const float3& pos) { + float2 proj = cameraToKinectScreenFloat(pos); + + float3 pImage = make_float3(proj.x, proj.y, pos.z); + + pImage.x = (2.0f*pImage.x - (c_depthCameraParams.m_imageWidth- 1.0f))/(c_depthCameraParams.m_imageWidth- 1.0f); + //pImage.y = (2.0f*pImage.y - (c_depthCameraParams.m_imageHeight-1.0f))/(c_depthCameraParams.m_imageHeight-1.0f); + pImage.y = ((c_depthCameraParams.m_imageHeight-1.0f) - 2.0f*pImage.y)/(c_depthCameraParams.m_imageHeight-1.0f); + pImage.z = cameraToKinectProjZ(pImage.z); + + return pImage; + } + + /////////////////////////////////////////////////////////////// + // Screen to Camera (depth in meters) + /////////////////////////////////////////////////////////////// + + __device__ + static inline float3 kinectDepthToSkeleton(uint ux, uint uy, float depth) { + const float x = ((float)ux-c_depthCameraParams.mx) / c_depthCameraParams.fx; + const float y = ((float)uy-c_depthCameraParams.my) / c_depthCameraParams.fy; + //const float y = (c_depthCameraParams.my-(float)uy) / c_depthCameraParams.fy; + return make_float3(depth*x, depth*y, depth); + } + + /////////////////////////////////////////////////////////////// + // RenderScreen to Camera -- ATTENTION ASSUMES [0,1]-Z range!!!!
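+ // (cameraToKinectProjZ above maps camera-space z in [m_sensorDepthWorldMin, m_sensorDepthWorldMax] to [0,1]; + // kinectProjToCameraZ below is its inverse: z = zProj*(max - min) + min.)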
+ /////////////////////////////////////////////////////////////// + + __device__ + static inline float kinectProjToCameraZ(float z) { + return z * (c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin) + c_depthCameraParams.m_sensorDepthWorldMin; + } + + // z has to be in [0, 1] + __device__ + static inline float3 kinectProjToCamera(uint ux, uint uy, float z) { + float fSkeletonZ = kinectProjToCameraZ(z); + return kinectDepthToSkeleton(ux, uy, fSkeletonZ); + } + + __device__ + static inline bool isInCameraFrustumApprox(const float4x4& viewMatrixInverse, const float3& pos) { + float3 pCamera = viewMatrixInverse * pos; + float3 pProj = cameraToKinectProj(pCamera); + //pProj *= 1.5f; //TODO THIS IS A HACK FIX IT :) + pProj *= 0.95; + return !(pProj.x < -1.0f || pProj.x > 1.0f || pProj.y < -1.0f || pProj.y > 1.0f || pProj.z < 0.0f || pProj.z > 1.0f); + } + + //float* d_depthData; //depth data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor + //float4* d_colorData; + + //uchar4* d_colorData; //color data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor + + cv::cuda::GpuMat *depth_mat_; + cv::cuda::GpuMat *colour_mat_; + ftl::cuda::TextureObject<float> *depth_tex_; + ftl::cuda::TextureObject<uchar4> *colour_tex_; + cudaTextureObject_t depth_obj_; + cudaTextureObject_t colour_obj_; + + // cuda arrays for texture access + /*cudaArray* d_depthArray; + cudaArray* d_colorArray; + cudaChannelFormatDesc h_depthChannelDesc; + cudaChannelFormatDesc h_colorChannelDesc;*/ +}; diff --git a/applications/reconstruct/include/ftl/depth_camera_params.hpp b/applications/reconstruct/include/ftl/depth_camera_params.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f425ccf136730b1df3c73b60c14f9698b15eb904 --- /dev/null +++ b/applications/reconstruct/include/ftl/depth_camera_params.hpp @@ -0,0 +1,23 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDADepthCameraParams.h + +//#include <cutil_inline.h> +//#include <cutil_math.h> +#include <vector_types.h> +#include <cuda_runtime.h> + +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/camera_params.hpp> + +struct __align__(16) DepthCameraParams { + float fx; + float fy; + float mx; + float my; + + unsigned short m_imageWidth; + unsigned short m_imageHeight; + unsigned int flags; + + float m_sensorDepthWorldMin; + float m_sensorDepthWorldMax; +}; diff --git a/applications/reconstruct/include/ftl/matrix_conversion.hpp b/applications/reconstruct/include/ftl/matrix_conversion.hpp new file mode 100644 index 0000000000000000000000000000000000000000..48e27de718aa0fd538a192a239c1fdc0831f008d --- /dev/null +++ b/applications/reconstruct/include/ftl/matrix_conversion.hpp @@ -0,0 +1,164 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/MatrixConversion.h + +#pragma once + +#include <eigen3/Eigen/Eigen> +// #include "d3dx9math.h" +#include <ftl/cuda_matrix_util.hpp> + +namespace MatrixConversion +{ + //static D3DXMATRIX EigToMat(const Eigen::Matrix4f& mat) + //{ + // D3DXMATRIX m(mat.data()); + // D3DXMATRIX res; D3DXMatrixTranspose(&res, &m); + + // return res; + //} + //static Eigen::Matrix4f MatToEig(const D3DXMATRIX& mat) + //{ + // return Eigen::Matrix4f((float*)mat.m).transpose(); + //} + + /*static mat4f EigToMat(const Eigen::Matrix4f& mat) + { + return mat4f(mat.data()).getTranspose(); + } + + static Eigen::Matrix4f MatToEig(const mat4f& mat) + { + return 
Eigen::Matrix4f(mat.ptr()).transpose(); + }*/ + + static Eigen::Vector4f VecH(const Eigen::Vector3f& v) + { + return Eigen::Vector4f(v[0], v[1], v[2], 1.0); + } + + static Eigen::Vector3f VecDH(const Eigen::Vector4f& v) + { + return Eigen::Vector3f(v[0]/v[3], v[1]/v[3], v[2]/v[3]); + } + + /*static Eigen::Vector3f VecToEig(const vec3f& v) + { + return Eigen::Vector3f(v[0], v[1], v[2]); + } + + static vec3f EigToVec(const Eigen::Vector3f& v) + { + return vec3f(v[0], v[1], v[2]); + } + + static vec3f EigToVec(const D3DXMATRIX v) + { + return vec3f(v[0], v[1], v[2]); + }*/ + + + + // dx/cuda conversion + /*static vec3f toMlib(const D3DXVECTOR3& v) { + return vec3f(v.x, v.y, v.z); + } + static vec4f toMlib(const D3DXVECTOR4& v) { + return vec4f(v.x, v.y, v.z, v.w); + } + static mat4f toMlib(const D3DXMATRIX& m) { + mat4f c((const float*)&m); + return c.getTranspose(); + } + static D3DXVECTOR3 toDX(const vec3f& v) { + return D3DXVECTOR3(v.x, v.y, v.z); + } + static D3DXVECTOR4 toDX(const vec4f& v) { + return D3DXVECTOR4(v.x, v.y, v.z, v.w); + } + static D3DXMATRIX toDX(const mat4f& m) { + D3DXMATRIX c((const float*)m.ptr()); + D3DXMatrixTranspose(&c, &c); + return c; + }*/ + + /*static mat4f toMlib(const float4x4& m) { + return mat4f(m.ptr()); + } + static vec4f toMlib(const float4& v) { + return vec4f(v.x, v.y, v.z, v.w); + } + static vec3f toMlib(const float3& v) { + return vec3f(v.x, v.y, v.z); + } + static vec4i toMlib(const int4& v) { + return vec4i(v.x, v.y, v.z, v.w); + } + static vec3i toMlib(const int3& v) { + return vec3i(v.x, v.y, v.z); + } + static float4x4 toCUDA(const mat4f& m) { + return float4x4(m.ptr()); + }*/ + static float4x4 toCUDA(const Eigen::Matrix4f& mat) { + return float4x4(mat.data()).getTranspose(); + } + + static Eigen::Matrix4f toEigen(const float4x4& m) { + return Eigen::Matrix4f(m.ptr()).transpose(); + } + + /*static float4 toCUDA(const vec4f& v) { + return make_float4(v.x, v.y, v.z, v.w); + } + static float3 toCUDA(const vec3f& v) { + return make_float3(v.x, v.y, v.z); + } + static int4 toCUDA(const vec4i& v) { + return make_int4(v.x, v.y, v.z, v.w); + } + static int3 toCUDA(const vec3i& v) { + return make_int3(v.x, v.y, v.z); + }*/ + + + //doesnt really work + //template<class FloatType> + //point3d<FloatType> eulerAngles(const Matrix3x3<FloatType>& r) { + // point3d<FloatType> res; + + // //check for gimbal lock + // if (r(0,2) == (FloatType)-1.0) { + // FloatType x = 0; //gimbal lock, value of x doesn't matter + // FloatType y = (FloatType)M_PI / 2; + // FloatType z = x + atan2(r(1,0), r(2,0)); + // res = point3d<FloatType>(x, y, z); + // } else if (r(0,2) == (FloatType)1.0) { + // FloatType x = 0; + // FloatType y = -(FloatType)M_PI / 2; + // FloatType z = -x + atan2(-r(1,0), -r(2,0)); + // res = point3d<FloatType>(x, y, z); + // } else { //two solutions exist + // FloatType x1 = -asin(r(0,2)); + // FloatType x2 = (FloatType)M_PI - x1; + + // FloatType y1 = atan2(r(1,2) / cos(x1), r(2,2) / cos(x1)); + // FloatType y2 = atan2(r(1,2) / cos(x2), r(2,2) / cos(x2)); + + // FloatType z2 = atan2(r(0,1) / cos(x2), r(0,0) / cos(x2)); + // FloatType z1 = atan2(r(0,1) / cos(x1), r(0,0) / cos(x1)); + + // //choose one solution to return + // //for example the "shortest" rotation + // if ((std::abs(x1) + std::abs(y1) + std::abs(z1)) <= (std::abs(x2) + std::abs(y2) + std::abs(z2))) { + // res = point3d<FloatType>(x1, y1, z1); + // } else { + // res = point3d<FloatType>(x2, y2, z2); + // } + // } + + // res.x = math::radiansToDegrees(-res.x); + // res.y = 
math::radiansToDegrees(-res.y); + // res.z = math::radiansToDegrees(-res.z); + + // return res; + //} +} diff --git a/applications/reconstruct/include/ftl/ray_cast_params.hpp b/applications/reconstruct/include/ftl/ray_cast_params.hpp new file mode 100644 index 0000000000000000000000000000000000000000..788c1155e86433a01ce05f78d704f8b9df91419f --- /dev/null +++ b/applications/reconstruct/include/ftl/ray_cast_params.hpp @@ -0,0 +1,26 @@ +#include <ftl/cuda_util.hpp> + +#include <ftl/cuda_matrix_util.hpp> + +struct __align__(16) RayCastParams { + float4x4 m_viewMatrix; + float4x4 m_viewMatrixInverse; + float4x4 m_intrinsics; + float4x4 m_intrinsicsInverse; + + unsigned int m_width; + unsigned int m_height; + + unsigned int m_numOccupiedSDFBlocks; + unsigned int m_maxNumVertices; + int m_splatMinimum; + + float m_minDepth; + float m_maxDepth; + float m_rayIncrement; + float m_thresSampleDist; + float m_thresDist; + bool m_useGradients; + + uint dummy0; +}; diff --git a/applications/reconstruct/include/ftl/ray_cast_sdf.hpp b/applications/reconstruct/include/ftl/ray_cast_sdf.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5ccb268d45cbbbde4cf753579ff081d627beb83b --- /dev/null +++ b/applications/reconstruct/include/ftl/ray_cast_sdf.hpp @@ -0,0 +1,67 @@ +#pragma once + +#include <ftl/configurable.hpp> +#include <ftl/matrix_conversion.hpp> +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/depth_camera.hpp> +#include <ftl/ray_cast_util.hpp> +#include <nlohmann/json.hpp> + +// #include "DX11RayIntervalSplatting.h" + +class CUDARayCastSDF : public ftl::Configurable +{ +public: + CUDARayCastSDF(nlohmann::json& config) : ftl::Configurable(config) { + create(parametersFromConfig(config)); + hash_render_ = config.value("hash_renderer", false); + } + + ~CUDARayCastSDF(void) { + destroy(); + } + + static RayCastParams parametersFromConfig(const nlohmann::json& gas) { + RayCastParams params; + params.m_width = gas["adapterWidth"].get<unsigned int>(); + params.m_height = gas["adapterHeight"].get<unsigned int>(); + params.m_intrinsics = MatrixConversion::toCUDA(Eigen::Matrix4f()); + params.m_intrinsicsInverse = MatrixConversion::toCUDA(Eigen::Matrix4f()); + params.m_minDepth = gas["sensorDepthMin"].get<float>(); + params.m_maxDepth = gas["sensorDepthMax"].get<float>(); + params.m_rayIncrement = gas["SDFRayIncrementFactor"].get<float>() * gas["SDFTruncation"].get<float>(); + params.m_thresSampleDist = gas["SDFRayThresSampleDistFactor"].get<float>() * params.m_rayIncrement; + params.m_thresDist = gas["SDFRayThresDistFactor"].get<float>() * params.m_rayIncrement; + params.m_useGradients = gas["SDFUseGradients"].get<bool>(); + + params.m_maxNumVertices = gas["hashNumSDFBlocks"].get<unsigned int>() * 6; + + return params; + } + + void render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform); + + const RayCastData& getRayCastData(void) { + return m_data; + } + const RayCastParams& getRayCastParams() const { + return m_params; + } + + + // debugging + void convertToCameraSpace(const DepthCameraData& cameraData); + +private: + + void create(const RayCastParams& params); + void destroy(void); + + void rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform); // rasterize + + RayCastParams m_params; + RayCastData m_data; + bool hash_render_; + + // 
DX11RayIntervalSplatting m_rayIntervalSplatting; +}; diff --git a/applications/reconstruct/include/ftl/ray_cast_util.hpp b/applications/reconstruct/include/ftl/ray_cast_util.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e35275279d81a36222c7f4178bb08b17153a9468 --- /dev/null +++ b/applications/reconstruct/include/ftl/ray_cast_util.hpp @@ -0,0 +1,284 @@ +#pragma once + +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/depth_camera.hpp> +#include <ftl/voxel_hash.hpp> + +#include <ftl/ray_cast_params.hpp> + +struct RayCastSample +{ + float sdf; + float alpha; + uint weight; + //uint3 color; +}; + +#ifndef MINF +#define MINF asfloat(0xff800000) +#endif + +extern __constant__ RayCastParams c_rayCastParams; +extern "C" void updateConstantRayCastParams(const RayCastParams& params); + + +struct RayCastData { + + /////////////// + // Host part // + /////////////// + + __device__ __host__ + RayCastData() { + d_depth = NULL; + d_depth3 = NULL; + d_normals = NULL; + d_colors = NULL; + + //d_vertexBuffer = NULL; + point_cloud_ = nullptr; + + d_rayIntervalSplatMinArray = NULL; + d_rayIntervalSplatMaxArray = NULL; + } + +#ifndef __CUDACC__ + __host__ + void allocate(const RayCastParams& params) { + //point_cloud_ = new cv::cuda::GpuMat(params.m_width*params.m_height, 1, CV_32FC3); + cudaSafeCall(cudaMalloc(&d_depth, sizeof(float) * params.m_width * params.m_height)); + cudaSafeCall(cudaMalloc(&d_depth3, sizeof(float3) * params.m_width * params.m_height)); + cudaSafeCall(cudaMalloc(&d_normals, sizeof(float4) * params.m_width * params.m_height)); + cudaSafeCall(cudaMalloc(&d_colors, sizeof(uchar3) * params.m_width * params.m_height)); + //cudaSafeCall(cudaMalloc(&point_cloud_, sizeof(float3) * params.m_width * params.m_height)); + //printf("Allocate ray cast data: %lld \n", (unsigned long long)point_cloud_); + } + + __host__ + void updateParams(const RayCastParams& params) { + updateConstantRayCastParams(params); + } + + __host__ void download(float3 *points, uchar3 *colours, const RayCastParams& params) const { + //printf("Download: %d,%d\n", params.m_width, params.m_height); + //cudaSafeCall(cudaMemcpy(points, d_depth3, sizeof(float3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(colours, d_colors, sizeof(uchar3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost)); + } + + __host__ + void free() { + //cudaFree(point_cloud_); + cudaFree(d_depth); + cudaFree(d_depth3); + cudaFree(d_normals); + cudaFree(d_colors); + } +#endif + + ///////////////// + // Device part // + ///////////////// +#ifdef __CUDACC__ + + __device__ + const RayCastParams& params() const { + return c_rayCastParams; + } + + __device__ + float frac(float val) const { + return (val - floorf(val)); + } + __device__ + float3 frac(const float3& val) const { + return make_float3(frac(val.x), frac(val.y), frac(val.z)); + } + + __device__ + bool trilinearInterpolationSimpleFastFast(const ftl::voxhash::HashData& hash, const float3& pos, float& dist, uchar3& color) const { + const float oSet = c_hashParams.m_virtualVoxelSize; + const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f); + float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos)); + + dist = 0.0f; + float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f); + ftl::voxhash::Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; float3 vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; 
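+		// (trilinear interpolation over the eight corners of the voxel cell: each corner contributes its sdf and colour scaled by its trilinear coefficient, and a corner with zero weight invalidates the whole sample)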
colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*vColor; + v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*vColor; + v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*vColor; + v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *vColor; + v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*vColor; + v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *vColor; + v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *vColor; + v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y * weight.z *v.sdf; colorFloat+= weight.x * weight.y * weight.z *vColor; + + color = make_uchar3(colorFloat.x, colorFloat.y, colorFloat.z);//v.color; + + return true; + } + //__device__ + //bool trilinearInterpolationSimpleFastFast(const HashData& hash, const float3& pos, float& dist, uchar3& color) const { + // const float oSet = c_hashParams.m_virtualVoxelSize; + // const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f); + // float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos)); + + // dist = 0.0f; + // Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; + // v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; + // v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf; + // v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf; + // v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf; + // v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf; + // v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf; + // v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; dist+= weight.x * weight.y * weight.z *v.sdf; + + // color 
= v.color; + + // return true; + //} + + + __device__ + float findIntersectionLinear(float tNear, float tFar, float dNear, float dFar) const + { + return tNear+(dNear/(dNear-dFar))*(tFar-tNear); + } + + static const unsigned int nIterationsBisection = 3; + + // d0 near, d1 far + __device__ + bool findIntersectionBisection(const ftl::voxhash::HashData& hash, const float3& worldCamPos, const float3& worldDir, float d0, float r0, float d1, float r1, float& alpha, uchar3& color) const + { + float a = r0; float aDist = d0; + float b = r1; float bDist = d1; + float c = 0.0f; + +#pragma unroll 1 + for(uint i = 0; i<nIterationsBisection; i++) + { + c = findIntersectionLinear(a, b, aDist, bDist); + + float cDist; + if(!trilinearInterpolationSimpleFastFast(hash, worldCamPos+c*worldDir, cDist, color)) return false; + + if(aDist*cDist > 0.0) { a = c; aDist = cDist; } + else { b = c; bDist = cDist; } + } + + alpha = c; + + return true; + } + + + __device__ + float3 gradientForPoint(const ftl::voxhash::HashData& hash, const float3& pos) const + { + const float voxelSize = c_hashParams.m_virtualVoxelSize; + float3 offset = make_float3(voxelSize, voxelSize, voxelSize); + + float distp00; uchar3 colorp00; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.5f*offset.x, 0.0f, 0.0f), distp00, colorp00); + float dist0p0; uchar3 color0p0; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.5f*offset.y, 0.0f), dist0p0, color0p0); + float dist00p; uchar3 color00p; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.0f, 0.5f*offset.z), dist00p, color00p); + + float dist100; uchar3 color100; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.5f*offset.x, 0.0f, 0.0f), dist100, color100); + float dist010; uchar3 color010; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.5f*offset.y, 0.0f), dist010, color010); + float dist001; uchar3 color001; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.0f, 0.5f*offset.z), dist001, color001); + + float3 grad = make_float3((distp00-dist100)/offset.x, (dist0p0-dist010)/offset.y, (dist00p-dist001)/offset.z); + + float l = length(grad); + if(l == 0.0f) { + return make_float3(0.0f, 0.0f, 0.0f); + } + + return -grad/l; + } + + __device__ + void traverseCoarseGridSimpleSampleAll(const ftl::voxhash::HashData& hash, const DepthCameraData& cameraData, const float3& worldCamPos, const float3& worldDir, const float3& camDir, const int3& dTid, float minInterval, float maxInterval) const + { + const RayCastParams& rayCastParams = c_rayCastParams; + + // Last Sample + RayCastSample lastSample; lastSample.sdf = 0.0f; lastSample.alpha = 0.0f; lastSample.weight = 0; // lastSample.color = int3(0, 0, 0); + const float depthToRayLength = 1.0f/camDir.z; // scale factor to convert from depth to ray length + + float rayCurrent = depthToRayLength * max(rayCastParams.m_minDepth, minInterval); // Convert depth to raylength + float rayEnd = depthToRayLength * min(rayCastParams.m_maxDepth, maxInterval); // Convert depth to raylength + //float rayCurrent = depthToRayLength * rayCastParams.m_minDepth; // Convert depth to raylength + //float rayEnd = depthToRayLength * rayCastParams.m_maxDepth; // Convert depth to raylength + + //printf("MINMAX: %f,%f\n", minInterval, maxInterval); + +#pragma unroll 1 + while(rayCurrent < rayEnd) + { + float3 currentPosWorld = worldCamPos+rayCurrent*worldDir; + float dist; uchar3 color; + + //printf("RAY LOOP: %f,%f,%f\n", currentPosWorld.x, currentPosWorld.y, currentPosWorld.z); + + 
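+			// A positive-to-negative sdf transition between the previous and current sample brackets the surface; findIntersectionBisection below refines the crossing point along the ray.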
if(trilinearInterpolationSimpleFastFast(hash, currentPosWorld, dist, color)) + { + if(lastSample.weight > 0 && lastSample.sdf > 0.0f && dist < 0.0f) // current sample is always valid here + { + + float alpha; // = findIntersectionLinear(lastSample.alpha, rayCurrent, lastSample.sdf, dist); + uchar3 color2; + bool b = findIntersectionBisection(hash, worldCamPos, worldDir, lastSample.sdf, lastSample.alpha, dist, rayCurrent, alpha, color2); + + float3 currentIso = worldCamPos+alpha*worldDir; + if(b && abs(lastSample.sdf - dist) < rayCastParams.m_thresSampleDist) + { + if(abs(dist) < rayCastParams.m_thresDist) + { + float depth = alpha / depthToRayLength; // Convert ray length to depth depthToRayLength + + d_depth[dTid.y*rayCastParams.m_width+dTid.x] = depth; + d_depth3[dTid.y*rayCastParams.m_width+dTid.x] = cameraData.kinectDepthToSkeleton(dTid.x, dTid.y, depth); + d_colors[dTid.y*rayCastParams.m_width+dTid.x] = make_uchar3(color2.x, color2.y, color2.z); + + if(rayCastParams.m_useGradients) + { + float3 normal = -gradientForPoint(hash, currentIso); + float4 n = rayCastParams.m_viewMatrix * make_float4(normal, 0.0f); + d_normals[dTid.y*rayCastParams.m_width+dTid.x] = make_float4(n.x, n.y, n.z, 1.0f); + } + + return; + } + } + } + + lastSample.sdf = dist; + lastSample.alpha = rayCurrent; + // lastSample.color = color; + lastSample.weight = 1; + rayCurrent += rayCastParams.m_rayIncrement; + } else { + lastSample.weight = 0; + rayCurrent += rayCastParams.m_rayIncrement; + } + + + } + + } + +#endif // __CUDACC__ + + union { + float* d_depth; + int* d_depth_i; + }; + float3* d_depth3; + float4* d_normals; + uchar3* d_colors; + + //float4* d_vertexBuffer; // ray interval splatting triangles, mapped from directx (memory lives there) + float3 *point_cloud_; + + cudaArray* d_rayIntervalSplatMinArray; + cudaArray* d_rayIntervalSplatMaxArray; +}; \ No newline at end of file diff --git a/applications/reconstruct/include/ftl/scene_rep_hash_sdf.hpp b/applications/reconstruct/include/ftl/scene_rep_hash_sdf.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a28567931b55cf2059687f924ec6efdb08e00451 --- /dev/null +++ b/applications/reconstruct/include/ftl/scene_rep_hash_sdf.hpp @@ -0,0 +1,382 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDASceneRepHashSDF.h + +#pragma once + +#include <cuda_runtime.h> + +#include <ftl/configurable.hpp> +#include <ftl/matrix_conversion.hpp> +#include <ftl/voxel_hash.hpp> +#include <ftl/depth_camera.hpp> +#include <unordered_set> +//#include "CUDAScan.h" +// #include "CUDATimer.h" + +// #include "GlobalAppState.h" +// #include "TimingLog.h" + +#define SAFE_DELETE_ARRAY(a) { delete [] (a); (a) = NULL; } + +extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); +extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); +extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask); +extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData); +extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); +extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const 
ftl::voxhash::HashParams& hashParams); +extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams); +extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData); + +extern "C" void starveVoxelsKernelCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); +extern "C" void garbageCollectIdentifyCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); +extern "C" void garbageCollectFreeCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); + +namespace ftl { +namespace voxhash { + +class SceneRep : public ftl::Configurable { + public: + SceneRep(nlohmann::json &config) : Configurable(config) { + REQUIRED({ + {"hashNumBuckets", "Desired hash entries divide bucket size", "number"}, + {"hashMaxCollisionLinkedListSize", "", "number"}, + {"hashNumSDFBlocks", "", "number"}, + {"SDFVoxelSize", "Size in meters of one voxel", "number"}, + {"SDFMaxIntegrationDistance", "", "number"}, + {"SDFTruncation", "Base error size", "number"}, + {"SDFTruncationScale", "Error size scale with depth", "number"}, + {"SDFIntegrationWeightSample", "", "number"}, + {"SDFIntegrationWeightMax", "", "number"} + }); + create(parametersFromConfig(config)); + } + ~SceneRep() { + destroy(); + } + + static HashParams parametersFromConfig(nlohmann::json &config) { + HashParams params; + // First camera view is set to identity pose to be at the centre of + // the virtual coordinate space. + params.m_rigidTransform.setIdentity(); + params.m_rigidTransformInverse.setIdentity(); + params.m_hashNumBuckets = config["hashNumBuckets"]; + params.m_hashBucketSize = HASH_BUCKET_SIZE; + params.m_hashMaxCollisionLinkedListSize = config["hashMaxCollisionLinkedListSize"]; + params.m_SDFBlockSize = SDF_BLOCK_SIZE; + params.m_numSDFBlocks = config["hashNumSDFBlocks"]; + params.m_virtualVoxelSize = config["SDFVoxelSize"]; + params.m_maxIntegrationDistance = config["SDFMaxIntegrationDistance"]; + params.m_truncation = config["SDFTruncation"]; + params.m_truncScale = config["SDFTruncationScale"]; + params.m_integrationWeightSample = config["SDFIntegrationWeightSample"]; + params.m_integrationWeightMax = config["SDFIntegrationWeightMax"]; + // Note (Nick): We are not streaming voxels in/out of GPU + //params.m_streamingVoxelExtents = MatrixConversion::toCUDA(gas.s_streamingVoxelExtents); + //params.m_streamingGridDimensions = MatrixConversion::toCUDA(gas.s_streamingGridDimensions); + //params.m_streamingMinGridPos = MatrixConversion::toCUDA(gas.s_streamingMinGridPos); + //params.m_streamingInitialChunkListSize = gas.s_streamingInitialChunkListSize; + return params; + } + + void bindDepthCameraTextures(const DepthCameraData& depthCameraData) { + //bindInputDepthColorTextures(depthCameraData); + } + + /** + * Note: lastRigidTransform appears to be the estimated camera pose. 
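+	 * Note: integrate() fuses one depth frame: it allocates the hash blocks touched by the depth map, compactifies the occupied entries, integrates depth and colour into the SDF blocks, then garbage collects empty blocks.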
+ * Note: bitMask can be nullptr if not streaming out voxels from GPU + */ + void integrate(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, unsigned int* d_bitMask) { + + setLastRigidTransform(lastRigidTransform); + + //make the rigid transform available on the GPU + m_hashData.updateParams(m_hashParams); + + //allocate all hash blocks which are corresponding to depth map entries + alloc(depthCameraData, depthCameraParams, d_bitMask); + + //generate a linear hash array with only occupied entries + compactifyHashEntries(depthCameraData); + + //volumetrically integrate the depth data into the depth SDFBlocks + integrateDepthMap(depthCameraData, depthCameraParams); + + garbageCollect(depthCameraData); + + m_numIntegratedFrames++; + } + + void setLastRigidTransform(const Eigen::Matrix4f& lastRigidTransform) { + m_hashParams.m_rigidTransform = MatrixConversion::toCUDA(lastRigidTransform); + m_hashParams.m_rigidTransformInverse = m_hashParams.m_rigidTransform.getInverse(); + } + + void setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) { + setLastRigidTransform(lastRigidTransform); + compactifyHashEntries(depthCameraData); + } + + + const Eigen::Matrix4f getLastRigidTransform() const { + return MatrixConversion::toEigen(m_hashParams.m_rigidTransform); + } + + /* Nick: To reduce weights between frames */ + void nextFrame() { + starveVoxelsKernelCUDA(m_hashData, m_hashParams); + m_numIntegratedFrames = 0; + } + + //! resets the hash to the initial state (i.e., clears all data) + void reset() { + m_numIntegratedFrames = 0; + + m_hashParams.m_rigidTransform.setIdentity(); + m_hashParams.m_rigidTransformInverse.setIdentity(); + m_hashParams.m_numOccupiedBlocks = 0; + m_hashData.updateParams(m_hashParams); + resetCUDA(m_hashData, m_hashParams); + } + + + ftl::voxhash::HashData& getHashData() { + return m_hashData; + } + + const HashParams& getHashParams() const { + return m_hashParams; + } + + + //! debug only! + unsigned int getHeapFreeCount() { + unsigned int count; + cudaSafeCall(cudaMemcpy(&count, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); + return count+1; //there is one more free than the address suggests (0 would be also a valid address) + } + + //! debug only! 
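+	//! Downloads the hash table and heap to the CPU and checks their invariants: no duplicate pointers on the free heap, no allocated entry whose block is also on the free heap, and every SDF block accounted for as either free or in use; throws std::runtime_error on corruption.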
+ void debugHash() { + HashEntry* hashCPU = new HashEntry[m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets]; + unsigned int* heapCPU = new unsigned int[m_hashParams.m_numSDFBlocks]; + unsigned int heapCounterCPU; + + cudaSafeCall(cudaMemcpy(&heapCounterCPU, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); + heapCounterCPU++; //points to the first free entry: number of blocks is one more + + cudaSafeCall(cudaMemcpy(heapCPU, m_hashData.d_heap, sizeof(unsigned int)*m_hashParams.m_numSDFBlocks, cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(hashCPU, m_hashData.d_hash, sizeof(HashEntry)*m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets, cudaMemcpyDeviceToHost)); + + //Check for duplicates + class myint3Voxel { + public: + myint3Voxel() {} + ~myint3Voxel() {} + bool operator<(const myint3Voxel& other) const { + if (x == other.x) { + if (y == other.y) { + return z < other.z; + } + return y < other.y; + } + return x < other.x; + } + + bool operator==(const myint3Voxel& other) const { + return x == other.x && y == other.y && z == other.z; + } + + int x,y,z, i; + int offset; + int ptr; + }; + + + std::unordered_set<unsigned int> pointersFreeHash; + std::vector<unsigned int> pointersFreeVec(m_hashParams.m_numSDFBlocks, 0); + for (unsigned int i = 0; i < heapCounterCPU; i++) { + pointersFreeHash.insert(heapCPU[i]); + pointersFreeVec[heapCPU[i]] = FREE_ENTRY; + } + if (pointersFreeHash.size() != heapCounterCPU) { + throw std::runtime_error("ERROR: duplicate free pointers in heap array"); + } + + + unsigned int numOccupied = 0; + unsigned int numMinusOne = 0; + unsigned int listOverallFound = 0; + + std::list<myint3Voxel> l; + //std::vector<myint3Voxel> v; + + for (unsigned int i = 0; i < m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets; i++) { + if (hashCPU[i].ptr == -1) { + numMinusOne++; + } + + if (hashCPU[i].ptr != -2) { + numOccupied++; // != FREE_ENTRY + myint3Voxel a; + a.x = hashCPU[i].pos.x; + a.y = hashCPU[i].pos.y; + a.z = hashCPU[i].pos.z; + l.push_back(a); + //v.push_back(a); + + unsigned int linearBlockSize = m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize; + if (pointersFreeHash.find(hashCPU[i].ptr / linearBlockSize) != pointersFreeHash.end()) { + throw std::runtime_error("ERROR: ptr is on free heap, but also marked as an allocated entry"); + } + pointersFreeVec[hashCPU[i].ptr / linearBlockSize] = LOCK_ENTRY; + } + } + + unsigned int numHeapFree = 0; + unsigned int numHeapOccupied = 0; + for (unsigned int i = 0; i < m_hashParams.m_numSDFBlocks; i++) { + if (pointersFreeVec[i] == FREE_ENTRY) numHeapFree++; + else if (pointersFreeVec[i] == LOCK_ENTRY) numHeapOccupied++; + else { + throw std::runtime_error("memory leak detected: neither free nor allocated"); + } + } + if (numHeapFree + numHeapOccupied == m_hashParams.m_numSDFBlocks) std::cout << "HEAP OK!" 
<< std::endl; + else throw std::runtime_error("HEAP CORRUPTED"); + + l.sort(); + size_t sizeBefore = l.size(); + l.unique(); + size_t sizeAfter = l.size(); + + + std::cout << "diff: " << sizeBefore - sizeAfter << std::endl; + std::cout << "minOne: " << numMinusOne << std::endl; + std::cout << "numOccupied: " << numOccupied << "\t numFree: " << getHeapFreeCount() << std::endl; + std::cout << "numOccupied + free: " << numOccupied + getHeapFreeCount() << std::endl; + std::cout << "numInFrustum: " << m_hashParams.m_numOccupiedBlocks << std::endl; + + SAFE_DELETE_ARRAY(heapCPU); + SAFE_DELETE_ARRAY(hashCPU); + + //getchar(); + } +private: + + void create(const HashParams& params) { + m_hashParams = params; + m_hashData.allocate(m_hashParams); + + reset(); + } + + void destroy() { + m_hashData.free(); + } + + void alloc(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask) { + //Start Timing + //if (GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); } + + // NOTE (nick): We might want this later... + if (true) { + //allocate until all blocks are allocated + unsigned int prevFree = getHeapFreeCount(); + while (1) { + resetHashBucketMutexCUDA(m_hashData, m_hashParams); + allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask); + + unsigned int currFree = getHeapFreeCount(); + + if (prevFree != currFree) { + prevFree = currFree; + } + else { + break; + } + } + } + else { + //this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame) + resetHashBucketMutexCUDA(m_hashData, m_hashParams); + allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask); + } + + + + + // Stop Timing + //if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeAlloc += m_timer.getElapsedTimeMS(); TimingLog::countTimeAlloc++; } + } + + + void compactifyHashEntries(const DepthCameraData& depthCameraData) { + //Start Timing + //if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); } + + //CUDATimer t; + + //t.startEvent("fillDecisionArray"); + //fillDecisionArrayCUDA(m_hashData, m_hashParams, depthCameraData); + //t.endEvent(); + + //t.startEvent("prefixSum"); + //m_hashParams.m_numOccupiedBlocks = + // m_cudaScan.prefixSum( + // m_hashParams.m_hashNumBuckets*m_hashParams.m_hashBucketSize, + // m_hashData.d_hashDecision, + // m_hashData.d_hashDecisionPrefix); + //t.endEvent(); + + //t.startEvent("compactifyHash"); + //m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU + //compactifyHashCUDA(m_hashData, m_hashParams); + //t.endEvent(); + + //t.startEvent("compactifyAllInOne"); + m_hashParams.m_numOccupiedBlocks = compactifyHashAllInOneCUDA(m_hashData, m_hashParams); //this version uses atomics over prefix sums, which has a much better performance + std::cout << "Occ blocks = " << m_hashParams.m_numOccupiedBlocks << std::endl; + m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU + //t.endEvent(); + //t.evaluate(); + + // Stop Timing + //if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeCompactifyHash += m_timer.getElapsedTimeMS(); TimingLog::countTimeCompactifyHash++; } + + //std::cout << "numOccupiedBlocks: " << 
m_hashParams.m_numOccupiedBlocks << std::endl; + } + + void integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) { + //Start Timing + //if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); } + + integrateDepthMapCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams); + + // Stop Timing + //if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeIntegrate += m_timer.getElapsedTimeMS(); TimingLog::countTimeIntegrate++; } + } + + void garbageCollect(const DepthCameraData& depthCameraData) { + //only perform if enabled by global app state + //if (GlobalAppState::get().s_garbageCollectionEnabled) { + + garbageCollectIdentifyCUDA(m_hashData, m_hashParams); + resetHashBucketMutexCUDA(m_hashData, m_hashParams); //needed if linked lists are enabled -> for memeory deletion + garbageCollectFreeCUDA(m_hashData, m_hashParams); + //} + } + + + + HashParams m_hashParams; + ftl::voxhash::HashData m_hashData; + + //CUDAScan m_cudaScan; + unsigned int m_numIntegratedFrames; //used for garbage collect + + // static Timer m_timer; +}; + +}; // namespace voxhash +}; // namespace ftl diff --git a/applications/reconstruct/include/ftl/sdf_util.hpp b/applications/reconstruct/include/ftl/sdf_util.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/applications/reconstruct/include/ftl/voxel_hash.hpp b/applications/reconstruct/include/ftl/voxel_hash.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5fa458bfc93eadd0dce6979c5a2e069129c04228 --- /dev/null +++ b/applications/reconstruct/include/ftl/voxel_hash.hpp @@ -0,0 +1,832 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/VoxelUtilHashSDF.h + +#pragma once + +#ifndef sint +typedef signed int sint; +#endif + +#ifndef uint +typedef unsigned int uint; +#endif + +#ifndef slong +typedef signed long slong; +#endif + +#ifndef ulong +typedef unsigned long ulong; +#endif + +#ifndef uchar +typedef unsigned char uchar; +#endif + +#ifndef schar +typedef signed char schar; +#endif + + + + +#include <ftl/cuda_util.hpp> + +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/voxel_hash_params.hpp> + +#include <ftl/depth_camera.hpp> + +#define HANDLE_COLLISIONS +#define SDF_BLOCK_SIZE 8 +#define HASH_BUCKET_SIZE 10 + +#ifndef MINF +#define MINF __int_as_float(0xff800000) +#endif + +#ifndef PINF +#define PINF __int_as_float(0x7f800000) +#endif + +extern __constant__ ftl::voxhash::HashParams c_hashParams; +extern "C" void updateConstantHashParams(const ftl::voxhash::HashParams& hashParams); + +namespace ftl { +namespace voxhash { + +//status flags for hash entries +static const int LOCK_ENTRY = -1; +static const int FREE_ENTRY = -2; +static const int NO_OFFSET = 0; + +struct __align__(16) HashEntry +{ + int3 pos; //hash position (lower left corner of SDFBlock)) + int ptr; //pointer into heap to SDFBlock + uint offset; //offset for collisions + uint flags; //for interframe checks (Nick) + + __device__ void operator=(const struct HashEntry& e) { + ((long long*)this)[0] = ((const long long*)&e)[0]; + ((long long*)this)[1] = ((const long long*)&e)[1]; + //((int*)this)[4] = ((const int*)&e)[4]; + ((long long*)this)[2] = ((const long long*)&e)[2]; + } +}; + +struct __align__(8) Voxel { + float sdf; //signed distance function + uchar3 color; //color + uchar weight; 
//accumulated sdf weight + + __device__ void operator=(const struct Voxel& v) { + ((long long*)this)[0] = ((const long long*)&v)[0]; + } + +}; + +/** + * Voxel Hash Table structure and operations. Works on both CPU and GPU with + * host <-> device transfer included. + */ +struct HashData { + + /////////////// + // Host part // + /////////////// + + __device__ __host__ + HashData() { + d_heap = NULL; + d_heapCounter = NULL; + d_hash = NULL; + d_hashDecision = NULL; + d_hashDecisionPrefix = NULL; + d_hashCompactified = NULL; + d_hashCompactifiedCounter = NULL; + d_SDFBlocks = NULL; + d_hashBucketMutex = NULL; + m_bIsOnGPU = false; + } + + __host__ + void allocate(const HashParams& params, bool dataOnGPU = true) { + m_bIsOnGPU = dataOnGPU; + if (m_bIsOnGPU) { + cudaSafeCall(cudaMalloc(&d_heap, sizeof(unsigned int) * params.m_numSDFBlocks)); + cudaSafeCall(cudaMalloc(&d_heapCounter, sizeof(unsigned int))); + cudaSafeCall(cudaMalloc(&d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize)); + cudaSafeCall(cudaMalloc(&d_hashDecision, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize)); + cudaSafeCall(cudaMalloc(&d_hashDecisionPrefix, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize)); + cudaSafeCall(cudaMalloc(&d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize)); + cudaSafeCall(cudaMalloc(&d_hashCompactifiedCounter, sizeof(int))); + cudaSafeCall(cudaMalloc(&d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize)); + cudaSafeCall(cudaMalloc(&d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets)); + } else { + d_heap = new unsigned int[params.m_numSDFBlocks]; + d_heapCounter = new unsigned int[1]; + d_hash = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize]; + d_hashDecision = new int[params.m_hashNumBuckets * params.m_hashBucketSize]; + d_hashDecisionPrefix = new int[params.m_hashNumBuckets * params.m_hashBucketSize]; + d_hashCompactified = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize]; + d_hashCompactifiedCounter = new int[1]; + d_SDFBlocks = new Voxel[params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize]; + d_hashBucketMutex = new int[params.m_hashNumBuckets]; + } + + updateParams(params); + } + + __host__ + void updateParams(const HashParams& params) { + if (m_bIsOnGPU) { + updateConstantHashParams(params); + } + } + + __host__ + void free() { + if (m_bIsOnGPU) { + cudaSafeCall(cudaFree(d_heap)); + cudaSafeCall(cudaFree(d_heapCounter)); + cudaSafeCall(cudaFree(d_hash)); + cudaSafeCall(cudaFree(d_hashDecision)); + cudaSafeCall(cudaFree(d_hashDecisionPrefix)); + cudaSafeCall(cudaFree(d_hashCompactified)); + cudaSafeCall(cudaFree(d_hashCompactifiedCounter)); + cudaSafeCall(cudaFree(d_SDFBlocks)); + cudaSafeCall(cudaFree(d_hashBucketMutex)); + } else { + if (d_heap) delete[] d_heap; + if (d_heapCounter) delete[] d_heapCounter; + if (d_hash) delete[] d_hash; + if (d_hashDecision) delete[] d_hashDecision; + if (d_hashDecisionPrefix) delete[] d_hashDecisionPrefix; + if (d_hashCompactified) delete[] d_hashCompactified; + if (d_hashCompactifiedCounter) delete[] d_hashCompactifiedCounter; + if (d_SDFBlocks) delete[] d_SDFBlocks; + if (d_hashBucketMutex) delete[] d_hashBucketMutex; + } + + d_hash = NULL; + d_heap = NULL; + d_heapCounter = NULL; + d_hashDecision = NULL; + d_hashDecisionPrefix = NULL; + d_hashCompactified = NULL; + d_hashCompactifiedCounter = NULL; + d_SDFBlocks = 
NULL;
+		d_hashBucketMutex = NULL;
+	}
+
+	__host__
+	HashData copyToCPU() const {
+		HashParams params;	//NOTE: default-constructed; the copy sizes below are only correct if this matches the params used for the GPU allocation
+
+		HashData hashData;
+		hashData.allocate(params, false);	//allocate the data on the CPU
+		cudaSafeCall(cudaMemcpy(hashData.d_heap, d_heap, sizeof(unsigned int) * params.m_numSDFBlocks, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_heapCounter, d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hash, d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hashDecision, d_hashDecision, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hashDecisionPrefix, d_hashDecisionPrefix, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hashCompactifiedCounter, d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_SDFBlocks, d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize, cudaMemcpyDeviceToHost));
+		cudaSafeCall(cudaMemcpy(hashData.d_hashBucketMutex, d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost));
+
+		return hashData;	//TODO MATTHIAS look at this (i.e., when does memory get destroyed; if it's in the destructor it would kill everything here)
+	}
+
+
+
+	/////////////////
+	// Device part //
+	/////////////////
+//#define __CUDACC__
+#ifdef __CUDACC__
+
+	__device__
+	const HashParams& params() const {
+		return c_hashParams;
+	}
+
+	//! see Teschner et al. (but with correct prime values)
+	__device__
+	uint computeHashPos(const int3& virtualVoxelPos) const {
+		const int p0 = 73856093;
+		const int p1 = 19349669;
+		const int p2 = 83492791;
+
+		int res = ((virtualVoxelPos.x * p0) ^ (virtualVoxelPos.y * p1) ^ (virtualVoxelPos.z * p2)) % c_hashParams.m_hashNumBuckets;	//NOTE: m_hashNumBuckets is unsigned, so the modulo is evaluated in unsigned arithmetic and res is never negative
+		if (res < 0) res += c_hashParams.m_hashNumBuckets;
+		return (uint)res;
+	}
+
+	//merges two voxels (v0 the currently stored voxel, v1 is the input voxel)
+	__device__
+	void combineVoxel(const Voxel &v0, const Voxel& v1, Voxel &out) const {
+
+		//v.color = (10*v0.weight * v0.color + v1.weight * v1.color)/(10*v0.weight + v1.weight);	//give the currently observed color more weight
+		//v.color = (v0.weight * v0.color + v1.weight * v1.color)/(v0.weight + v1.weight);
+		//out.color = 0.5f * (v0.color + v1.color);	//exponential running average
+
+
+		float3 c0 = make_float3(v0.color.x, v0.color.y, v0.color.z);
+		float3 c1 = make_float3(v1.color.x, v1.color.y, v1.color.z);
+
+		float3 res = (c0.x+c0.y+c0.z == 0) ? c1 : 0.5f*c0 + 0.5f*c1;
+		//float3 res = (c0+c1)/2;
+		//float3 res = (c0 * (float)v0.weight + c1 * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
+		//float3 res = c1;
+
+		out.color.x = (uchar)(res.x+0.5f);	out.color.y = (uchar)(res.y+0.5f);	out.color.z = (uchar)(res.z+0.5f);
+
+
+
+		out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
+		out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight);
+	}
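+
+	// Worked example of the fusion above: combining v0 = {sdf=-0.04f, weight=6}
+	// with an incoming v1 = {sdf=0.02f, weight=2} gives
+	//   out.sdf    = (-0.04*6 + 0.02*2) / (6 + 2) = -0.025
+	//   out.weight = min(c_hashParams.m_integrationWeightMax, 6 + 2) = 8
+	// i.e. the SDF follows a confidence-weighted running average while the
+	// weight saturates at m_integrationWeightMax.
+
+	//! 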
returns the truncation of the SDF for a given distance value
+	__device__
+	float getTruncation(float z) const {
+		return c_hashParams.m_truncation + c_hashParams.m_truncScale * z;
+	}
+
+
+	__device__
+	float3 worldToVirtualVoxelPosFloat(const float3& pos) const {
+		return pos / c_hashParams.m_virtualVoxelSize;
+	}
+
+	__device__
+	int3 worldToVirtualVoxelPos(const float3& pos) const {
+		//const float3 p = pos*g_VirtualVoxelResolutionScalar;
+		const float3 p = pos / c_hashParams.m_virtualVoxelSize;
+		return make_int3(p+make_float3(sign(p))*0.5f);
+	}
+
+	__device__
+	int3 virtualVoxelPosToSDFBlock(int3 virtualVoxelPos) const {
+		if (virtualVoxelPos.x < 0) virtualVoxelPos.x -= SDF_BLOCK_SIZE-1;
+		if (virtualVoxelPos.y < 0) virtualVoxelPos.y -= SDF_BLOCK_SIZE-1;
+		if (virtualVoxelPos.z < 0) virtualVoxelPos.z -= SDF_BLOCK_SIZE-1;
+
+		return make_int3(
+			virtualVoxelPos.x/SDF_BLOCK_SIZE,
+			virtualVoxelPos.y/SDF_BLOCK_SIZE,
+			virtualVoxelPos.z/SDF_BLOCK_SIZE);
+	}
+
+	// Computes virtual voxel position of corner sample position
+	__device__
+	int3 SDFBlockToVirtualVoxelPos(const int3& sdfBlock) const {
+		return sdfBlock*SDF_BLOCK_SIZE;
+	}
+
+	__device__
+	float3 virtualVoxelPosToWorld(const int3& pos) const {
+		return make_float3(pos)*c_hashParams.m_virtualVoxelSize;
+	}
+
+	__device__
+	float3 SDFBlockToWorld(const int3& sdfBlock) const {
+		return virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock));
+	}
+
+	__device__
+	int3 worldToSDFBlock(const float3& worldPos) const {
+		return virtualVoxelPosToSDFBlock(worldToVirtualVoxelPos(worldPos));
+	}
+
+	__device__
+	bool isSDFBlockInCameraFrustumApprox(const int3& sdfBlock) {
+		// NOTE (Nick): Changed, just assume all voxels are potentially in frustum
+		//float3 posWorld = virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock)) + c_hashParams.m_virtualVoxelSize * 0.5f * (SDF_BLOCK_SIZE - 1.0f);
+		//return DepthCameraData::isInCameraFrustumApprox(c_hashParams.m_rigidTransformInverse, posWorld);
+		return true;
+	}
+
+	//! computes the (local) virtual voxel pos of an index; idx in [0;511]
+	__device__
+	uint3 delinearizeVoxelIndex(uint idx) const {
+		uint x = idx % SDF_BLOCK_SIZE;
+		uint y = (idx % (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE)) / SDF_BLOCK_SIZE;
+		uint z = idx / (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE);
+		return make_uint3(x,y,z);
+	}
+
+	//! computes the linearized index of a local virtual voxel pos; pos in [0;7]^3
+	__device__
+	uint linearizeVoxelPos(const int3& virtualVoxelPos) const {
+		return
+			virtualVoxelPos.z * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE +
+			virtualVoxelPos.y * SDF_BLOCK_SIZE +
+			virtualVoxelPos.x;
+	}
+
+	__device__
+	int virtualVoxelPosToLocalSDFBlockIndex(const int3& virtualVoxelPos) const {
+		int3 localVoxelPos = make_int3(
+			virtualVoxelPos.x % SDF_BLOCK_SIZE,
+			virtualVoxelPos.y % SDF_BLOCK_SIZE,
+			virtualVoxelPos.z % SDF_BLOCK_SIZE);
+
+		if (localVoxelPos.x < 0) localVoxelPos.x += SDF_BLOCK_SIZE;
+		if (localVoxelPos.y < 0) localVoxelPos.y += SDF_BLOCK_SIZE;
+		if (localVoxelPos.z < 0) localVoxelPos.z += SDF_BLOCK_SIZE;
+
+		return linearizeVoxelPos(localVoxelPos);
+	}
+
+	__device__
+	int worldToLocalSDFBlockIndex(const float3& world) const {
+		int3 virtualVoxelPos = worldToVirtualVoxelPos(world);
+		return virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos);
+	}
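+
+	// Round-trip example for the index helpers above (SDF_BLOCK_SIZE == 8):
+	// delinearizeVoxelIndex(83) gives x = 83 % 8 = 3, y = (83 % 64) / 8 = 2,
+	// z = 83 / 64 = 1, and linearizeVoxelPos(make_int3(3,2,1)) = 1*64 + 2*8 + 3 = 83.
+	// virtualVoxelPosToLocalSDFBlockIndex additionally wraps negative coordinates
+	// back into [0;7] before linearizing, so positions on either side of the
+	// origin map to a valid local index.
+
+	//! 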
returns the hash entry for a given worldPos; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set + __device__ + HashEntry getHashEntry(const float3& worldPos) const { + //int3 blockID = worldToSDFVirtualVoxelPos(worldPos)/SDF_BLOCK_SIZE; //position of sdf block + int3 blockID = worldToSDFBlock(worldPos); + return getHashEntryForSDFBlockPos(blockID); + } + + + __device__ + void deleteHashEntry(uint id) { + deleteHashEntry(d_hash[id]); + } + + __device__ + void deleteHashEntry(HashEntry& hashEntry) { + hashEntry.pos = make_int3(0); + hashEntry.offset = 0; + hashEntry.ptr = FREE_ENTRY; + } + + __device__ + bool voxelExists(const float3& worldPos) const { + HashEntry hashEntry = getHashEntry(worldPos); + return (hashEntry.ptr != FREE_ENTRY); + } + + __device__ + void deleteVoxel(Voxel& v) const { + v.color = make_uchar3(0,0,0); + v.weight = 0; + v.sdf = 0.0f; + } + __device__ + void deleteVoxel(uint id) { + deleteVoxel(d_SDFBlocks[id]); + } + + + __device__ + Voxel getVoxel(const float3& worldPos) const { + HashEntry hashEntry = getHashEntry(worldPos); + Voxel v; + if (hashEntry.ptr == FREE_ENTRY) { + deleteVoxel(v); + } else { + int3 virtualVoxelPos = worldToVirtualVoxelPos(worldPos); + v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)]; + } + return v; + } + + __device__ + Voxel getVoxel(const int3& virtualVoxelPos) const { + HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); + Voxel v; + if (hashEntry.ptr == FREE_ENTRY) { + deleteVoxel(v); + } else { + v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)]; + } + return v; + } + + __device__ + void setVoxel(const int3& virtualVoxelPos, Voxel& voxelInput) const { + HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); + if (hashEntry.ptr != FREE_ENTRY) { + d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)] = voxelInput; + } + } + + //! 
returns the hash entry for a given sdf block id; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set + __device__ + HashEntry getHashEntryForSDFBlockPos(const int3& sdfBlock) const + { + uint h = computeHashPos(sdfBlock); //hash bucket + uint hp = h * HASH_BUCKET_SIZE; //hash position + + HashEntry entry; + entry.pos = sdfBlock; + entry.offset = 0; + entry.ptr = FREE_ENTRY; + + for (uint j = 0; j < HASH_BUCKET_SIZE; j++) { + uint i = j + hp; + HashEntry curr = d_hash[i]; + if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) { + return curr; + } + } + +#ifdef HANDLE_COLLISIONS + const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; + int i = idxLastEntryInBucket; //start with the last entry of the current bucket + HashEntry curr; + //traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end + + unsigned int maxIter = 0; + uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize; + #pragma unroll 1 + while (maxIter < g_MaxLoopIterCount) { + curr = d_hash[i]; + + if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) { + return curr; + } + + if (curr.offset == 0) { //we have found the end of the list + break; + } + i = idxLastEntryInBucket + curr.offset; //go to next element in the list + i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow + + maxIter++; + } +#endif + return entry; + } + + //for histogram (no collision traversal) + __device__ + unsigned int getNumHashEntriesPerBucket(unsigned int bucketID) { + unsigned int h = 0; + for (uint i = 0; i < HASH_BUCKET_SIZE; i++) { + if (d_hash[bucketID*HASH_BUCKET_SIZE+i].ptr != FREE_ENTRY) { + h++; + } + } + return h; + } + + //for histogram (collisions traversal only) + __device__ + unsigned int getNumHashLinkedList(unsigned int bucketID) { + unsigned int listLen = 0; + +#ifdef HANDLE_COLLISIONS + const uint idxLastEntryInBucket = (bucketID+1)*HASH_BUCKET_SIZE - 1; + unsigned int i = idxLastEntryInBucket; //start with the last entry of the current bucket + //int offset = 0; + HashEntry curr; curr.offset = 0; + //traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end + + unsigned int maxIter = 0; + uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize; + #pragma unroll 1 + while (maxIter < g_MaxLoopIterCount) { + //offset = curr.offset; + //curr = getHashEntry(g_Hash, i); + curr = d_hash[i]; + + if (curr.offset == 0) { //we have found the end of the list + break; + } + i = idxLastEntryInBucket + curr.offset; //go to next element in the list + i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow + listLen++; + + maxIter++; + } +#endif + + return listLen; + } + + + + __device__ + uint consumeHeap() { + uint addr = atomicSub(&d_heapCounter[0], 1); + //TODO MATTHIAS check some error handling? + return d_heap[addr]; + } + __device__ + void appendHeap(uint ptr) { + uint addr = atomicAdd(&d_heapCounter[0], 1); + //TODO MATTHIAS check some error handling? 
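+		// The heap is a stack of free SDF block indices: d_heapCounter[0] holds
+		// the index of the current top element. consumeHeap() above pops by
+		// reading d_heap[counter] and decrementing, so the matching push below
+		// writes one slot above the old top: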
+ d_heap[addr+1] = ptr; + } + + //pos in SDF block coordinates + __device__ + void allocBlock(const int3& pos, const uchar frame) { + + + uint h = computeHashPos(pos); //hash bucket + uint hp = h * HASH_BUCKET_SIZE; //hash position + + int firstEmpty = -1; + for (uint j = 0; j < HASH_BUCKET_SIZE; j++) { + uint i = j + hp; + HashEntry& curr = d_hash[i]; + + //in that case the SDF-block is already allocated and corresponds to the current position -> exit thread + if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) { + //curr.flags = frame; // Flag block as valid in this frame (Nick) + return; + } + + //store the first FREE_ENTRY hash entry + if (firstEmpty == -1 && curr.ptr == FREE_ENTRY) { + firstEmpty = i; + } + } + + +#ifdef HANDLE_COLLISIONS + //updated variables as after the loop + const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; //get last index of bucket + uint i = idxLastEntryInBucket; //start with the last entry of the current bucket + //int offset = 0; + HashEntry curr; curr.offset = 0; + //traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end + //int k = 0; + + unsigned int maxIter = 0; + uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize; + #pragma unroll 1 + while (maxIter < g_MaxLoopIterCount) { + //offset = curr.offset; + curr = d_hash[i]; //TODO MATTHIAS do by reference + if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) { + //curr.flags = frame; // Flag block as valid in this frame (Nick) + return; + } + if (curr.offset == 0) { //we have found the end of the list + break; + } + i = idxLastEntryInBucket + curr.offset; //go to next element in the list + i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow + + maxIter++; + } +#endif + + if (firstEmpty != -1) { //if there is an empty entry and we haven't allocated the current entry before + //int prevValue = 0; + //InterlockedExchange(d_hashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket + int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY); + if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked + HashEntry& entry = d_hash[firstEmpty]; + entry.pos = pos; + entry.offset = NO_OFFSET; + entry.flags = 0; // Flag block as valid in this frame (Nick) + entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc + } + return; + } + +#ifdef HANDLE_COLLISIONS + //if (i != idxLastEntryInBucket) return; + int offset = 0; + //linear search for free entry + + maxIter = 0; + #pragma unroll 1 + while (maxIter < g_MaxLoopIterCount) { + offset++; + i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //go to next hash element + if ((offset % HASH_BUCKET_SIZE) == 0) continue; //cannot insert into a last bucket element (would conflict with other linked lists) + curr = d_hash[i]; + //if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) { + // return; + //} + if (curr.ptr == FREE_ENTRY) { //this is the first free entry + //int prevValue = 0; + //InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the original hash bucket + int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY); + if (prevValue != LOCK_ENTRY) { + HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket]; + h = i / HASH_BUCKET_SIZE; + //InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, 
prevValue);	//lock the hash bucket where we have found a free entry
+				prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
+				if (prevValue != LOCK_ENTRY) {	//only proceed if the bucket has been locked
+					HashEntry& entry = d_hash[i];
+					entry.pos = pos;
+					entry.offset = lastEntryInBucket.offset;
+					entry.flags = 0;	// Flag block as valid in this frame (Nick)
+					entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;	//memory alloc
+
+					lastEntryInBucket.offset = offset;
+					d_hash[idxLastEntryInBucket] = lastEntryInBucket;
+					//setHashEntry(g_Hash, idxLastEntryInBucket, lastEntryInBucket);
+				}
+			}
+			return;	//exit whether or not the bucket lock was acquired
+		}
+
+		maxIter++;
+	}
+#endif
+	}
+
+
+	//! inserts a hash entry without allocating any memory: used by streaming: TODO MATTHIAS check the atomics in this function
+	__device__
+	bool insertHashEntry(HashEntry entry)
+	{
+		uint h = computeHashPos(entry.pos);
+		uint hp = h * HASH_BUCKET_SIZE;
+
+		for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
+			uint i = j + hp;
+			//const HashEntry& curr = d_hash[i];
+			int prevWeight = 0;
+			//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight);
+			prevWeight = atomicCAS(&d_hash[i].ptr, FREE_ENTRY, LOCK_ENTRY);
+			if (prevWeight == FREE_ENTRY) {
+				d_hash[i] = entry;
+				//setHashEntry(hash, i, entry);
+				return true;
+			}
+		}
+
+#ifdef HANDLE_COLLISIONS
+		//updated variables as after the loop
+		const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;	//get last index of bucket
+
+		uint i = idxLastEntryInBucket;	//start with the last entry of the current bucket
+		HashEntry curr;
+
+		unsigned int maxIter = 0;
+		//[allow_uav_condition]
+		uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
+		#pragma unroll 1
+		while (maxIter < g_MaxLoopIterCount) {	//traverse list until end // why find the end? you are inserting at the start!
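+			// Follow the bucket's collision chain: offsets are relative to the
+			// last entry of the home bucket, and the walk is capped at
+			// m_hashMaxCollisionLinkedListSize steps so a corrupted chain cannot
+			// hang the kernel.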
+			//curr = getHashEntry(hash, i);
+			curr = d_hash[i];	//TODO MATTHIAS do by reference
+			if (curr.offset == 0) break;	//we have found the end of the list
+			i = idxLastEntryInBucket + curr.offset;	//go to next element in the list
+			i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets);	//check for overflow
+
+			maxIter++;
+		}
+
+		maxIter = 0;
+		int offset = 0;
+		#pragma unroll 1
+		while (maxIter < g_MaxLoopIterCount) {	//linear search for free entry
+			offset++;
+			uint i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets);	//go to next hash element
+			if ((offset % HASH_BUCKET_SIZE) == 0) continue;	//cannot insert into a last bucket element (would conflict with other linked lists)
+
+			int prevWeight = 0;
+			//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight);	//check for a free entry
+			//NOTE: the 3-uint-per-entry stride below is carried over from the original packed entry layout; it does not match the 32-byte aligned HashEntry struct above
+			uint* d_hashUI = (uint*)d_hash;
+			prevWeight = atomicCAS(&d_hashUI[3*idxLastEntryInBucket+1], (uint)FREE_ENTRY, (uint)LOCK_ENTRY);
+			if (prevWeight == FREE_ENTRY) {	//if free entry found set prev->next = curr & curr->next = prev->next
+				//[allow_uav_condition]
+				//while(hash[3*idxLastEntryInBucket+2] == LOCK_ENTRY);	// expects setHashEntry to set the ptr last, required because pos.z is packed into the same value -> prev->next = curr -> might corrupt pos.z
+
+				HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket];	//get prev (= lastEntry in Bucket)
+
+				int newOffsetPrev = (offset << 16) | (lastEntryInBucket.pos.z & 0x0000ffff);	//prev->next = curr (maintain old z-pos)
+				int oldOffsetPrev = 0;
+				//InterlockedExchange(hash[3*idxLastEntryInBucket+1], newOffsetPrev, oldOffsetPrev);	//set prev offset atomically
+				oldOffsetPrev = atomicExch(&d_hashUI[3*idxLastEntryInBucket+1], newOffsetPrev);
+				entry.offset = oldOffsetPrev >> 16;	//remove prev z-pos from old offset
+
+				//setHashEntry(hash, i, entry);	//sets the current hashEntry with: curr->next = prev->next
+				d_hash[i] = entry;
+				return true;
+			}
+
+			maxIter++;
+		}
+#endif
+
+		return false;
+	}
+
+
+
+	//! 
deletes a hash entry position for a given sdfBlock index (returns true upon successful deletion; otherwise returns false)
+	__device__
+	bool deleteHashEntryElement(const int3& sdfBlock) {
+		uint h = computeHashPos(sdfBlock);	//hash bucket
+		uint hp = h * HASH_BUCKET_SIZE;		//hash position
+
+		for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
+			uint i = j + hp;
+			const HashEntry& curr = d_hash[i];
+			if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
+#ifndef HANDLE_COLLISIONS
+				const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
+				appendHeap(curr.ptr / linBlockSize);
+				//heapAppend.Append(curr.ptr / linBlockSize);
+				deleteHashEntry(i);
+				return true;
+#endif
+#ifdef HANDLE_COLLISIONS
+				if (curr.offset != 0) {	//if there was a pointer set it to the next list element
+					//int prevValue = 0;
+					//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue);	//lock the hash bucket
+					int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
+					if (prevValue == LOCK_ENTRY) return false;
+					if (prevValue != LOCK_ENTRY) {
+						const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
+						appendHeap(curr.ptr / linBlockSize);
+						//heapAppend.Append(curr.ptr / linBlockSize);
+						int nextIdx = (i + curr.offset) % (HASH_BUCKET_SIZE*c_hashParams.m_hashNumBuckets);
+						//setHashEntry(hash, i, getHashEntry(hash, nextIdx));
+						d_hash[i] = d_hash[nextIdx];
+						deleteHashEntry(nextIdx);
+						return true;
+					}
+				} else {
+					const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
+					appendHeap(curr.ptr / linBlockSize);
+					//heapAppend.Append(curr.ptr / linBlockSize);
+					deleteHashEntry(i);
+					return true;
+				}
+#endif	//HANDLE_COLLISIONS
+			}
+		}
+#ifdef HANDLE_COLLISIONS
+		const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;
+		int i = idxLastEntryInBucket;
+		HashEntry curr;
+		curr = d_hash[i];
+		int prevIdx = i;
+		i = idxLastEntryInBucket + curr.offset;	//go to next element in the list
+		i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets);	//check for overflow
+
+		unsigned int maxIter = 0;
+		uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
+
+		#pragma unroll 1
+		while (maxIter < g_MaxLoopIterCount) {
+			curr = d_hash[i];
+			//found the entry we want to delete
+			if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
+				//int prevValue = 0;
+				//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue);	//lock the hash bucket
+				int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
+				if (prevValue == LOCK_ENTRY) return false;
+				if (prevValue != LOCK_ENTRY) {
+					const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
+					appendHeap(curr.ptr / linBlockSize);
+					//heapAppend.Append(curr.ptr / linBlockSize);
+					deleteHashEntry(i);
+					HashEntry prev = d_hash[prevIdx];
+					prev.offset = curr.offset;
+					//setHashEntry(hash, prevIdx, prev);
+					d_hash[prevIdx] = prev;
+					return true;
+				}
+			}
+
+			if (curr.offset == 0) {	//we have found the end of the list
+				return false;	//should never happen, because the entry should have been found above
+			}
+			prevIdx = i;
+			i = idxLastEntryInBucket + curr.offset;	//go to next element in the list
+			i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets);	//check for overflow
+
+			maxIter++;
+		}
+#endif	// HANDLE_COLLISIONS
+		return false;
+	}
+
+#endif	// __CUDACC__
+
+	uint*		d_heap;			//heap that manages free memory
+	uint*		d_heapCounter;	//single element; used as an atomic 
counter (points to the next free block)
+	int*		d_hashDecision;			//
+	int*		d_hashDecisionPrefix;	//
+	HashEntry*	d_hash;					//hash that stores pointers to sdf blocks
+	HashEntry*	d_hashCompactified;		//same as before except that only valid pointers are there
+	int*		d_hashCompactifiedCounter;	//atomic counter to add compactified entries atomically
+	Voxel*		d_SDFBlocks;			//sub-blocks that contain 8x8x8 voxels (linearized); are allocated by heap
+	int*		d_hashBucketMutex;		//binary flag per hash bucket; used for allocation to atomically lock a bucket
+
+	bool		m_bIsOnGPU;				//the class can be used on both CPU and GPU
+};
+
+}  // namespace voxhash
+}  // namespace ftl
diff --git a/applications/reconstruct/include/ftl/voxel_hash_params.hpp b/applications/reconstruct/include/ftl/voxel_hash_params.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8216310174c7ae554557f403bfea1bdb6f88ccd7
--- /dev/null
+++ b/applications/reconstruct/include/ftl/voxel_hash_params.hpp
@@ -0,0 +1,47 @@
+// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDAHashParams.h
+
+#pragma once
+
+//#include <cutil_inline.h>
+//#include <cutil_math.h>
+#include <vector_types.h>
+#include <cuda_runtime.h>
+
+#include <ftl/cuda_matrix_util.hpp>
+
+namespace ftl {
+namespace voxhash {
+
+//TODO might have to be split into static and dynamic parts
+struct __align__(16) HashParams {
+	HashParams() {
+	}
+
+	float4x4		m_rigidTransform;
+	float4x4		m_rigidTransformInverse;
+
+	unsigned int	m_hashNumBuckets;
+	unsigned int	m_hashBucketSize;
+	unsigned int	m_hashMaxCollisionLinkedListSize;
+	unsigned int	m_numSDFBlocks;
+
+	int				m_SDFBlockSize;
+	float			m_virtualVoxelSize;
+	unsigned int	m_numOccupiedBlocks;	//occupied blocks in the viewing frustum
+
+	float			m_maxIntegrationDistance;
+	float			m_truncScale;
+	float			m_truncation;
+	unsigned int	m_integrationWeightSample;
+	unsigned int	m_integrationWeightMax;
+
+	float3			m_streamingVoxelExtents;
+	int3			m_streamingGridDimensions;
+	int3			m_streamingMinGridPos;
+	unsigned int	m_streamingInitialChunkListSize;
+	uint2			m_dummy;
+
+};
+
+}  // namespace voxhash
+}  // namespace ftl
diff --git a/applications/reconstruct/src/camera_util.cu b/applications/reconstruct/src/camera_util.cu
new file mode 100644
index 0000000000000000000000000000000000000000..55d3e00c578ffbfa2233283191f426432c1ab751
--- /dev/null
+++ b/applications/reconstruct/src/camera_util.cu
@@ -0,0 +1,1769 @@
+#ifndef _CAMERA_UTIL_
+#define _CAMERA_UTIL_
+
+#include <ftl/cuda_util.hpp>
+
+#include <ftl/cuda_matrix_util.hpp>
+#include <ftl/depth_camera.hpp>
+
+#ifndef BYTE
+#define BYTE unsigned char
+#endif
+
+#define T_PER_BLOCK 16
+#define MINF __int_as_float(0xff800000)
+
+#ifndef CUDART_PI_F
+#define CUDART_PI_F 3.141592654f
+#endif
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Compute Copy Float Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void copyFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	d_output[y*width+x] = d_input[y*width+x];
+}
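+
+// The host wrappers in this file all launch a 2D grid of T_PER_BLOCK x T_PER_BLOCK
+// (16x16 = 256 threads) blocks and round the grid size up with the usual
+// ceil-division idiom; e.g. for a 640x480 map (sizes here are only illustrative):
+//   gridSize.x = (640 + 16 - 1)/16 = 40,  gridSize.y = (480 + 16 - 1)/16 = 30,
+// i.e. 1200 blocks, and the in-kernel bounds check discards the threads that
+// fall outside the image.
+
+extern "C" void copyFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);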
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	copyFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+__global__ void copyDepthFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const float depth = d_input[y*width+x];
+	if (depth >= minDepth && depth <= maxDepth)
+		d_output[y*width+x] = depth;
+	else
+		d_output[y*width+x] = MINF;
+}
+
+extern "C" void copyDepthFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	copyDepthFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height, minDepth, maxDepth);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Initialize Optimizer Maps
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void initializeOptimizerMapsDevice(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const float depth = d_input[y*width+x];
+	if(d_mask[y*width+x] != MINF) { d_output[y*width+x] = depth; }
+	else { d_output[y*width+x] = MINF; d_input[y*width+x] = MINF; d_input2[y*width+x] = MINF; }
+}
+
+extern "C" void initializeOptimizerMaps(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	initializeOptimizerMapsDevice<<<gridSize, blockSize>>>(d_output, d_input, d_input2, d_mask, width, height);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Copy Float4 Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void copyFloat4MapDevice(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	d_output[y*width+x] = d_input[y*width+x];
+}
+
+extern "C" void copyFloat4Map(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	copyFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Convert Raw Color to float
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void convertColorRawToFloatDevice(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	//uchar4 c = make_uchar4(d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+3]);	//note the flip from BGRW to RGBW
+	uchar4 c = make_uchar4(d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+3]);
+	if (c.x == 0 && c.y == 0 && c.z == 0) {
+		d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
+	} else {
+		d_output[y*width+x] = make_float4(c.x/255.0f, c.y/255.0f, c.z/255.0f, c.w/255.0f);
+	}
+}
+
+extern "C" void convertColorRawToFloat4(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	convertColorRawToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Convert Float4 Color to UCHAR4
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void convertColorFloat4ToUCHAR4Device(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	float4 color = d_input[y*width+x];
+	d_output[y*width+x] = make_uchar4(color.x*255.0f, color.y*255.0f, color.z*255.0f, color.w*255.0f);
+}
+
+extern "C" void convertColorFloat4ToUCHAR4(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	convertColorFloat4ToUCHAR4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
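+
+// Note on the conversion above: the float-to-uchar cast truncates rather than
+// rounds (e.g. 0.999f*255.0f = 254.745f stores 254); inputs are assumed to be
+// normalized to [0,1], as produced by convertColorRawToFloat4.
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Mask Color Map using Depth
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void maskColorMapFloat4MapDevice(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;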
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	float4 color = d_inputColor[y*width+x];
+
+	if(d_inputDepth[y*width+x].x != MINF) d_inputColor[y*width+x] = color;
+	else d_inputColor[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
+}
+
+extern "C" void maskColorMapFloat4Map(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	maskColorMapFloat4MapDevice<<<gridSize, blockSize>>>(d_inputColor, d_inputDepth, width, height);
+	#ifdef _DEBUG
+		cutilSafeCall(cudaDeviceSynchronize());
+		cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Convert Color to Intensity Float4
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void convertColorToIntensityFloat4Device(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const float4 color = d_input[y*width+x];
+	const float I = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
+
+	d_output[y*width+x] = make_float4(I, I, I, 1.0f);
+}
+
+extern "C" void convertColorToIntensityFloat4(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	convertColorToIntensityFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+	#ifdef _DEBUG
+		cutilSafeCall(cudaDeviceSynchronize());
+		cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Convert Color to Intensity
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void convertColorToIntensityFloatDevice(float* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const float4 color = d_input[y*width+x];
+	d_output[y*width+x] = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
+}
+
+extern "C" void convertColorToIntensityFloat(float* d_output, float4* d_input, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	convertColorToIntensityFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
+
+	#ifdef _DEBUG
+		cutilSafeCall(cudaDeviceSynchronize());
+		cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Convert depth map to color map view
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convertDepthToColorSpaceDevice(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x < depthWidth && y < depthHeight) + { + const float depth = d_input[y*depthWidth+x]; + + if(depth != MINF && depth < 1.0f) + { + // Cam space depth + float4 depthCamSpace = depthIntrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth); + depthCamSpace = make_float4(depthCamSpace.x, depthCamSpace.y, depthCamSpace.w, 1.0f); + + // World Space + const float4 worldSpace = depthExtrinsicsInv*depthCamSpace; + + // Cam space color + float4 colorCamSpace = colorExtrinsics*worldSpace; + //colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, 0.0f, colorCamSpace.z); + colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, colorCamSpace.z, 1.0f); + + // Get coordinates in color image and set pixel to new depth + const float4 screenSpaceColor = colorIntrinsics*colorCamSpace; + //const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.w + 0.5f); + //const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.w + 0.5f); + const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.z + 0.5f); + const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.z + 0.5f); + + //if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.w; // Check for minimum !!! + if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.z; // Check for minimum !!! 
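+
+			// Several depth pixels can land on the same color pixel here, so this
+			// unsynchronized store is a race that keeps an arbitrary depth rather
+			// than the minimum; resolving the "Check for minimum" note above would
+			// need an atomicMin on the depth (e.g. via an int bit-cast, which
+			// preserves ordering for positive floats).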
+ } + } +} + +extern "C" void convertDepthToColorSpace(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight) +{ + const dim3 gridSize((depthWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + convertDepthToColorSpaceDevice<<<gridSize, blockSize>>>(d_output, d_input, depthIntrinsicsInv, colorIntrinsics, depthExtrinsicsInv, colorExtrinsics, depthWidth, depthHeight, colorWidth, colorHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Set invalid float map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void setInvalidFloatMapDevice(float* d_output, unsigned int width, unsigned int height) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = MINF; +} + +extern "C" void setInvalidFloatMap(float* d_output, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + setInvalidFloatMapDevice<<<gridSize, blockSize>>>(d_output, width, height); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Set invalid float4 map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void setInvalidFloat4MapDevice(float4* d_output, unsigned int width, unsigned int height) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = make_float4(MINF, MINF, MINF ,MINF); +} + +extern "C" void setInvalidFloat4Map(float4* d_output, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + setInvalidFloat4MapDevice<<<gridSize, blockSize>>>(d_output, width, height); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Convert Depth to Camera Space Positions +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convertDepthFloatToCameraSpaceFloat3Device(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, DepthCameraData depthCameraData) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const 
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < width && y < height) {
+		d_output[y*width+x] = make_float3(MINF, MINF, MINF);
+
+		float depth = d_input[y*width+x];
+
+		if(depth != MINF)
+		{
+			//float4 cameraSpace(intrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth));
+			//d_output[y*width+x] = make_float4(cameraSpace.x, cameraSpace.y, cameraSpace.w, 1.0f);
+			d_output[y*width+x] = depthCameraData.kinectDepthToSkeleton(x, y, depth);
+		}
+	}
+}
+
+extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	convertDepthFloatToCameraSpaceFloat3Device<<<gridSize, blockSize>>>(d_output, d_input, intrinsicsInv, width, height, depthCameraData);
+
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Bilateral Filter Float Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ float gaussR(float sigma, float dist)
+{
+	return exp(-(dist*dist)/(2.0*sigma*sigma));
+}
+
+inline __device__ float linearR(float sigma, float dist)
+{
+	return min(1.0f, max(0.0f, 1.0f-(dist*dist)/(2.0f*sigma*sigma)));
+}
+
+inline __device__ float gaussD(float sigma, int x, int y)
+{
+	return exp(-((x*x+y*y)/(2.0f*sigma*sigma)));
+}
+
+inline __device__ float gaussD(float sigma, int x)
+{
+	return exp(-((x*x)/(2.0f*sigma*sigma)));
+}
+
+__global__ void bilateralFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const int kernelRadius = (int)ceil(2.0*sigmaD);
+
+	d_output[y*width+x] = MINF;
+
+	float sum = 0.0f;
+	float sumWeight = 0.0f;
+
+	const float depthCenter = d_input[y*width+x];
+	if(depthCenter != MINF)
+	{
+		for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
+		{
+			for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
+			{
+				if(m >= 0 && n >= 0 && m < width && n < height)
+				{
+					const float currentDepth = d_input[n*width+m];
+
+					if (currentDepth != MINF) {
+						const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, currentDepth-depthCenter);
+
+						sumWeight += weight;
+						sum += weight*currentDepth;
+					}
+				}
+			}
+		}
+
+		if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
+	}
+}
+
+extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	bilateralFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Bilateral 
Filter Float4 Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void bilateralFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const int kernelRadius = (int)ceil(2.0*sigmaD);
+
+	//d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
+
+	float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	float sumWeight = 0.0f;
+
+	const float4 depthCenter = d_input[y*width+x];
+	if (depthCenter.x != MINF) {
+		for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
+		{
+			for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
+			{
+				if(m >= 0 && n >= 0 && m < width && n < height)
+				{
+					const float4 currentDepth = d_input[n*width+m];
+
+					if (currentDepth.x != MINF) {
+						const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, length(currentDepth-depthCenter));
+
+						sum += weight*currentDepth;
+						sumWeight += weight;
+					}
+				}
+			}
+		}
+	}
+	if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
+}
+
+extern "C" void bilateralFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	bilateralFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gauss Filter Float Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void gaussFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= width || y >= height) return;
+
+	const int kernelRadius = (int)ceil(2.0*sigmaD);
+
+	d_output[y*width+x] = MINF;
+
+	float sum = 0.0f;
+	float sumWeight = 0.0f;
+
+	const float depthCenter = d_input[y*width+x];
+	if(depthCenter != MINF)
+	{
+		for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
+		{
+			for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
+			{
+				if(m >= 0 && n >= 0 && m < width && n < height)
+				{
+					const float currentDepth = d_input[n*width+m];
+
+					if(currentDepth != MINF && fabs(depthCenter-currentDepth) < sigmaR)
+					{
+						const float weight = gaussD(sigmaD, m-x, n-y);
+
+						sumWeight += weight;
+						sum += weight*currentDepth;
+					}
+				}
+			}
+		}
+	}
+
+	if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
+}
+
+extern "C" void gaussFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	gaussFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, 
width, height); + #ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Gauss Filter Float4 Map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void gaussFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + const int kernelRadius = (int)ceil(2.0*sigmaD); + + //d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF); + + float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + float sumWeight = 0.0f; + + const float4 depthCenter = d_input[y*width+x]; + if (depthCenter.x != MINF) { + for(int m = x-kernelRadius; m <= x+kernelRadius; m++) + { + for(int n = y-kernelRadius; n <= y+kernelRadius; n++) + { + if(m >= 0 && n >= 0 && m < width && n < height) + { + const float4 currentDepth = d_input[n*width+m]; + + if (currentDepth.x != MINF) { + if(length(depthCenter-currentDepth) < sigmaR) + { + const float weight = gaussD(sigmaD, m-x, n-y); + + sumWeight += weight; + sum += weight*currentDepth; + } + } + } + } + } + } + + if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight; +} + +extern "C" void gaussFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + gaussFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute Normal Map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void computeNormalsDevice(float4* d_output, float3* d_input, unsigned int width, unsigned int height) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF); + + if(x > 0 && x < width-1 && y > 0 && y < height-1) + { + const float3 CC = d_input[(y+0)*width+(x+0)]; + const float3 PC = d_input[(y+1)*width+(x+0)]; + const float3 CP = d_input[(y+0)*width+(x+1)]; + const float3 MC = d_input[(y-1)*width+(x+0)]; + const float3 CM = d_input[(y+0)*width+(x-1)]; + + if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF) + { + const float3 n = cross(PC-MC, CP-CM); + const float l = length(n); + + if(l > 0.0f) + { + d_output[y*width+x] = make_float4(n/-l, 1.0f); + } + } + } +} + +extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const 
dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeNormalsDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height); + +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute Normal Map 2 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void computeNormalsDevice2(float4* d_output, float4* d_input, unsigned int width, unsigned int height) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF); + + if(x > 0 && x < width-1 && y > 0 && y < height-1) + { + const float4 CC = d_input[(y+0)*width+(x+0)]; + const float4 MC = d_input[(y-1)*width+(x+0)]; + const float4 CM = d_input[(y+0)*width+(x-1)]; + + if(CC.x != MINF && MC.x != MINF && CM.x != MINF) + { + const float3 n = cross(make_float3(MC)-make_float3(CC), make_float3(CM)-make_float3(CC)); + const float l = length(n); + + if(l > 0.0f) + { + d_output[y*width+x] = make_float4(n/-l, 1.0f); + } + } + } +} + +extern "C" void computeNormals2(float4* d_output, float4* d_input, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeNormalsDevice2<<<gridSize, blockSize>>>(d_output, d_input, width, height); + +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute Shading Value +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void evaluateLightingModelTerms(float* d_out, float4 n) +{ + d_out[0] = 1.0; + d_out[1] = n.y; + d_out[2] = n.z; + d_out[3] = n.x; + d_out[4] = n.x*n.y; + d_out[5] = n.y*n.z; + d_out[6] = 3*n.z*n.z - 1; + d_out[7] = n.z*n.x; + d_out[8] = n.x*n.x-n.y*n.y; +} + +inline __device__ float evaluateLightingModel(float* d_lit, float4 n) +{ + float tmp[9]; + evaluateLightingModelTerms(tmp, n); + + float sum = 0.0f; + for(unsigned int i = 0; i<9; i++) sum += tmp[i]*d_lit[i]; + + return sum; +} + +__global__ void computeShadingValueDevice(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 Intrinsic, float* d_litcoeff, unsigned int width, unsigned int height) +{ + const unsigned int posx = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int posy = blockIdx.y*blockDim.y + threadIdx.y; + + if(posx >= width || posy >= height) return; + + d_outShading[posy*width+posx] = 0; + + if(posx > 0 && posx < width-1 && posy > 0 && posy < height-1) + { + float4 n = d_normals[posy*width+posx]; + + if(n.x != MINF) + { + n.x = -n.x; // Change handedness + n.z = -n.z; + + float albedo = d_albedoEstimates[(unsigned int)(d_clusterIDs[posy*width+posx]+0.5f)]; + float shadingval = albedo*evaluateLightingModel(d_litcoeff, n); + + if(shadingval<0.0f) shadingval = 0.0f; + if(shadingval>1.0f) shadingval = 1.0f; + + 
d_outShading[posy*width+posx] = shadingval; + } + } +} + +extern "C" void computeShadingValue(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 &Intrinsic, float* d_lighting, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeShadingValueDevice<<<gridSize, blockSize>>>(d_outShading, d_indepth, d_normals, d_clusterIDs, d_albedoEstimates, Intrinsic, d_lighting, width, height); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Simple Segmentation +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void computeSimpleSegmentationDevice(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + const float inputDepth = d_input[y*width+x]; + if(inputDepth != MINF && inputDepth < depthThres) d_output[y*width+x] = inputDepth; + else d_output[y*width+x] = MINF; +} + +extern "C" void computeSimpleSegmentation(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeSimpleSegmentationDevice<<<gridSize, blockSize>>>(d_output, d_input, depthThres, width, height); + +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute Edge Mask +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void computeMaskEdgeMapFloat4Device(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = 1; + d_output[width*height+y*width+x] = 1; + + const float thre = threshold *threshold *3.0f; + if(x > 0 && y > 0 && x < width-1 && y < height-1) + { + if(d_indepth[y*width+x] == MINF) + { + d_output[y*width+x] = 0; + d_output[y*width+x-1] = 0; + d_output[width*height+y*width+x] = 0; + d_output[width*height+(y-1)*width+x] = 0; + + return; + } + + const float4& p0 = d_input[(y+0)*width+(x+0)]; + const float4& p1 = d_input[(y+0)*width+(x+1)]; + const float4& p2 = d_input[(y+1)*width+(x+0)]; + + float dU = sqrt(((p1.x-p0.x)*(p1.x-p0.x) + (p1.y-p0.y) * (p1.y-p0.y) + (p1.z-p0.z)*(p1.z-p0.z))/3.0f); + float dV = sqrt(((p2.x-p0.x)*(p2.x-p0.x) + (p2.y-p0.y) * (p2.y-p0.y) + (p2.z-p0.z)*(p2.z-p0.z))/3.0f); + + //float dgradx = abs(d_indepth[y*width+x-1] + d_indepth[y*width+x+1] - 2.0f * d_indepth[y*width+x]); + //float dgrady = 
abs(d_indepth[y*width+x-width] + d_indepth[y*width+x+width] - 2.0f * d_indepth[y*width+x]);
+
+
+		if(dU > thre) d_output[y*width+x] = 0;
+		if(dV > thre) d_output[width*height+y*width+x] = 0;
+
+		//remove depth discontinuities
+		const int r = 1;
+		const float thres = 0.01f;
+
+		const float pCC = d_indepth[y*width+x];
+		for(int i = -r; i<=r; i++)
+		{
+			for(int j = -r; j<=r; j++)
+			{
+				int currentX = x+j;
+				int currentY = y+i;
+
+				if(currentX >= 0 && currentX < width && currentY >= 0 && currentY < height)
+				{
+					float d = d_indepth[currentY*width+currentX];
+
+					if(d != MINF && fabs(pCC-d) > thres)
+					{
+						d_output[y*width+x] = 0;
+						d_output[width*height+y*width+x] = 0;
+						return;
+					}
+				}
+			}
+		}
+	}
+}
+
+extern "C" void computeMaskEdgeMapFloat4(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	computeMaskEdgeMapFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, d_indepth, threshold, width, height);
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Clear Decision Array for Patch Depth Mask
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void clearDecissionArrayPatchDepthMaskDevice(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight) d_output[y*inputWidth+x] = 0;
+}
+
+extern "C" void clearDecissionArrayPatchDepthMask(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
+{
+	const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	clearDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, inputWidth, inputHeight);
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Decision Array for Patch Depth Mask
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void computeDecissionArrayPatchDepthMaskDevice(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight)
+	{
+		const int patchId_x = x/patchSize;
+		const int patchId_y = y/patchSize;
+		const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
+
+		const float d = d_input[y*inputWidth+x];
+		if(d != MINF) atomicMax(&d_output[patchId_y*nPatchesWidth+patchId_x], 1);
+	}
+}
+
+extern "C" void computeDecissionArrayPatchDepthMask(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
+{
+	const dim3
gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, patchSize, inputWidth, inputHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Compute Remapping Array for Patch Depth Mask +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void computeRemappingArrayPatchDepthMaskDevice(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight) + { + const int patchId_x = x/patchSize; + const int patchId_y = y/patchSize; + + const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize; + + const float d = d_input[y*inputWidth+x]; + if(d != MINF) d_output[d_prefixSum[patchId_y*nPatchesWidth+patchId_x]-1] = patchId_y*nPatchesWidth+patchId_x; + } +} + +extern "C" void computeRemappingArrayPatchDepthMask(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight) +{ + const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeRemappingArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, d_prefixSum, patchSize, inputWidth, inputHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Debug Patch Remap Array +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void DebugPatchRemapArrayDevice(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + + if(x < numElements) + { + int patchID = d_remapArray[x]; + + const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize; + const int patchId_x = patchID%nPatchesWidth; + const int patchId_y = patchID/nPatchesWidth; + + for(unsigned int i = 0; i<patchSize; i++) + { + for(unsigned int j = 0; j<patchSize; j++) + { + const int pixel_x = patchId_x*patchSize; + const int pixel_y = patchId_y*patchSize; + + d_mask[(pixel_y+i)*inputWidth+(pixel_x+j)] = 3.0f; + } + } + } +} + +extern "C" void DebugPatchRemapArray(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight) +{ + const dim3 gridSize((numElements + T_PER_BLOCK*T_PER_BLOCK - 1)/(T_PER_BLOCK*T_PER_BLOCK)); + const dim3 blockSize(T_PER_BLOCK*T_PER_BLOCK); + + DebugPatchRemapArrayDevice<<<gridSize, blockSize>>>(d_mask, d_remapArray, patchSize, numElements, inputWidth, inputHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + 
cutilCheckMsg(__FUNCTION__); +#endif +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Resample Float Map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float bilinearInterpolationFloat(float x, float y, float* d_input, unsigned int imageWidth, unsigned int imageHeight) +{ + const int2 p00 = make_int2(floor(x), floor(y)); + const int2 p01 = p00 + make_int2(0.0f, 1.0f); + const int2 p10 = p00 + make_int2(1.0f, 0.0f); + const int2 p11 = p00 + make_int2(1.0f, 1.0f); + + const float alpha = x - p00.x; + const float beta = y - p00.y; + + float s0 = 0.0f; float w0 = 0.0f; + if(p00.x < imageWidth && p00.y < imageHeight) { float v00 = d_input[p00.y*imageWidth + p00.x]; if(v00 != MINF) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } } + if(p10.x < imageWidth && p10.y < imageHeight) { float v10 = d_input[p10.y*imageWidth + p10.x]; if(v10 != MINF) { s0 += alpha *v10; w0 += alpha ; } } + + float s1 = 0.0f; float w1 = 0.0f; + if(p01.x < imageWidth && p01.y < imageHeight) { float v01 = d_input[p01.y*imageWidth + p01.x]; if(v01 != MINF) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} } + if(p11.x < imageWidth && p11.y < imageHeight) { float v11 = d_input[p11.y*imageWidth + p11.x]; if(v11 != MINF) { s1 += alpha *v11; w1 += alpha ;} } + + const float p0 = s0/w0; + const float p1 = s1/w1; + + float ss = 0.0f; float ww = 0.0f; + if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); } + if(w1 > 0.0f) { ss += beta *p1; ww += beta ; } + + if(ww > 0.0f) return ss/ww; + else return MINF; +} + +__global__ void resampleFloatMapDevice(float* d_colorMapResampledFloat, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x < outputWidth && y < outputHeight) + { + const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1); + const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1); + + const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f); + const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f); + + if(xInput < inputWidth && yInput < inputHeight) + { + d_colorMapResampledFloat[y*outputWidth+x] = bilinearInterpolationFloat(x*scaleWidth, y*scaleHeight, d_colorMapFloat, inputWidth, inputHeight); + } + } +} + +extern "C" void resampleFloatMap(float* d_colorMapResampledFloat, unsigned int outputWidth, unsigned int outputHeight, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight) +{ + const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + resampleFloatMapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat, d_colorMapFloat, inputWidth, inputHeight, outputWidth, outputHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Resample Float4 Map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +inline 
__device__ float4 bilinearInterpolationFloat4(float x, float y, float4* d_input, unsigned int imageWidth, unsigned int imageHeight) +{ + const int2 p00 = make_int2(floor(x), floor(y)); + const int2 p01 = p00 + make_int2(0.0f, 1.0f); + const int2 p10 = p00 + make_int2(1.0f, 0.0f); + const int2 p11 = p00 + make_int2(1.0f, 1.0f); + + const float alpha = x - p00.x; + const float beta = y - p00.y; + + //const float INVALID = 0.0f; + const float INVALID = MINF; + + float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f; + if(p00.x < imageWidth && p00.y < imageHeight) { float4 v00 = d_input[p00.y*imageWidth + p00.x]; if(v00.x != INVALID && v00.y != INVALID && v00.z != INVALID) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } } + if(p10.x < imageWidth && p10.y < imageHeight) { float4 v10 = d_input[p10.y*imageWidth + p10.x]; if(v10.x != INVALID && v10.y != INVALID && v10.z != INVALID) { s0 += alpha *v10; w0 += alpha ; } } + + float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f; + if(p01.x < imageWidth && p01.y < imageHeight) { float4 v01 = d_input[p01.y*imageWidth + p01.x]; if(v01.x != INVALID && v01.y != INVALID && v01.z != INVALID) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} } + if(p11.x < imageWidth && p11.y < imageHeight) { float4 v11 = d_input[p11.y*imageWidth + p11.x]; if(v11.x != INVALID && v11.y != INVALID && v11.z != INVALID) { s1 += alpha *v11; w1 += alpha ;} } + + const float4 p0 = s0/w0; + const float4 p1 = s1/w1; + + float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f; + if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); } + if(w1 > 0.0f) { ss += beta *p1; ww += beta ; } + + if(ww > 0.0f) return ss/ww; + else return make_float4(MINF, MINF, MINF, MINF); +} + +__global__ void resampleFloat4MapDevice(float4* d_colorMapResampledFloat4, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x < outputWidth && y < outputHeight) + { + const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1); + const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1); + + const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f); + const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f); + + if(xInput < inputWidth && yInput < inputHeight) + { + d_colorMapResampledFloat4[y*outputWidth+x] = bilinearInterpolationFloat4(x*scaleWidth, y*scaleHeight, d_colorMapFloat4, inputWidth, inputHeight); + } + } +} + +extern "C" void resampleFloat4Map(float4* d_colorMapResampledFloat4, unsigned int outputWidth, unsigned int outputHeight, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight) +{ + const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + resampleFloat4MapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat4, d_colorMapFloat4, inputWidth, inputHeight, outputWidth, outputHeight); + #ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Resample Unsigned Char Map 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void downsampleUnsignedCharMapDevice(unsigned char* d_MapResampled, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight, unsigned int layerOffsetInput, unsigned int layerOffsetOutput) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= outputWidth || y >= outputHeight) return; + + unsigned char res = 0; + + const unsigned int inputX = 2*x; + const unsigned int inputY = 2*y; + + if((inputY+0) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+0))]; + if((inputY+0) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+1))]; + if((inputY+1) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+0))]; + if((inputY+1) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+1))]; + + if(res == 4) d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 1; + else d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 0; +} + +extern "C" void downsampleUnsignedCharMap(unsigned char* d_MapResampled, unsigned int outputWidth, unsigned int outputHeight, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight) +{ + const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, 0, 0); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + + downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, inputWidth*inputHeight, outputWidth*outputHeight); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Convert Edge Mask to Float Map +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convertEdgeMaskToFloatDevice(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if(x >= width || y >= height) return; + + d_output[y*width+x] = min(d_input[y*width+x], d_input[width*height+y*width+x]); +} + +extern "C" void convertEdgeMaskToFloat(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + convertEdgeMaskToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + + 
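+// Illustrative sketch (not part of the original API): how the two-layer edge
+// mask above is typically chained.  computeMaskEdgeMapFloat4 writes TWO
+// stacked width*height layers (horizontal edges first, vertical edges
+// second), downsampleUnsignedCharMap halves both layers with a 2x2 "all set"
+// test, and convertEdgeMaskToFloat collapses the two layers with a per-pixel
+// min.  The wrapper and buffer names are assumptions for illustration only;
+// the three entry points are the real ones defined above.
+//
+// static void buildHalfResEdgeMask(float* d_maskOut, unsigned char* d_tmpFull, unsigned char* d_tmpHalf,
+//                                  float4* d_positions, float* d_depth, float threshold,
+//                                  unsigned int width, unsigned int height)
+// {
+// 	computeMaskEdgeMapFloat4(d_tmpFull, d_positions, d_depth, threshold, width, height); // needs 2*w*h bytes
+// 	downsampleUnsignedCharMap(d_tmpHalf, width/2, height/2, d_tmpFull, width, height);   // needs 2*(w/2)*(h/2) bytes
+// 	convertEdgeMaskToFloat(d_maskOut, d_tmpHalf, width/2, height/2);                     // needs (w/2)*(h/2) floats
+// }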
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Dilate Depth Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void dilateDepthMapDevice(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= 0 && x < width && y >= 0 && y < height)
+	{
+		float sum = 0.0f;
+		float count = 0.0f;
+		float oldDepth = d_inputOrig[y*width+x];
+		if(oldDepth != MINF && oldDepth != 0)
+		{
+			for(int i = -structureSize; i<=structureSize; i++)
+			{
+				for(int j = -structureSize; j<=structureSize; j++)
+				{
+					if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
+					{
+						const float d = d_input[(y+i)*width+(x+j)];
+
+						if(d != MINF && d != 0.0f && fabs(d-oldDepth) < 0.05f)
+						{
+							sum += d;
+							count += 1.0f;
+						}
+					}
+				}
+			}
+		}
+
+		if(count > ((2*structureSize+1)*(2*structureSize+1))/36) d_output[y*width+x] = 1.0f;
+		else d_output[y*width+x] = MINF;
+	}
+}
+
+extern "C" void dilateDepthMapMask(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	dilateDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, d_inputOrig, structureSize, width, height);
+#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Mean Filter Depth Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void removeDevMeanMapMaskDevice(float* d_output, float* d_input, int structureSize, int width, int height)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= 0 && x < width && y >= 0 && y < height)
+	{
+		// Copy the input through only after the bounds check; the previous
+		// unconditional store here was out of bounds for threads past the
+		// image border.
+		d_output[y*width+x] = d_input[y*width+x];
+
+		float oldDepth = d_input[y*width+x];
+
+		float mean = 0.0f;
+		float meanSquared = 0.0f;
+		float count = 0.0f;
+		for(int i = -structureSize; i<=structureSize; i++)
+		{
+			for(int j = -structureSize; j<=structureSize; j++)
+			{
+				if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
+				{
+					float depth = d_input[(y+i)*width+(x+j)];
+					if(depth == MINF)
+					{
+						depth = 8.0f;
+					}
+
+					if(depth > 0.0f)
+					{
+						mean += depth;
+						meanSquared += depth*depth;
+						count += 1.0f;
+					}
+				}
+			}
+		}
+
+		// An empty window (all samples <= 0) would otherwise divide by zero.
+		if(count == 0.0f) return;
+
+		mean/=count;
+		meanSquared/=count;
+
+		float stdDev = sqrt(meanSquared-mean*mean);
+
+		if(fabs(oldDepth-mean) > 0.5f*stdDev)// || stdDev> 0.005f)
+		{
+			d_output[y*width+x] = MINF;
+		}
+	}
+}
+
+extern "C" void removeDevMeanMapMask(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height)
+{
+	const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	removeDevMeanMapMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height);
+	#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+
+
+
+// Nearest neighbour
+inline __device__ bool getValueNearestNeighbourNoCheck(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
+{
+	const int u = (int)(p.x + 0.5f);
+	const int v = (int)(p.y + 0.5f);
+
+	// >= rather than > : u == imageWidth (or v == imageHeight) is already one
+	// past the last valid pixel; the previous test allowed an out-of-bounds read.
+	if(u < 0 || u >= (int)imageWidth || v < 0 || v >= (int)imageHeight) return false;
+
+	*outValue = inputMap[v*imageWidth + u];
+
+	return true;
+}
+
+inline __device__ bool getValueNearestNeighbour(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
+{
+	bool valid = getValueNearestNeighbourNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
+	return valid && (outValue->x != MINF && outValue->y != MINF && outValue->z != MINF);
+}
+
+// Nearest neighbour
+inline __device__ bool getValueNearestNeighbourFloatNoCheck(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
+{
+	const int u = (int)(p.x + 0.5f);
+	const int v = (int)(p.y + 0.5f);
+
+	if(u < 0 || u >= (int)imageWidth || v < 0 || v >= (int)imageHeight) return false;
+
+	*outValue = inputMap[v*imageWidth + u];
+
+	return true;
+}
+
+inline __device__ bool getValueNearestNeighbourFloat(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
+{
+	bool valid = getValueNearestNeighbourFloatNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
+	return valid && (*outValue != MINF);
+}
+
+/////////////////////////////////////////////
+// Compute derivatives in camera space
+/////////////////////////////////////////////
+
+__global__ void computeDerivativesCameraSpaceDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int index = y*imageWidth+x;
+
+	if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
+	{
+		d_positionsDU[index] = make_float4(MINF, MINF, MINF, MINF);
+		d_positionsDV[index] = make_float4(MINF, MINF, MINF, MINF);
+
+		if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1)
+		{
+			float4 pos00; bool valid00 = getValueNearestNeighbour(make_float2(x-1, y-1), d_positions, imageWidth, imageHeight, &pos00); if(!valid00) return;
+			float4 pos01; bool valid01 = getValueNearestNeighbour(make_float2(x-1, y-0), d_positions, imageWidth, imageHeight, &pos01); if(!valid01) return;
+			float4 pos02; bool valid02 = getValueNearestNeighbour(make_float2(x-1, y+1), d_positions, imageWidth, imageHeight, &pos02); if(!valid02) return;
+
+			float4 pos10; bool valid10 = getValueNearestNeighbour(make_float2(x-0, y-1), d_positions, imageWidth, imageHeight, &pos10); if(!valid10) return;
+			float4 pos11; bool valid11 = getValueNearestNeighbour(make_float2(x-0, y-0), d_positions, imageWidth, imageHeight, &pos11); if(!valid11) return;
+			float4 pos12; bool valid12 = getValueNearestNeighbour(make_float2(x-0, y+1), d_positions, imageWidth, imageHeight, &pos12); if(!valid12) return;
+
+			float4 pos20; bool valid20 = getValueNearestNeighbour(make_float2(x+1, y-1), d_positions, imageWidth, imageHeight, &pos20); if(!valid20) return;
+			float4 pos21; bool valid21 = getValueNearestNeighbour(make_float2(x+1, y-0), d_positions, imageWidth, imageHeight, &pos21); if(!valid21) return;
+			float4 pos22; bool valid22 = getValueNearestNeighbour(make_float2(x+1, y+1), d_positions, imageWidth, imageHeight, &pos22); if(!valid22) return;
+
+			float4 resU =
(-1.0f)*pos00 + (1.0f)*pos20 + + (-2.0f)*pos01 + (2.0f)*pos21 + + (-1.0f)*pos02 + (1.0f)*pos22; + resU /= 8.0f; + + float4 resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 + + ( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22; + resV /= 8.0f; + + //if(mat3x1(make_float3(resU)).norm1D() > 0.02f) return; + //if(mat3x1(make_float3(resV)).norm1D() > 0.02f) return; + + d_positionsDU[index] = resU; + d_positionsDV[index] = resV; + } + } +} + +extern "C" void computeDerivativesCameraSpace(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV) +{ + + const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + computeDerivativesCameraSpaceDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_positionsDU, d_positionsDV); + #ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif +} + + +///////////////////////////////////////////// +// Compute Intensity and Derivatives +///////////////////////////////////////////// + +__global__ void computeIntensityAndDerivativesDevice(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int index = y*imageWidth+x; + + if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight) + { + d_intensityAndDerivatives[index] = make_float4(MINF, MINF, MINF, MINF); + + if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1) + { + float pos00; bool valid00 = getValueNearestNeighbourFloat(make_float2(x-1, y-1), d_intensity, imageWidth, imageHeight, &pos00); if(!valid00) return; + float pos01; bool valid01 = getValueNearestNeighbourFloat(make_float2(x-1, y-0), d_intensity, imageWidth, imageHeight, &pos01); if(!valid01) return; + float pos02; bool valid02 = getValueNearestNeighbourFloat(make_float2(x-1, y+1), d_intensity, imageWidth, imageHeight, &pos02); if(!valid02) return; + + float pos10; bool valid10 = getValueNearestNeighbourFloat(make_float2(x-0, y-1), d_intensity, imageWidth, imageHeight, &pos10); if(!valid10) return; + float pos11; bool valid11 = getValueNearestNeighbourFloat(make_float2(x-0, y-0), d_intensity, imageWidth, imageHeight, &pos11); if(!valid11) return; + float pos12; bool valid12 = getValueNearestNeighbourFloat(make_float2(x-0, y+1), d_intensity, imageWidth, imageHeight, &pos12); if(!valid12) return; + + float pos20; bool valid20 = getValueNearestNeighbourFloat(make_float2(x+1, y-1), d_intensity, imageWidth, imageHeight, &pos20); if(!valid20) return; + float pos21; bool valid21 = getValueNearestNeighbourFloat(make_float2(x+1, y-0), d_intensity, imageWidth, imageHeight, &pos21); if(!valid21) return; + float pos22; bool valid22 = getValueNearestNeighbourFloat(make_float2(x+1, y+1), d_intensity, imageWidth, imageHeight, &pos22); if(!valid22) return; + + float resU = (-1.0f)*pos00 + (1.0f)*pos20 + + (-2.0f)*pos01 + (2.0f)*pos21 + + (-1.0f)*pos02 + (1.0f)*pos22; + resU /= 8.0f; + + float resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 + + ( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22; + resV /= 8.0f; + + d_intensityAndDerivatives[index] = make_float4(pos11, resU, resV, 1.0f); + } + } +} + +extern "C" void computeIntensityAndDerivatives(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives) +{ + const dim3 
gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	computeIntensityAndDerivativesDevice<<<gridSize, blockSize>>>(d_intensity, imageWidth, imageHeight, d_intensityAndDerivatives);
+	#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+
+/////////////////////////////////////////////
+// Compute gradient intensity magnitude
+/////////////////////////////////////////////
+
+__global__ void computeGradientIntensityMagnitudeDevice(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_ouput)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int index = y*imageWidth+x;
+
+	// Bounds check added: this kernel previously wrote d_ouput[index]
+	// unconditionally, out of bounds for threads past the image border.
+	if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
+	{
+		d_ouput[index] = make_float4(MINF, MINF, MINF, MINF);
+
+		float4 DU = d_inputDU[index];
+		float4 DV = d_inputDV[index];
+
+		if(DU.x != MINF && DV.x != MINF)
+		{
+			float m = sqrtf(DU.x*DU.x+DV.x*DV.x);
+
+			if(m > 0.005f)
+			{
+				d_ouput[index] = make_float4(m, m, m, 1.0f);
+			}
+		}
+	}
+}
+
+extern "C" void computeGradientIntensityMagnitude(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_ouput)
+{
+	const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	computeGradientIntensityMagnitudeDevice<<<gridSize, blockSize>>>(d_inputDU, d_inputDV, imageWidth, imageHeight, d_ouput);
+	#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+/////////////////////////////////////////////
+// Transform
+/////////////////////////////////////////////
+
+__global__ void transformCameraSpaceMapDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int index = y*imageWidth+x;
+
+	if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
+	{
+		d_output[index] = d_positions[index];
+
+		if(d_positions[index].x != MINF && d_positions[index].y != MINF && d_positions[index].z != MINF)
+		{
+			// Identity placeholder: the zero offset leaves the position
+			// unchanged; a real transform would be applied here.
+			d_output[index] = d_positions[index]+make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+	}
+}
+
+extern "C" void transformCameraSpaceMap(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
+{
+	const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	transformCameraSpaceMapDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_output);
+	#ifdef _DEBUG
+	cutilSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+	#endif
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Erode Depth Map
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void erodeDepthMapDevice(float* d_output, float* d_input, int structureSize, int width, int height, float dThresh, float fracReq)
+{
+	const int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if(x >= 0 && x < width && y >= 0 && y < height)
+	{
+		unsigned int
count = 0; + + float oldDepth = d_input[y*width+x]; + for(int i = -structureSize; i<=structureSize; i++) + { + for(int j = -structureSize; j<=structureSize; j++) + { + if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height) + { + float depth = d_input[(y+i)*width+(x+j)]; + if(depth == MINF || depth == 0.0f || fabs(depth-oldDepth) > dThresh) + { + count++; + //d_output[y*width+x] = MINF; + //return; + } + } + } + } + + unsigned int sum = (2*structureSize+1)*(2*structureSize+1); + if ((float)count/(float)sum >= fracReq) { + d_output[y*width+x] = MINF; + } else { + d_output[y*width+x] = d_input[y*width+x]; + } + } +} + +extern "C" void erodeDepthMap(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height, float dThresh, float fracReq) +{ + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + erodeDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height, dThresh, fracReq); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + + + + +//////////////////////////////////// +// Depth to HSV map conversion ///// +//////////////////////////////////// + +__device__ float3 convertHSVToRGB(const float3& hsv) { + float H = hsv.x; + float S = hsv.y; + float V = hsv.z; + + float hd = H/60.0f; + unsigned int h = (unsigned int)hd; + float f = hd-h; + + float p = V*(1.0f-S); + float q = V*(1.0f-S*f); + float t = V*(1.0f-S*(1.0f-f)); + + if(h == 0 || h == 6) + { + return make_float3(V, t, p); + } + else if(h == 1) + { + return make_float3(q, V, p); + } + else if(h == 2) + { + return make_float3(p, V, t); + } + else if(h == 3) + { + return make_float3(p, q, V); + } + else if(h == 4) + { + return make_float3(t, p, V); + } + else + { + return make_float3(V, p, q); + } +} + + +__device__ float3 convertDepthToRGB(float depth, float depthMin, float depthMax) { + float depthZeroOne = (depth - depthMin)/(depthMax - depthMin); + float x = 1.0f-depthZeroOne; + if (x < 0.0f) x = 0.0f; + if (x > 1.0f) x = 1.0f; + //return convertHSVToRGB(make_float3(240.0f*x, 1.0f, 0.5f)); + x = 360.0f*x - 120.0f; + if (x < 0.0f) x += 359.0f; + return convertHSVToRGB(make_float3(x, 1.0f, 0.5f)); +} + +__global__ void depthToHSVDevice(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth) +{ + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && x < width && y >= 0 && y < height) { + + float depth = d_input[y*width + x]; + if (depth != MINF && depth != 0.0f && depth >= minDepth && depth <= maxDepth) { + float3 c = convertDepthToRGB(depth, minDepth, maxDepth); + d_output[y*width + x] = make_float4(c, 1.0f); + } else { + d_output[y*width + x] = make_float4(0.0f); + } + } +} + +extern "C" void depthToHSV(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth) { + const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + depthToHSVDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height, minDepth, maxDepth); +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + + + + + +#endif // _CAMERA_UTIL_ \ No newline at end of file diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp index 
42e14f4581aecbf4a21aa01afa12d3d710523de0..a74c03a1c5377d81306100d0b3df3b0bf396f047 100644 --- a/applications/reconstruct/src/main.cpp +++ b/applications/reconstruct/src/main.cpp @@ -7,6 +7,9 @@ #include <glog/logging.h> #include <ftl/config.h> #include <ftl/configuration.hpp> +#include <ftl/depth_camera.hpp> +#include <ftl/scene_rep_hash_sdf.hpp> +#include <ftl/ray_cast_sdf.hpp> #include <ftl/rgbd.hpp> // #include <zlib.h> @@ -88,14 +91,14 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src) { for(int i=0;i<rgb.rows;i++) { const float *sptr = depth.ptr<float>(i); for(int j=0;j<rgb.cols;j++) { - float d = sptr[j] * 1000.0f; + float d = sptr[j]; // * 1000.0f; pcl::PointXYZRGB point; point.x = (((double)j + CX) / FX) * d; point.y = (((double)i + CY) / FY) * d; point.z = d; - if (point.x == INFINITY || point.y == INFINITY || point.z > 20000.0f || point.z < 0.04f) { + if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) { point.x = 0.0f; point.y = 0.0f; point.z = 0.0f; } @@ -127,14 +130,14 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src, const for(int i=0;i<rgb.rows;i++) { const float *sptr = depth.ptr<float>(i); for(int j=0;j<rgb.cols;j++) { - float d = sptr[j] * 1000.0f; + float d = sptr[j]; // * 1000.0f; pcl::PointXYZRGB point; point.x = (((double)j + CX) / FX) * d; point.y = (((double)i + CY) / FY) * d; point.z = d; - if (point.x == INFINITY || point.y == INFINITY || point.z > 20000.0f || point.z < 0.04f) { + if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) { point.x = 0.0f; point.y = 0.0f; point.z = 0.0f; } @@ -220,8 +223,14 @@ void saveRegistration(std::map<string, Eigen::Matrix4f> registration) { file << save; } +struct Cameras { + RGBDSource *source; + DepthCameraData gpu; + DepthCameraParams params; +}; + template <template<class> class Container> -std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<RGBDSource*> &inputs) { +std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<Cameras> &inputs) { std::map<string, Eigen::Matrix4f> registration; @@ -244,17 +253,17 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta size_t ref_i; bool found = false; for (size_t i = 0; i < inputs.size(); i++) { - if (inputs[i]->getConfig()["uri"] == ref_uri) { + if (inputs[i].source->getConfig()["uri"] == ref_uri) { ref_i = i; found = true; break; } } - + if (!found) { LOG(ERROR) << "Reference input not found!"; return registration; } for (auto &input : inputs) { - cv::namedWindow("Registration: " + (string)input->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL); + cv::namedWindow("Registration: " + (string)input.source->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL); } // vector for every input: vector of point clouds of patterns @@ -266,7 +275,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta vector<PointCloud<PointXYZ>::Ptr> patterns_iter(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { - RGBDSource *input = inputs[i]; + RGBDSource *input = inputs[i].source; Mat rgb, depth; input->getRGBD(rgb, depth); @@ -289,7 +298,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta else { LOG(WARNING) << "Pattern not detected on all inputs";} } - for (auto &input : inputs) { cv::destroyWindow("Registration: " + (string)input->getConfig()["uri"]); } + for (auto &input : inputs) { 
cv::destroyWindow("Registration: " + (string)input.source->getConfig()["uri"]); } for (size_t i = 0; i < inputs.size(); i++) { Eigen::Matrix4f T; @@ -299,13 +308,48 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta else { T = ftl::registration::findTransformation(patterns[i], patterns[ref_i]); } - registration[(string)inputs[i]->getConfig()["uri"]] = T; + registration[(string)inputs[i].source->getConfig()["uri"]] = T; } saveRegistration(registration); return registration; } #endif +template<class T> +Eigen::Matrix<T,4,4> lookAt +( + Eigen::Matrix<T,3,1> const & eye, + Eigen::Matrix<T,3,1> const & center, + Eigen::Matrix<T,3,1> const & up +) +{ + typedef Eigen::Matrix<T,4,4> Matrix4; + typedef Eigen::Matrix<T,3,1> Vector3; + + Vector3 f = (center - eye).normalized(); + Vector3 u = up.normalized(); + Vector3 s = f.cross(u).normalized(); + u = s.cross(f); + + Matrix4 res; + res << s.x(),s.y(),s.z(),-s.dot(eye), + u.x(),u.y(),u.z(),-u.dot(eye), + -f.x(),-f.y(),-f.z(),f.dot(eye), + 0,0,0,1; + + return res; +} + +using MouseAction = std::function<void(int, int, int, int)>; + +static void setMouseAction(const std::string& winName, const MouseAction &action) +{ + cv::setMouseCallback(winName, + [] (int event, int x, int y, int flags, void* userdata) { + (*(MouseAction*)userdata)(event, x, y, flags); + }, (void*)&action); +} + static void run() { Universe net(config["net"]); @@ -317,8 +361,8 @@ static void run() { return; } - std::deque<RGBDSource*> inputs; - std::vector<Display> displays; + std::deque<Cameras> inputs; + //std::vector<Display> displays; // TODO Allow for non-net source types for (auto &src : config["sources"]) { @@ -326,9 +370,20 @@ static void run() { if (!in) { LOG(ERROR) << "Unrecognised source: " << src; } else { - inputs.push_back(in); - displays.emplace_back(config["display"], src["uri"]); - LOG(INFO) << (string)src["uri"] << " loaded"; + auto &cam = inputs.emplace_back(); + cam.source = in; + cam.params.fx = in->getParameters().fx; + cam.params.fy = in->getParameters().fy; + cam.params.mx = -in->getParameters().cx; + cam.params.my = -in->getParameters().cy; + cam.params.m_imageWidth = in->getParameters().width; + cam.params.m_imageHeight = in->getParameters().height; + cam.params.m_sensorDepthWorldMax = in->getParameters().maxDepth; + cam.params.m_sensorDepthWorldMin = in->getParameters().minDepth; + cam.gpu.alloc(cam.params); + + //displays.emplace_back(config["display"], src["uri"]); + LOG(INFO) << (string)src["uri"] << " loaded " << cam.params.m_imageWidth; } } @@ -336,63 +391,126 @@ static void run() { // load point cloud transformations -#ifdef HAVE_PCL std::map<string, Eigen::Matrix4f> registration; if (config["registration"]["calibration"]["run"]) { registration = runRegistration(net, inputs); } else { + LOG(INFO) << "LOAD REG"; registration = loadRegistration(); } + + LOG(INFO) << "Assigning poses"; vector<Eigen::Matrix4f> T; for (auto &input : inputs) { - Eigen::Matrix4f RT = (registration.count((string)input->getConfig()["uri"]) > 0) ? registration[(string)input->getConfig()["uri"]] : registration["default"]; + LOG(INFO) << (unsigned long long)input.source; + Eigen::Matrix4f RT = (registration.count(input.source->getConfig()["uri"].get<string>()) > 0) ? 
registration[(string)input.source->getConfig()["uri"]] : registration["default"]; T.push_back(RT); + input.source->setPose(RT); } - // - vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size()); + //vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size()); Display display_merged(config["display"], "Merged"); // todo + CUDARayCastSDF rays(config["voxelhash"]); + ftl::voxhash::SceneRep scene(config["voxelhash"]); + + cv::Mat colour_array(cv::Size(rays.getRayCastParams().m_width,rays.getRayCastParams().m_height), CV_8UC3); + + // Force window creation + display_merged.render(colour_array); + display_merged.wait(1); + + unsigned char frameCount = 0; + bool paused = false; + float cam_x = 0.0f; + float cam_z = 0.0f; + + Eigen::Vector3f eye(0.0f, 0.0f, 0.0f); + Eigen::Vector3f centre(0.0f, 0.0f, -4.0f); + Eigen::Vector3f up(0,1.0f,0); + Eigen::Vector3f lookPoint(0.0f,1.0f,-4.0f); + Eigen::Matrix4f viewPose; + float lerpSpeed = 0.4f; + + // Keyboard camera controls + display_merged.onKey([&paused,&eye,¢re](int key) { + LOG(INFO) << "Key = " << key; + if (key == 32) paused = !paused; + else if (key == 81) eye[0] += 0.02f; + else if (key == 83) eye[0] -= 0.02f; + else if (key == 84) eye[2] += 0.02f; + else if (key == 82) eye[2] -= 0.02f; + }); + + // TODO(Nick) Calculate "camera" properties of viewport. + MouseAction mouseact = [&inputs,&lookPoint,&viewPose]( int event, int ux, int uy, int) { + LOG(INFO) << "Mouse " << ux << "," << uy; + if (event == 1) { // click + const float x = ((float)ux-inputs[0].params.mx) / inputs[0].params.fx; + const float y = ((float)uy-inputs[0].params.my) / inputs[0].params.fy; + const float depth = -4.0f; + Eigen::Vector4f camPos(x*depth,y*depth,depth,1.0); + Eigen::Vector4f worldPos = viewPose * camPos; + lookPoint = Eigen::Vector3f(worldPos[0],worldPos[1],worldPos[2]); + } + }; + ::setMouseAction("Image", mouseact); - int active = displays.size(); + int active = inputs.size(); while (active > 0 && display_merged.active()) { active = 0; - net.broadcast("grab"); // To sync cameras - - PointCloud<PointXYZRGB>::Ptr cloud(new PointCloud<PointXYZRGB>); - - for (size_t i = 0; i < inputs.size(); i++) { - //Display &display = displays[i]; - RGBDSource *input = inputs[i]; - Mat rgb, depth; - input->getRGBD(rgb,depth); - - //if (!display.active()) continue; - active += 1; - - clouds[i] = ftl::rgbd::createPointCloud(input); //, (i==0) ? 
cv::Point3_<uchar>(255,0,0) : cv::Point3_<uchar>(0,0,255)); - - //display.render(rgb, depth,input->getParameters()); - //display.render(clouds[i]); - //display.wait(5); - } + if (!paused) { + net.broadcast("grab"); // To sync cameras + scene.nextFrame(); - for (size_t i = 0; i < clouds.size(); i++) { - pcl::transformPointCloud(*clouds[i], *clouds[i], T[i]); - *cloud += *clouds[i]; + for (size_t i = 0; i < inputs.size(); i++) { + //if (i == 1) continue; + //Display &display = displays[i]; + RGBDSource *input = inputs[i].source; + Mat rgb, depth; + //LOG(INFO) << "GetRGB"; + input->getRGBD(rgb,depth); + + //if (!display.active()) continue; + active += 1; + + //clouds[i] = ftl::rgbd::createPointCloud(input); + + //display.render(rgb, depth,input->getParameters()); + //display.render(clouds[i]); + //display.wait(5); + + //LOG(INFO) << "Data size: " << depth.cols << "," << depth.rows; + if (depth.cols == 0) continue; + + Mat rgba; + cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA); + + inputs[i].params.flags = frameCount; + + // Send to GPU and merge view into scene + inputs[i].gpu.updateParams(inputs[i].params); + inputs[i].gpu.updateData(depth, rgba); + scene.integrate(inputs[i].source->getPose(), inputs[i].gpu, inputs[i].params, nullptr); + } + } else { + active = 1; } - - pcl::UniformSampling<PointXYZRGB> uniform_sampling; - uniform_sampling.setInputCloud(cloud); - uniform_sampling.setRadiusSearch(0.1f); - uniform_sampling.filter(*cloud); - - display_merged.render(cloud); - display_merged.wait(50); + + frameCount++; + + // Set virtual camera transformation matrix + //Eigen::Affine3f transform(Eigen::Translation3f(cam_x,0.0f,cam_z)); + centre += (lookPoint - centre) * (lerpSpeed * 0.1f); + viewPose = lookAt<float>(eye,centre,up); // transform.matrix(); + + // Call GPU renderer and download result into OpenCV mat + rays.render(scene.getHashData(), scene.getHashParams(), inputs[0].gpu, viewPose); + rays.getRayCastData().download(nullptr, (uchar3*)colour_array.data, rays.getRayCastParams()); + display_merged.render(colour_array); + display_merged.wait(1); } -#endif - // TODO non-PCL (?) 
} int main(int argc, char **argv) { diff --git a/applications/reconstruct/src/ray_cast_sdf.cpp b/applications/reconstruct/src/ray_cast_sdf.cpp new file mode 100644 index 0000000000000000000000000000000000000000..093d1a2349f5cfd4f260342b7e870301242fbe92 --- /dev/null +++ b/applications/reconstruct/src/ray_cast_sdf.cpp @@ -0,0 +1,85 @@ +//#include <stdafx.h> + +#include <ftl/voxel_hash.hpp> + +//#include "Util.h" + +#include <ftl/ray_cast_sdf.hpp> + + +extern "C" void renderCS( + const ftl::voxhash::HashData& hashData, + const RayCastData &rayCastData, + const DepthCameraData &cameraData, + const RayCastParams &rayCastParams); + +extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height); +extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData); + +extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params); +extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData, + const RayCastData &rayCastData, const RayCastParams &rayCastParams); + +extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams ¶ms); + + + +void CUDARayCastSDF::create(const RayCastParams& params) +{ + m_params = params; + m_data.allocate(m_params); + //m_rayIntervalSplatting.OnD3D11CreateDevice(DXUTGetD3D11Device(), params.m_width, params.m_height); +} + +void CUDARayCastSDF::destroy(void) +{ + m_data.free(); + //m_rayIntervalSplatting.OnD3D11DestroyDevice(); +} + +void CUDARayCastSDF::render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform) +{ + rayIntervalSplatting(hashData, hashParams, cameraData, lastRigidTransform); + //m_data.d_rayIntervalSplatMinArray = m_rayIntervalSplatting.mapMinToCuda(); + //m_data.d_rayIntervalSplatMaxArray = m_rayIntervalSplatting.mapMaxToCuda(); + + if (hash_render_) nickRenderCUDA(hashData, hashParams, m_data, cameraData, m_params); + else renderCS(hashData, m_data, cameraData, m_params); + + //convertToCameraSpace(cameraData); + if (!m_params.m_useGradients) + { + computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height); + } + + //m_rayIntervalSplatting.unmapCuda(); + +} + + +void CUDARayCastSDF::convertToCameraSpace(const DepthCameraData& cameraData) +{ + convertDepthFloatToCameraSpaceFloat3(m_data.d_depth3, m_data.d_depth, m_params.m_intrinsicsInverse, m_params.m_width, m_params.m_height, cameraData); + + if(!m_params.m_useGradients) { + computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height); + } +} + +void CUDARayCastSDF::rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform) +{ + if (hashParams.m_numOccupiedBlocks == 0) return; + + if (m_params.m_maxNumVertices <= 6*hashParams.m_numOccupiedBlocks) { // 6 verts (2 triangles) per block + throw std::runtime_error("not enough space for vertex buffer for ray interval splatting"); + } + + m_params.m_numOccupiedSDFBlocks = hashParams.m_numOccupiedBlocks; + m_params.m_viewMatrix = MatrixConversion::toCUDA(lastRigidTransform.inverse()); + 
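+	// m_viewMatrix maps world space into the virtual camera; the line below
+	// stores its camera-to-world inverse, which the ray caster uses to lift
+	// per-pixel camera rays into world space.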
m_params.m_viewMatrixInverse = MatrixConversion::toCUDA(lastRigidTransform); + + m_data.updateParams(m_params); // !!! debugging + + //don't use ray interval splatting (cf CUDARayCastSDF.cu -> line 40 + //m_rayIntervalSplatting.rayIntervalSplatting(DXUTGetD3D11DeviceContext(), hashData, cameraData, m_data, m_params, m_params.m_numOccupiedSDFBlocks*6); +} \ No newline at end of file diff --git a/applications/reconstruct/src/ray_cast_sdf.cu b/applications/reconstruct/src/ray_cast_sdf.cu new file mode 100644 index 0000000000000000000000000000000000000000..d2c7ef72e59a5c5b3f7e35e35bdd7dac567b3fb1 --- /dev/null +++ b/applications/reconstruct/src/ray_cast_sdf.cu @@ -0,0 +1,483 @@ + +#include <cuda_runtime.h> + +#include <ftl/cuda_matrix_util.hpp> + +#include <ftl/depth_camera.hpp> +#include <ftl/voxel_hash.hpp> +#include <ftl/ray_cast_util.hpp> + +#define T_PER_BLOCK 8 +#define NUM_GROUPS_X 1024 + +//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMinTextureRef; +//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMaxTextureRef; + +__global__ void renderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const RayCastParams& rayCastParams = c_rayCastParams; + + if (x < rayCastParams.m_width && y < rayCastParams.m_height) { + rayCastData.d_depth[y*rayCastParams.m_width+x] = MINF; + rayCastData.d_depth3[y*rayCastParams.m_width+x] = make_float3(MINF,MINF,MINF); + rayCastData.d_normals[y*rayCastParams.m_width+x] = make_float4(MINF,MINF,MINF,MINF); + rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0); + + float3 camDir = normalize(cameraData.kinectProjToCamera(x, y, 1.0f)); + float3 worldCamPos = rayCastParams.m_viewMatrixInverse * make_float3(0.0f, 0.0f, 0.0f); + float4 w = rayCastParams.m_viewMatrixInverse * make_float4(camDir, 0.0f); + float3 worldDir = normalize(make_float3(w.x, w.y, w.z)); + + ////use ray interval splatting + //float minInterval = tex2D(rayMinTextureRef, x, y); + //float maxInterval = tex2D(rayMaxTextureRef, x, y); + + //don't use ray interval splatting + float minInterval = rayCastParams.m_minDepth; + float maxInterval = rayCastParams.m_maxDepth; + + //if (minInterval == 0 || minInterval == MINF) minInterval = rayCastParams.m_minDepth; + //if (maxInterval == 0 || maxInterval == MINF) maxInterval = rayCastParams.m_maxDepth; + //TODO MATTHIAS: shouldn't this return in the case no interval is found? 
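+		// (The two early-outs below handle exactly that case.)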
+		if (minInterval == 0 || minInterval == MINF) return;
+		if (maxInterval == 0 || maxInterval == MINF) return;
+
+		// debugging
+		//if (maxInterval < minInterval) {
+		//	printf("ERROR (%d,%d): [ %f, %f ]\n", x, y, minInterval, maxInterval);
+		//}
+
+		rayCastData.traverseCoarseGridSimpleSampleAll(hashData, cameraData, worldCamPos, worldDir, camDir, make_int3(x,y,1), minInterval, maxInterval);
+	}
+}
+
+extern "C" void renderCS(const ftl::voxhash::HashData& hashData, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &rayCastParams)
+{
+
+	const dim3 gridSize((rayCastParams.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (rayCastParams.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+	//cudaBindTextureToArray(rayMinTextureRef, rayCastData.d_rayIntervalSplatMinArray, channelDesc);
+	//cudaBindTextureToArray(rayMaxTextureRef, rayCastData.d_rayIntervalSplatMaxArray, channelDesc);
+
+	//printf("Ray casting render...\n");
+
+	renderKernel<<<gridSize, blockSize>>>(hashData, rayCastData, cameraData);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	//cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Nicks render approach
+////////////////////////////////////////////////////////////////////////////////
+
+__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData)
+{
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const RayCastParams& rayCastParams = c_rayCastParams;
+
+	if (x < rayCastParams.m_width && y < rayCastParams.m_height) {
+		rayCastData.d_depth_i[y*rayCastParams.m_width+x] = 0x7FFFFFFF; //PINF;
+		rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0);
+	}
+}
+
+#define SDF_BLOCK_SIZE_PAD 9
+#define SDF_BLOCK_BUFFER 1024	// > 9x9x9
+#define SDF_DX 1
+#define SDF_DY SDF_BLOCK_SIZE_PAD
+#define SDF_DZ (SDF_BLOCK_SIZE_PAD*SDF_BLOCK_SIZE_PAD)
+
+__device__
+float frac(float val) {
+	return (val - floorf(val));
+}
+__device__
+float3 frac(const float3& val) {
+	return make_float3(frac(val.x), frac(val.y), frac(val.z));
+}
+
+__host__ size_t nickSharedMem() {
+	return sizeof(float)*SDF_BLOCK_BUFFER +
+		sizeof(uchar)*SDF_BLOCK_BUFFER +
+		sizeof(float)*SDF_BLOCK_BUFFER +
+		sizeof(float3)*SDF_BLOCK_BUFFER;
+}
+
+/*__device__ void loadVoxel(const ftl::voxhash::HashData &hash, const int3 &vox, float *sdf, uint *weight, float3 *colour) {
+	ftl::voxhash::Voxel &v = hash.getVoxel(vox);
+	*sdf = v.sdf;
+	*weight = v.weight;
+	*colour = v.color;
+}*/
+
+//! computes the (local) virtual voxel pos of an index; idx in [0;728] for the padded 9^3 block
+__device__
+int3 pdelinVoxelIndex(uint idx) {
+	int x = idx % SDF_BLOCK_SIZE_PAD;
+	int y = (idx % (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD)) / SDF_BLOCK_SIZE_PAD;
+	int z = idx / (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD);
+	return make_int3(x,y,z);
+}
+
+//! computes the linearized index of a local virtual voxel pos; pos in [0;8]^3 (padded)
+__device__
+uint plinVoxelPos(const int3& virtualVoxelPos) {
+	return
+		virtualVoxelPos.z * SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD +
+		virtualVoxelPos.y * SDF_BLOCK_SIZE_PAD +
+		virtualVoxelPos.x;
+}
+
+__device__
+void deleteVoxel(ftl::voxhash::Voxel& v) {
+	v.color = make_uchar3(0,0,0);
+	v.weight = 0;
+	v.sdf = PINF;
+}
+
+// Inverse of blockLinear below: bit 0 -> x, bit 1 -> y, bit 2 -> z offsets.
+// (The unshifted masks previously gave +2/+4 offsets for the y/z neighbours.)
+__device__ inline int3 blockDelinear(const int3 &base, uint i) {
+	return make_int3(base.x + (i & 0x1), base.y + ((i & 0x2) >> 1), base.z + ((i & 0x4) >> 2));
+}
+
+__device__ inline uint blockLinear(int x, int y, int z) {
+	return x + (y << 1) + (z << 2);
+}
+
+__device__ inline void trilinearInterp(const ftl::voxhash::HashData &hashData, const ftl::voxhash::Voxel *voxels, const uint *ix, const float3 &pos, float &depth, uchar3 &colour) {
+	float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
+	const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(pos)); // Should be world position of ray, not voxel??
+	float dist = 0.0f;
+	dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
+	dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
+	dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
+	dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
+	dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
+	dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
+	dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
+	dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
+	depth = dist;
+	colour = make_uchar3(colorFloat);
+}
+
+__global__ void nickRenderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData, RayCastParams params) {
+	// TODO(Nick) Reduce bank conflicts by aligning these
+	__shared__ ftl::voxhash::Voxel voxels[SDF_BLOCK_BUFFER];
+	__shared__ ftl::voxhash::HashEntry blocks[8];
+
+	const uint i = threadIdx.x;	//inside of an SDF block
+
+	//TODO (Nick) Either don't use compactified or re-run compactification using render cam frustum
+	// NOTE: threads 1..7 read blocks[0].pos written by thread 0 without an
+	// intervening sync; this relies on threads 0..7 sharing a warp.
+	if (i == 0) blocks[0] = hashData.d_hashCompactified[blockIdx.x];
+	else if (i <= 7) blocks[i] = hashData.getHashEntryForSDFBlockPos(blockDelinear(blocks[0].pos, i));
+
+	// Make sure all hash entries are cached
+	__syncthreads();
+
+	const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(blocks[0].pos);
+	const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
+	const int3 pi = pi_base + vp;
+	const uint j = plinVoxelPos(vp);  // Padded linear index
+	const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
+
+	// Load distances and colours into shared memory + padding
+	const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[blocks[0].ptr + i];
+	voxels[j] = v;
+
+	// TODO (Nick) Coalesce memory better?
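+	// The shared buffer is padded from 8^3 to 9^3 voxels so the first voxel
+	// layer of the +x/+y/+z neighbour blocks is cached alongside this block;
+	// plinVoxelPos indexes that padded volume, which is why SDF_DY and SDF_DZ
+	// are derived from SDF_BLOCK_SIZE_PAD rather than SDF_BLOCK_SIZE.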
+ uint imod = i & 0x7; + bool v_do = imod < 3; + + if (v_do) { + const uint v_a = (i >> 3) & 0x7; + const uint v_c = (i >> 6); + const uint v_b = (imod >= 1) ? v_c : v_a; + const int3 v_cache = make_int3(((imod == 0) ? 8 : v_a), ((imod == 1) ? 8 : v_b), ((imod == 2) ? 8 : v_c)); + const int3 v_ii = make_int3((imod == 0) ? 0 : v_a, (imod == 1) ? 0 : v_b, (imod == 2) ? 0 : v_c); + const int v_block = blockLinear((imod == 0) ? 1 : 0, (imod == 1) ? 1 : 0, (imod == 2) ? 1 : 0); + ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(v_cache)]; + const uint ii = hashData.linearizeVoxelPos(v_ii); + if (blocks[v_block].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[v_block].ptr + ii]; + else deleteVoxel(padVox); + } + + /*if (vp.x == 7) { + ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x+1,vp.y,vp.z))]; + const uint ii = hashData.linearizeVoxelPos(make_int3(0,vp.y,vp.z)); + //padVox = hashData.getVoxel(make_int3(pi.x+1,pi.y,pi.z)); + if (blocks[blockLinear(1,0,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(1,0,0)].ptr + ii]; + else deleteVoxel(padVox); + } + if (vp.y == 7) { + ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y+1,vp.z))]; + const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,0,vp.z)); + //padVox = hashData.getVoxel(make_int3(pi.x,pi.y+1,pi.z)); + if (blocks[blockLinear(0,1,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,1,0)].ptr + ii]; + else deleteVoxel(padVox); + } + if (vp.z == 7) { + ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y,vp.z+1))]; + const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,vp.y,0)); + //padVox = hashData.getVoxel(make_int3(pi.x,pi.y,pi.z+1)); + if (blocks[blockLinear(0,0,1)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,0,1)].ptr + ii]; + else deleteVoxel(padVox); + }*/ + + // Indexes of the 8 neighbor voxels in one direction + const uint ix[8] = { + j, j+SDF_DX, j+SDF_DY, j+SDF_DZ, j+SDF_DX+SDF_DY, j+SDF_DY+SDF_DZ, + j+SDF_DX+SDF_DZ, j+SDF_DX+SDF_DY+SDF_DZ + }; + + __syncthreads(); + + // If any weight is 0, skip this voxel + const bool missweight = voxels[ix[0]].weight == 0 || voxels[ix[1]].weight == 0 || voxels[ix[2]].weight == 0 || + voxels[ix[3]].weight == 0 || voxels[ix[4]].weight == 0 || voxels[ix[5]].weight == 0 || + voxels[ix[6]].weight == 0 || voxels[ix[7]].weight == 0; + if (missweight) return; + + // Trilinear Interpolation (simple and fast) + /*float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f); + const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(worldPos)); // Should be world position of ray, not voxel?? 
+	float dist = 0.0f;
+	dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
+	dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
+	dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
+	dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
+	dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
+	dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
+	dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
+	dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
+
+	// Must finish using colours before updating colours
+	__syncthreads();
+
+	//voxels[j].color = make_uchar3(colorFloat);
+	//voxels[j].sdf = dist;
+
+	// What happens if filtered voxel is put back?
+	//hashData.d_SDFBlocks[blocks[0].ptr + i] = voxels[j];
+
+	//return;*/
+
+	bool is_surface = false;
+	// Identify surfaces through sign change. Since we only check in one direction
+	// it is fine to check for any sign change?
+#pragma unroll
+	for (int u=0; u<=1; u++) {
+		for (int v=0; v<=1; v++) {
+			for (int w=0; w<=1; w++) {
+				const int3 uvi = make_int3(vp.x+u,vp.y+v,vp.z+w);
+
+				// Skip these cases since we didn't load voxels properly
+				if ((uvi.x == 8 && uvi.y == 8) || (uvi.x == 8 && uvi.z == 8) || (uvi.y == 8 && uvi.z == 8)) continue;
+
+				if (signbit(voxels[j].sdf) != signbit(voxels[plinVoxelPos(uvi)].sdf)) {
+					is_surface = true;
+					break;
+				}
+			}
+		}
+	}
+
+	// Only for surface voxels, work out screen coordinates
+	// TODO Could adjust weights, strengthen on surface, weaken otherwise??
+	if (!is_surface) return;
+
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float2 screenPosf = cameraData.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f)
+
+	/*if (screenPos.x < params.m_width && screenPos.y < params.m_height &&
+			rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] > camPos.z) {
+		rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] = camPos.z;
+		rayCastData.d_colors[(screenPos.y)*params.m_width+screenPos.x] = voxels[j].color;
+	}*/
+
+	//return;
+
+	// For this voxel in hash, get its screen position and check it is on screen
+	// Convert depth map to int by x100 and use atomicMin
+	//const int pixsize = static_cast<int>(c_hashParams.m_virtualVoxelSize*c_depthCameraParams.fx/camPos.z)+1;
+	int pixsizeX = 10; // Max voxel pixels
+	int pixsizeY = 10;
+
+	for (int y=0; y<pixsizeY; y++) {
+	for (int x=0; x<pixsizeX; x++) {
+		// TODO(Nick) Within a window, check pixels that have same voxel id
+		// Then trilinear interpolate between current voxel and neighbors.
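+		// The window below splats this surface voxel over up to 10x10 pixels;
+		// depth is quantised to centimetres in the int buffer so that atomicMin
+		// can keep the nearest voxel per pixel without a critical section.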
+ const float3 pixelWorldPos = params.m_viewMatrixInverse * cameraData.kinectDepthToSkeleton(screenPos.x+x,screenPos.y+y, camPos.z); + const float3 posInVoxel = (pixelWorldPos - worldPos) / make_float3(c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize); + + if (posInVoxel.x >= 1.0f || posInVoxel.y >= 1.0f || posInVoxel.z >= 1.0f) { + pixsizeX = x; + continue; + } + + /*float depth; + uchar3 col; + trilinearInterp(hashData, voxels, ix, pixelWorldPos, depth, col);*/ + int idepth = static_cast<int>(camPos.z * 100.0f); + + // TODO (Nick) MAKE THIS ATOMIC!!!! + /*if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height && + rayCastData.d_depth_i[(screenPos.y+y)*params.m_width+screenPos.x+x] > idepth) { + rayCastData.d_depth[(screenPos.y+y)*params.m_width+screenPos.x+x] = idepth; + rayCastData.d_colors[(screenPos.y+y)*params.m_width+screenPos.x+x] = voxels[j].color; + }*/ + + if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height) { + int index = (screenPos.y+y)*params.m_width+screenPos.x+x; + if (rayCastData.d_depth_i[index] > idepth && atomicMin(&rayCastData.d_depth_i[index], idepth) != idepth) { + //rayCastData.d_depth[index] = idepth; + rayCastData.d_colors[index] = voxels[j].color; + } + } + } + if (pixsizeX == 0) break; + } +} + +extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams ¶ms) +{ + const dim3 clear_gridSize((params.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (params.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + + clearDepthKernel<<<clear_gridSize, clear_blockSize>>>(hashData, rayCastData, cameraData); + + const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; + const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1); + const dim3 blockSize(threadsPerBlock, 1); + + if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) + nickRenderKernel << <gridSize, blockSize >> >(hashData, rayCastData, cameraData, params); + } + + cudaSafeCall( cudaGetLastError() ); + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif +} + + +///////////////////////////////////////////////////////////////////////// +// ray interval splatting +///////////////////////////////////////////////////////////////////////// + +__global__ void resetRayIntervalSplatKernel(RayCastData data) +{ + uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X; + data.point_cloud_[idx] = make_float3(MINF); +} + +extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params) +{ + const dim3 gridSize(NUM_GROUPS_X, (params.m_maxNumVertices + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1); // ! todo check if need third dimension? 
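+	// (The kernel reconstructs a flat index as blockIdx.x + blockIdx.y * NUM_GROUPS_X;
+	// the second grid dimension exists only to stay within the per-dimension block limit.)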
+ const dim3 blockSize(1, 1, 1); + + resetRayIntervalSplatKernel<<<gridSize, blockSize>>>(data); + +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + +__global__ void rayIntervalSplatKernel(ftl::voxhash::HashData hashData, DepthCameraData depthCameraData, RayCastData rayCastData, DepthCameraData cameraData) +{ + uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X; + + const ftl::voxhash::HashEntry& entry = hashData.d_hashCompactified[idx]; + if (entry.ptr != ftl::voxhash::FREE_ENTRY) { + //if (!hashData.isSDFBlockInCameraFrustumApprox(entry.pos)) return; + const RayCastParams ¶ms = c_rayCastParams; + const float4x4& viewMatrix = params.m_viewMatrix; + + float3 worldCurrentVoxel = hashData.SDFBlockToWorld(entry.pos); + + float3 MINV = worldCurrentVoxel - c_hashParams.m_virtualVoxelSize / 2.0f; + + float3 maxv = MINV+SDF_BLOCK_SIZE*c_hashParams.m_virtualVoxelSize; + + float3 proj000 = cameraData.cameraToKinectProj(viewMatrix * make_float3(MINV.x, MINV.y, MINV.z)); + float3 proj100 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, MINV.y, MINV.z)); + float3 proj010 = cameraData.cameraToKinectProj(viewMatrix * make_float3(MINV.x, maxv.y, MINV.z)); + float3 proj001 = cameraData.cameraToKinectProj(viewMatrix * make_float3(MINV.x, MINV.y, maxv.z)); + float3 proj110 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, MINV.z)); + float3 proj011 = cameraData.cameraToKinectProj(viewMatrix * make_float3(MINV.x, maxv.y, maxv.z)); + float3 proj101 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, MINV.y, maxv.z)); + float3 proj111 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, maxv.z)); + + // Tree Reduction Min + float3 min00 = fminf(proj000, proj100); + float3 min01 = fminf(proj010, proj001); + float3 min10 = fminf(proj110, proj011); + float3 min11 = fminf(proj101, proj111); + + float3 min0 = fminf(min00, min01); + float3 min1 = fminf(min10, min11); + + float3 minFinal = fminf(min0, min1); + + // Tree Reduction Max + float3 max00 = fmaxf(proj000, proj100); + float3 max01 = fmaxf(proj010, proj001); + float3 max10 = fmaxf(proj110, proj011); + float3 max11 = fmaxf(proj101, proj111); + + float3 max0 = fmaxf(max00, max01); + float3 max1 = fmaxf(max10, max11); + + float3 maxFinal = fmaxf(max0, max1); + + float depth = maxFinal.z; + if(params.m_splatMinimum == 1) { + depth = minFinal.z; + } + float depthWorld = cameraData.kinectProjToCameraZ(depth); + + //uint addr = idx*4; + //rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld); + //rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld); + //rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld); + //rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld); + + // Note (Nick) : Changed to create point cloud instead of vertex. 
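+		// One representative float3 per occupied block; the commented-out
+		// variants below emitted 4 or 6 full splat-quad vertices per block instead.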
+ uint addr = idx; + rayCastData.point_cloud_[addr] = make_float3(maxFinal.x, maxFinal.y, depth); + //printf("Ray: %f\n", depth); + + /*uint addr = idx*6; + rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld); + rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld); + rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld); + rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, minFinal.y, depth, depthWorld); + rayCastData.d_vertexBuffer[addr+4] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld); + rayCastData.d_vertexBuffer[addr+5] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld);*/ + } +} + +extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData, const RayCastData &rayCastData, const RayCastParams &rayCastParams) +{ + //printf("Ray casting...\n"); + const dim3 gridSize(NUM_GROUPS_X, (rayCastParams.m_numOccupiedSDFBlocks + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1); + const dim3 blockSize(1, 1, 1); + + rayIntervalSplatKernel<<<gridSize, blockSize>>>(hashData, cameraData, rayCastData, cameraData); + +#ifdef _DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} diff --git a/applications/reconstruct/src/registration.cpp b/applications/reconstruct/src/registration.cpp index 7156e03bf82a9937c25314e53af453551b8a45ea..5bc5b6e75395ca41e5a8fff5e6f8fbb8281730aa 100644 --- a/applications/reconstruct/src/registration.cpp +++ b/applications/reconstruct/src/registration.cpp @@ -101,7 +101,7 @@ PointCloud<PointXYZ>::Ptr cornersToPointCloud(const vector<cv::Point2f> &corners for (int i = 0; i < corners_len; i++) { double x = corners[i].x; double y = corners[i].y; - double d = disp.at<float>((int) y, (int) x) * 1000.0f; // todo: better estimation + double d = disp.at<float>((int) y, (int) x); // * 1000.0f; // todo: better estimation //cv::Vec4d homg_pt = Q_ * cv::Vec4d(x, y, d, 1.0); //cv::Vec3d p = cv::Vec3d(homg_pt.val) / homg_pt[3]; diff --git a/applications/reconstruct/src/scene_rep_hash_sdf.cu b/applications/reconstruct/src/scene_rep_hash_sdf.cu new file mode 100644 index 0000000000000000000000000000000000000000..e3daa1ced6bef2db16eb4efa3516e90714486cb4 --- /dev/null +++ b/applications/reconstruct/src/scene_rep_hash_sdf.cu @@ -0,0 +1,755 @@ +// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDASceneRepHashSDF.cu + +//#include <cutil_inline.h> +//#include <cutil_math.h> +#include <vector_types.h> +#include <cuda_runtime.h> + +#include <ftl/cuda_matrix_util.hpp> + +#include <ftl/voxel_hash.hpp> +#include <ftl/depth_camera.hpp> +#include <ftl/ray_cast_params.hpp> + +#define T_PER_BLOCK 8 + +using ftl::voxhash::HashData; +using ftl::voxhash::HashParams; +using ftl::voxhash::Voxel; +using ftl::voxhash::HashEntry; +using ftl::voxhash::FREE_ENTRY; + +// TODO (Nick) Use ftl::cuda::Texture (texture objects) +//texture<float, cudaTextureType2D, cudaReadModeElementType> depthTextureRef; +//texture<float4, cudaTextureType2D, cudaReadModeElementType> colorTextureRef; + +__device__ __constant__ HashParams c_hashParams; +__device__ __constant__ RayCastParams c_rayCastParams; +__device__ __constant__ DepthCameraParams c_depthCameraParams; + +extern "C" void updateConstantHashParams(const HashParams& params) { + + size_t size; + cudaSafeCall(cudaGetSymbolSize(&size, c_hashParams)); + cudaSafeCall(cudaMemcpyToSymbol(c_hashParams, ¶ms, size, 0, 
cudaMemcpyHostToDevice)); + +#ifdef DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + } + + +extern "C" void updateConstantRayCastParams(const RayCastParams& params) { + //printf("Update ray cast params\n"); + size_t size; + cudaSafeCall(cudaGetSymbolSize(&size, c_rayCastParams)); + cudaSafeCall(cudaMemcpyToSymbol(c_rayCastParams, ¶ms, size, 0, cudaMemcpyHostToDevice)); + +#ifdef DEBUG + cutilSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + +} + +extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params) { + //printf("Update depth camera params\n"); + size_t size; + cudaSafeCall(cudaGetSymbolSize(&size, c_depthCameraParams)); + cudaSafeCall(cudaMemcpyToSymbol(c_depthCameraParams, ¶ms, size, 0, cudaMemcpyHostToDevice)); + +#ifdef DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + +} + +extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData) +{ + /*cudaSafeCall(cudaBindTextureToArray(depthTextureRef, depthCameraData.d_depthArray, depthCameraData.h_depthChannelDesc)); + cudaSafeCall(cudaBindTextureToArray(colorTextureRef, depthCameraData.d_colorArray, depthCameraData.h_colorChannelDesc)); + depthTextureRef.filterMode = cudaFilterModePoint; + colorTextureRef.filterMode = cudaFilterModePoint;*/ +} + +__global__ void resetHeapKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + + if (idx == 0) { + hashData.d_heapCounter[0] = hashParams.m_numSDFBlocks - 1; //points to the last element of the array + } + + if (idx < hashParams.m_numSDFBlocks) { + + hashData.d_heap[idx] = hashParams.m_numSDFBlocks - idx - 1; + uint blockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; + uint base_idx = idx * blockSize; + for (uint i = 0; i < blockSize; i++) { + hashData.deleteVoxel(base_idx+i); + } + } +} + +__global__ void resetHashKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx < hashParams.m_hashNumBuckets * HASH_BUCKET_SIZE) { + hashData.deleteHashEntry(hashData.d_hash[idx]); + hashData.deleteHashEntry(hashData.d_hashCompactified[idx]); + } +} + + +__global__ void resetHashBucketMutexKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx < hashParams.m_hashNumBuckets) { + hashData.d_hashBucketMutex[idx] = FREE_ENTRY; + } +} + +extern "C" void resetCUDA(HashData& hashData, const HashParams& hashParams) +{ + { + //resetting the heap and SDF blocks + const dim3 gridSize((hashParams.m_numSDFBlocks + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + resetHeapKernel<<<gridSize, blockSize>>>(hashData); + + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif + + } + + { + //resetting the hash + const dim3 gridSize((HASH_BUCKET_SIZE * hashParams.m_hashNumBuckets + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + resetHashKernel<<<gridSize, blockSize>>>(hashData); + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif + } + + { + //resetting the mutex + const dim3 gridSize((hashParams.m_hashNumBuckets + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); 
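+		// Same ceil-division launch pattern as the heap and hash resets above:
+		// enough (T_PER_BLOCK*T_PER_BLOCK)-thread blocks for one thread per bucket mutex.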
+ const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + resetHashBucketMutexKernel<<<gridSize, blockSize>>>(hashData); + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif + } + + +} + +extern "C" void resetHashBucketMutexCUDA(HashData& hashData, const HashParams& hashParams) +{ + const dim3 gridSize((hashParams.m_hashNumBuckets + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + resetHashBucketMutexKernel<<<gridSize, blockSize>>>(hashData); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + + +__device__ +unsigned int linearizeChunkPos(const int3& chunkPos) +{ + int3 p = chunkPos-c_hashParams.m_streamingMinGridPos; + return p.z * c_hashParams.m_streamingGridDimensions.x * c_hashParams.m_streamingGridDimensions.y + + p.y * c_hashParams.m_streamingGridDimensions.x + + p.x; +} + +__device__ +int3 worldToChunks(const float3& posWorld) +{ + float3 p; + p.x = posWorld.x/c_hashParams.m_streamingVoxelExtents.x; + p.y = posWorld.y/c_hashParams.m_streamingVoxelExtents.y; + p.z = posWorld.z/c_hashParams.m_streamingVoxelExtents.z; + + float3 s; + s.x = (float)sign(p.x); + s.y = (float)sign(p.y); + s.z = (float)sign(p.z); + + return make_int3(p+s*0.5f); +} + +__device__ +bool isSDFBlockStreamedOut(const int3& sdfBlock, const HashData& hashData, const unsigned int* d_bitMask) //TODO MATTHIAS (-> move to HashData) +{ + float3 posWorld = hashData.virtualVoxelPosToWorld(hashData.SDFBlockToVirtualVoxelPos(sdfBlock)); // sdfBlock is assigned to chunk by the bottom right sample pos + + uint index = linearizeChunkPos(worldToChunks(posWorld)); + uint nBitsInT = 32; + return ((d_bitMask[index/nBitsInT] & (0x1 << (index%nBitsInT))) != 0x0); +} + +// Note: bitMask used for Streaming out code... could be set to nullptr if not streaming out +// Note: Allocations might need to be around fat rays since multiple voxels could correspond +// to same depth map pixel at larger distances. +__global__ void allocKernel(HashData hashData, DepthCameraData cameraData, const unsigned int* d_bitMask) +{ + const HashParams& hashParams = c_hashParams; + const DepthCameraParams& cameraParams = c_depthCameraParams; + + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < cameraParams.m_imageWidth && y < cameraParams.m_imageHeight) + { + float d = tex2D<float>(cameraData.depth_obj_, x, y); + //if (d == MINF || d < cameraParams.m_sensorDepthWorldMin || d > cameraParams.m_sensorDepthWorldMax) return; + if (d == MINF || d == 0.0f) return; + + if (d >= hashParams.m_maxIntegrationDistance) return; + + // TODO (Nick) Use covariance to include a frustrum of influence + float t = hashData.getTruncation(d); + float minDepth = min(hashParams.m_maxIntegrationDistance, d-t); + float maxDepth = min(hashParams.m_maxIntegrationDistance, d+t); + if (minDepth >= maxDepth) return; + + // Convert ray from image coords to world + // Does kinectDepthToSkeleton convert pixel values to coordinates using + // camera intrinsics? Same as what reprojectTo3D does in OpenCV? + float3 rayMin = cameraData.kinectDepthToSkeleton(x, y, minDepth); + // Is the rigid transform then the estimated camera pose? 
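+		// (m_rigidTransform appears to be the current camera-to-world pose estimate:
+		// both ray endpoints are mapped into world space here before traversing
+		// the SDF block grid.)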
+ rayMin = hashParams.m_rigidTransform * rayMin; + //printf("Ray min: %f,%f,%f\n", rayMin.x, rayMin.y, rayMin.z); + float3 rayMax = cameraData.kinectDepthToSkeleton(x, y, maxDepth); + rayMax = hashParams.m_rigidTransform * rayMax; + + float3 rayDir = normalize(rayMax - rayMin); + + // Only ray cast from min possible depth to max depth + int3 idCurrentVoxel = hashData.worldToSDFBlock(rayMin); + int3 idEnd = hashData.worldToSDFBlock(rayMax); + + float3 step = make_float3(sign(rayDir)); + float3 boundaryPos = hashData.SDFBlockToWorld(idCurrentVoxel+make_int3(clamp(step, 0.0, 1.0f)))-0.5f*hashParams.m_virtualVoxelSize; + float3 tMax = (boundaryPos-rayMin)/rayDir; + float3 tDelta = (step*SDF_BLOCK_SIZE*hashParams.m_virtualVoxelSize)/rayDir; + int3 idBound = make_int3(make_float3(idEnd)+step); + + //#pragma unroll + //for(int c = 0; c < 3; c++) { + // if (rayDir[c] == 0.0f) { tMax[c] = PINF; tDelta[c] = PINF; } + // if (boundaryPos[c] - rayMin[c] == 0.0f) { tMax[c] = PINF; tDelta[c] = PINF; } + //} + if (rayDir.x == 0.0f) { tMax.x = PINF; tDelta.x = PINF; } + if (boundaryPos.x - rayMin.x == 0.0f) { tMax.x = PINF; tDelta.x = PINF; } + + if (rayDir.y == 0.0f) { tMax.y = PINF; tDelta.y = PINF; } + if (boundaryPos.y - rayMin.y == 0.0f) { tMax.y = PINF; tDelta.y = PINF; } + + if (rayDir.z == 0.0f) { tMax.z = PINF; tDelta.z = PINF; } + if (boundaryPos.z - rayMin.z == 0.0f) { tMax.z = PINF; tDelta.z = PINF; } + + + unsigned int iter = 0; // iter < g_MaxLoopIterCount + unsigned int g_MaxLoopIterCount = 1024; +#pragma unroll 1 + while(iter < g_MaxLoopIterCount) { + + //check if it's in the frustum and not checked out + if (hashData.isSDFBlockInCameraFrustumApprox(idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { + hashData.allocBlock(idCurrentVoxel, cameraParams.flags & 0xFF); + //printf("Allocate block: %d\n",idCurrentVoxel.x); + } + + // Traverse voxel grid + if(tMax.x < tMax.y && tMax.x < tMax.z) { + idCurrentVoxel.x += step.x; + if(idCurrentVoxel.x == idBound.x) return; + tMax.x += tDelta.x; + } + else if(tMax.z < tMax.y) { + idCurrentVoxel.z += step.z; + if(idCurrentVoxel.z == idBound.z) return; + tMax.z += tDelta.z; + } + else { + idCurrentVoxel.y += step.y; + if(idCurrentVoxel.y == idBound.y) return; + tMax.y += tDelta.y; + } + + iter++; + } + } +} + +extern "C" void allocCUDA(HashData& hashData, const HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask) +{ + + //printf("Allocating: %d\n",depthCameraParams.m_imageWidth); + + const dim3 gridSize((depthCameraParams.m_imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthCameraParams.m_imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + allocKernel<<<gridSize, blockSize>>>(hashData, depthCameraData, d_bitMask); + + //cudaSafeCall(cudaDeviceSynchronize()); + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); + #endif +} + + + +__global__ void fillDecisionArrayKernel(HashData hashData, DepthCameraData depthCameraData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + + if (idx < hashParams.m_hashNumBuckets * HASH_BUCKET_SIZE) { + hashData.d_hashDecision[idx] = 0; + if (hashData.d_hash[idx].ptr != FREE_ENTRY) { + if (hashData.isSDFBlockInCameraFrustumApprox(hashData.d_hash[idx].pos)) { + hashData.d_hashDecision[idx] = 1; //yes + } + } + } +} + +extern "C" void 
fillDecisionArrayCUDA(HashData& hashData, const HashParams& hashParams, const DepthCameraData& depthCameraData) +{ + const dim3 gridSize((HASH_BUCKET_SIZE * hashParams.m_hashNumBuckets + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + fillDecisionArrayKernel<<<gridSize, blockSize>>>(hashData, depthCameraData); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + +} + +__global__ void compactifyHashKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx < hashParams.m_hashNumBuckets * HASH_BUCKET_SIZE) { + if (hashData.d_hashDecision[idx] == 1) { + hashData.d_hashCompactified[hashData.d_hashDecisionPrefix[idx]-1] = hashData.d_hash[idx]; + } + } +} + +extern "C" void compactifyHashCUDA(HashData& hashData, const HashParams& hashParams) +{ + const dim3 gridSize((HASH_BUCKET_SIZE * hashParams.m_hashNumBuckets + (T_PER_BLOCK*T_PER_BLOCK) - 1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); + + compactifyHashKernel<<<gridSize, blockSize>>>(hashData); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} + + +#define COMPACTIFY_HASH_THREADS_PER_BLOCK 256 +//#define COMPACTIFY_HASH_SIMPLE +__global__ void compactifyHashAllInOneKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; +#ifdef COMPACTIFY_HASH_SIMPLE + if (idx < hashParams.m_hashNumBuckets * HASH_BUCKET_SIZE) { + if (hashData.d_hash[idx].ptr != FREE_ENTRY) { + if (hashData.isSDFBlockInCameraFrustumApprox(hashData.d_hash[idx].pos)) + { + int addr = atomicAdd(hashData.d_hashCompactifiedCounter, 1); + hashData.d_hashCompactified[addr] = hashData.d_hash[idx]; + } + } + } +#else + __shared__ int localCounter; + if (threadIdx.x == 0) localCounter = 0; + __syncthreads(); + + int addrLocal = -1; + if (idx < hashParams.m_hashNumBuckets * HASH_BUCKET_SIZE) { + if (hashData.d_hash[idx].ptr != FREE_ENTRY) { + if (hashData.isSDFBlockInCameraFrustumApprox(hashData.d_hash[idx].pos)) + { + addrLocal = atomicAdd(&localCounter, 1); + } + } + } + + __syncthreads(); + + __shared__ int addrGlobal; + if (threadIdx.x == 0 && localCounter > 0) { + addrGlobal = atomicAdd(hashData.d_hashCompactifiedCounter, localCounter); + } + __syncthreads(); + + if (addrLocal != -1) { + const unsigned int addr = addrGlobal + addrLocal; + hashData.d_hashCompactified[addr] = hashData.d_hash[idx]; + } +#endif +} + +extern "C" unsigned int compactifyHashAllInOneCUDA(HashData& hashData, const HashParams& hashParams) +{ + const unsigned int threadsPerBlock = COMPACTIFY_HASH_THREADS_PER_BLOCK; + const dim3 gridSize((HASH_BUCKET_SIZE * hashParams.m_hashNumBuckets + threadsPerBlock - 1) / threadsPerBlock, 1); + const dim3 blockSize(threadsPerBlock, 1); + + cudaSafeCall(cudaMemset(hashData.d_hashCompactifiedCounter, 0, sizeof(int))); + compactifyHashAllInOneKernel << <gridSize, blockSize >> >(hashData); + unsigned int res = 0; + cudaSafeCall(cudaMemcpy(&res, hashData.d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif + return res; +} + +__device__ float4 make_float4(uchar4 c) { + return make_float4(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z), 
static_cast<float>(c.w)); +} + +inline __device__ uchar4 bilinearFilterColor(const float2& screenPos, cudaTextureObject_t colorTextureRef) { + const DepthCameraParams& cameraParams = c_depthCameraParams; + const int imageWidth = cameraParams.m_imageWidth; + const int imageHeight = cameraParams.m_imageHeight; + const int2 p00 = make_int2(screenPos.x+0.5f, screenPos.y+0.5f); + const int2 dir = sign(make_float2(screenPos.x - p00.x, screenPos.y - p00.y)); + + const int2 p01 = p00 + make_int2(0.0f, dir.y); + const int2 p10 = p00 + make_int2(dir.x, 0.0f); + const int2 p11 = p00 + make_int2(dir.x, dir.y); + + const float alpha = (screenPos.x - p00.x)*dir.x; + const float beta = (screenPos.y - p00.y)*dir.y; + + float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f; + if(p00.x >= 0 && p00.x < imageWidth && p00.y >= 0 && p00.y < imageHeight) { uchar4 v00 = tex2D<uchar4>(colorTextureRef, p00.x, p00.y); if(v00.x != 0) { s0 += (1.0f-alpha)*make_float4(v00); w0 += (1.0f-alpha); } } + if(p10.x >= 0 && p10.x < imageWidth && p10.y >= 0 && p10.y < imageHeight) { uchar4 v10 = tex2D<uchar4>(colorTextureRef, p10.x, p10.y); if(v10.x != 0) { s0 += alpha *make_float4(v10); w0 += alpha ; } } + + float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f; + if(p01.x >= 0 && p01.x < imageWidth && p01.y >= 0 && p01.y < imageHeight) { uchar4 v01 = tex2D<uchar4>(colorTextureRef, p01.x, p01.y); if(v01.x != 0) { s1 += (1.0f-alpha)*make_float4(v01); w1 += (1.0f-alpha);} } + if(p11.x >= 0 && p11.x < imageWidth && p11.y >= 0 && p11.y < imageHeight) { uchar4 v11 = tex2D<uchar4>(colorTextureRef, p11.x, p11.y); if(v11.x != 0) { s1 += alpha *make_float4(v11); w1 += alpha ;} } + + const float4 p0 = s0/w0; + const float4 p1 = s1/w1; + + float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f; + if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); } + if(w1 > 0.0f) { ss += beta *p1; ww += beta ; } + + if(ww > 0.0f) { + ss /= ww; + return make_uchar4(ss.x,ss.y,ss.z,ss.w); + } else return make_uchar4(0, 0, 0, 0); +} + +__device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { + float x = c1.x-c2.x; + float y = c1.y-c2.y; + float z = c1.z-c2.z; + return x*x + y*y + z*z; +} + +__global__ void integrateDepthMapKernel(HashData hashData, DepthCameraData cameraData, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { + const HashParams& hashParams = c_hashParams; + const DepthCameraParams& cameraParams = c_depthCameraParams; + + //TODO check if we should load this in shared memory + HashEntry& entry = hashData.d_hashCompactified[blockIdx.x]; + //if (entry.ptr == FREE_ENTRY) { + // printf("invliad integrate"); + // return; //should never happen since we did the compactification before + //} + + int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(entry.pos); + + uint i = threadIdx.x; //inside of an SDF block + int3 pi = pi_base + make_int3(hashData.delinearizeVoxelIndex(i)); + float3 pf = hashData.virtualVoxelPosToWorld(pi); + + pf = hashParams.m_rigidTransformInverse * pf; + uint2 screenPos = make_uint2(cameraData.cameraToKinectScreenInt(pf)); + + // For this voxel in hash, get its screen position and check it is on screen + if (screenPos.x < cameraParams.m_imageWidth && screenPos.y < cameraParams.m_imageHeight) { //on screen + + //float depth = g_InputDepth[screenPos]; + float depth = tex2D<float>(depthT, screenPos.x, screenPos.y); + //if (depth > 20.0f) return; + + uchar4 color = make_uchar4(0, 0, 0, 0); + //if (cameraData.d_colorData) { + color = tex2D<uchar4>(colourT, screenPos.x, 
screenPos.y);
+		//color = bilinearFilterColor(cameraData.cameraToKinectScreenFloat(pf));
+		//}
+
+		//printf("screen pos %d\n", color.x);
+		//return;
+
+		// Depth is within accepted max distance from camera
+		if (depth > 0 && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check)
+			float depthZeroOne = cameraData.cameraToKinectProjZ(depth);
+
+			// Calculate SDF of this voxel wrt the depth map value
+			float sdf = depth - pf.z;
+			float truncation = hashData.getTruncation(depth);
+
+			// Is this voxel close enough to cam for depth map value
+			// CHECK Nick: If it is too close then there is a free space violation, so remove?
+			// This isn't enough if the disparity has occlusions that don't cause violations
+			// Could RGB changes also cause removals if depth can't be confirmed?
+			if (sdf > truncation) {
+				uint idx = entry.ptr + i;
+				hashData.d_SDFBlocks[idx].weight = 0;
+				//hashData.d_SDFBlocks[idx].sdf = PINF;
+				hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,0);
+			}
+			else if (sdf > -truncation) // && depthZeroOne >= 0.0f && depthZeroOne <= 1.0f) //check if in truncation range should already be made in depth map computation
+			{
+				if (sdf >= 0.0f) {
+					sdf = fminf(truncation, sdf);
+				} else {
+					sdf = fmaxf(-truncation, sdf);
+				}
+
+				//printf("SDF: %f\n", sdf);
+				//float weightUpdate = g_WeightSample;
+				//weightUpdate = (1-depthZeroOne)*5.0f + depthZeroOne*0.05f;
+				//weightUpdate *= g_WeightSample;
+				float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f);
+
+				Voxel curr;	//construct current voxel
+				curr.sdf = sdf;
+				curr.weight = weightUpdate;
+				curr.color = make_uchar3(color.x, color.y, color.z);
+
+				uint idx = entry.ptr + i;
+
+				// Parenthesised: != binds tighter than &, so the unbracketed form
+				// compared (entry.flags != cameraParams.flags) & 0xFF instead.
+				if (entry.flags != (cameraParams.flags & 0xFF)) {
+					entry.flags = cameraParams.flags & 0xFF;
+					hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,0);
+				}
+
+				Voxel newVoxel;
+				//if (color.x == MINF) hashData.combineVoxelDepthOnly(hashData.d_SDFBlocks[idx], curr, newVoxel);
+				//else hashData.combineVoxel(hashData.d_SDFBlocks[idx], curr, newVoxel);
+				hashData.combineVoxel(hashData.d_SDFBlocks[idx], curr, newVoxel);
+
+				hashData.d_SDFBlocks[idx] = newVoxel;
+
+				//Voxel prev = getVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx);
+				//Voxel newVoxel = combineVoxel(curr, prev);
+				//setVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx, newVoxel);
+			}
+		} else {
+			uint idx = entry.ptr + i;
+			float coldist = colourDistance(color, hashData.d_SDFBlocks[idx].color);
+			if (depth > 40.0f && coldist > 100.0f) {
+				//hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,(uchar)(coldist));
+				hashData.d_SDFBlocks[idx].weight = hashData.d_SDFBlocks[idx].weight >> 1;
+			}
+		}
+	}
+}
+
+
+extern "C" void integrateDepthMapCUDA(HashData& hashData, const HashParams& hashParams,
+	const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams)
+{
+	const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
+	const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1);
+	const dim3 blockSize(threadsPerBlock, 1);
+
+	if (hashParams.m_numOccupiedBlocks > 0) {	//this guard is important if there is no depth in the current frame (i.e., no blocks were allocated)
+		integrateDepthMapKernel << <gridSize, blockSize >> >(hashData, depthCameraData, depthCameraData.depth_obj_, depthCameraData.colour_obj_);
+	}
+
+	cudaSafeCall( cudaGetLastError() );
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+
+
+__global__ void starveVoxelsKernel(HashData hashData) {
+
+	const uint idx = blockIdx.x;
+	const HashEntry& entry = hashData.d_hashCompactified[idx];
+
+	// typically executed only every n-th frame
+	int weight = hashData.d_SDFBlocks[entry.ptr + threadIdx.x].weight;
+	weight = max(0, weight-1);
+	hashData.d_SDFBlocks[entry.ptr + threadIdx.x].weight = weight;	//CHECK Remove to totally clear previous frame (Nick)
+}
+
+extern "C" void starveVoxelsKernelCUDA(HashData& hashData, const HashParams& hashParams)
+{
+	const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
+	const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1);
+	const dim3 blockSize(threadsPerBlock, 1);
+
+	if (hashParams.m_numOccupiedBlocks > 0) {
+		starveVoxelsKernel << <gridSize, blockSize >> >(hashData);
+	}
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+
+__shared__ float	shared_MinSDF[SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE / 2];
+__shared__ uint		shared_MaxWeight[SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE / 2];
+
+
+__global__ void garbageCollectIdentifyKernel(HashData hashData) {
+
+	const DepthCameraParams& cameraParams = c_depthCameraParams;
+	const unsigned int hashIdx = blockIdx.x;
+	const HashEntry& entry = hashData.d_hashCompactified[hashIdx];
+
+	// Entire block was not touched in this frame, so remove (Nick)
+	/*if (entry.flags != (cameraParams.flags & 0xFF)) {
+		hashData.d_hashDecision[hashIdx] = 1;
+		return;
+	}*/
+
+	//uint h = hashData.computeHashPos(entry.pos);
+	//hashData.d_hashDecision[hashIdx] = 1;
+	//if (hashData.d_hashBucketMutex[h] == LOCK_ENTRY) return;
+
+	//if (entry.ptr == FREE_ENTRY) return; //should never happen since we did compactify before
+	//const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
+
+	const unsigned int idx0 = entry.ptr + 2*threadIdx.x+0;
+	const unsigned int idx1 = entry.ptr + 2*threadIdx.x+1;
+
+	Voxel v0 = hashData.d_SDFBlocks[idx0];
+	Voxel v1 = hashData.d_SDFBlocks[idx1];
+
+	if (v0.weight == 0)	v0.sdf = PINF;
+	if (v1.weight == 0)	v1.sdf = PINF;
+
+	shared_MinSDF[threadIdx.x] = min(fabsf(v0.sdf), fabsf(v1.sdf));	//init shared memory
+	shared_MaxWeight[threadIdx.x] = max(v0.weight, v1.weight);
+
+#pragma unroll 1
+	for (uint stride = 2; stride <= blockDim.x; stride <<= 1) {
+		__syncthreads();
+		if ((threadIdx.x & (stride-1)) == (stride-1)) {
+			shared_MinSDF[threadIdx.x] = min(shared_MinSDF[threadIdx.x-stride/2], shared_MinSDF[threadIdx.x]);
+			shared_MaxWeight[threadIdx.x] = max(shared_MaxWeight[threadIdx.x-stride/2], shared_MaxWeight[threadIdx.x]);
+		}
+	}
+
+	__syncthreads();
+
+	if (threadIdx.x == blockDim.x - 1) {
+		float minSDF = shared_MinSDF[threadIdx.x];
+		uint maxWeight = shared_MaxWeight[threadIdx.x];
+
+		float t = hashData.getTruncation(c_depthCameraParams.m_sensorDepthWorldMax);	//MATTHIAS TODO check whether this is a reasonable metric
+
+		if (minSDF >= t || maxWeight == 0) {
+			hashData.d_hashDecision[hashIdx] = 1;
+		} else {
+			hashData.d_hashDecision[hashIdx] = 0;
+		}
+	}
+}
+
+extern "C" void garbageCollectIdentifyCUDA(HashData& hashData, const HashParams& hashParams) {
+
+	const unsigned int threadsPerBlock = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE / 2;
+	const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1);
+	const dim3 blockSize(threadsPerBlock, 1);
+
+	if (hashParams.m_numOccupiedBlocks > 0) {
+		garbageCollectIdentifyKernel << <gridSize, blockSize >> >(hashData);
+	}
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	cutilCheckMsg(__FUNCTION__);
+#endif
+}
+
+
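+// How these passes are intended to be chained each frame (a sketch only; the
+// host-side caller is not part of this file):
+//
+//   starveVoxelsKernelCUDA(hashData, hashParams);      // decay voxel weights
+//   garbageCollectIdentifyCUDA(hashData, hashParams);  // flag empty/far blocks
+//   resetHashBucketMutexCUDA(hashData, hashParams);    // release bucket locks
+//   garbageCollectFreeCUDA(hashData, hashParams);      // return blocks to the heap
+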
+__global__ void garbageCollectFreeKernel(HashData hashData) { + + //const uint hashIdx = blockIdx.x; + const uint hashIdx = blockIdx.x*blockDim.x + threadIdx.x; + + + if (hashIdx < c_hashParams.m_numOccupiedBlocks && hashData.d_hashDecision[hashIdx] != 0) { //decision to delete the hash entry + + const HashEntry& entry = hashData.d_hashCompactified[hashIdx]; + //if (entry.ptr == FREE_ENTRY) return; //should never happen since we did compactify before + + if (hashData.deleteHashEntryElement(entry.pos)) { //delete hash entry from hash (and performs heap append) + const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; + + #pragma unroll 1 + for (uint i = 0; i < linBlockSize; i++) { //clear sdf block: CHECK TODO another kernel? + hashData.deleteVoxel(entry.ptr + i); + } + } + } +} + + +extern "C" void garbageCollectFreeCUDA(HashData& hashData, const HashParams& hashParams) { + + const unsigned int threadsPerBlock = T_PER_BLOCK*T_PER_BLOCK; + const dim3 gridSize((hashParams.m_numOccupiedBlocks + threadsPerBlock - 1) / threadsPerBlock, 1); + const dim3 blockSize(threadsPerBlock, 1); + + if (hashParams.m_numOccupiedBlocks > 0) { + garbageCollectFreeKernel << <gridSize, blockSize >> >(hashData); + } +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + cutilCheckMsg(__FUNCTION__); +#endif +} diff --git a/applications/vision/src/main.cpp b/applications/vision/src/main.cpp index e6d14d05707067233c2e98a93aa6c506c9e959e2..d3a193ff12302738ca6c1094fcb541b44d71fa3f 100644 --- a/applications/vision/src/main.cpp +++ b/applications/vision/src/main.cpp @@ -52,6 +52,31 @@ using cv::Mat; using json = nlohmann::json; using ftl::config; +namespace ftl { +void disparityToDepth(const cv::Mat &disparity, cv::Mat &depth, const cv::Mat &q) { + cv::Matx44d _Q; + q.convertTo(_Q, CV_64F); + + if (depth.empty()) depth = cv::Mat(disparity.size(), CV_32F); + + for( int y = 0; y < disparity.rows; y++ ) { + const float *sptr = disparity.ptr<float>(y); + float *dptr = depth.ptr<float>(y); + + for( int x = 0; x < disparity.cols; x++ ) { + double d = sptr[x]; + cv::Vec4d homg_pt = _Q*cv::Vec4d(x, y, d, 1.0); + //dptr[x] = Vec3d(homg_pt.val); + //dptr[x] /= homg_pt[3]; + dptr[x] = (homg_pt[2] / homg_pt[3]) / 1000.0f; // Depth in meters + + if( fabs(d) <= FLT_EPSILON ) + dptr[x] = 1000.0f; + } + } +} +}; + static void run(const string &file) { ctpl::thread_pool pool(2); diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp index 741f857cc6e5536fb95bd815b0fc6fda2737cb90..91d5a28c30e1fa911ff984e8a14dabf5acc0aaea 100644 --- a/components/common/cpp/include/ftl/cuda_common.hpp +++ b/components/common/cpp/include/ftl/cuda_common.hpp @@ -27,7 +27,7 @@ class HisteresisTexture { template <typename T> class TextureObject { public: - TextureObject() : texobj_(0), ptr_(nullptr) {}; + __host__ __device__ TextureObject() : texobj_(0), ptr_(nullptr) {}; TextureObject(const cv::cuda::PtrStepSz<T> &d); TextureObject(T *ptr, int pitch, int width, int height); TextureObject(size_t width, size_t height); @@ -39,10 +39,13 @@ class TextureObject { __host__ __device__ T *devicePtr(int v) { return &ptr_[v*pitch2_]; } __host__ __device__ int width() const { return width_; } __host__ __device__ int height() const { return height_; } - cudaTextureObject_t cudaTexture() const { return texobj_; } + __host__ __device__ cudaTextureObject_t cudaTexture() const { return texobj_; } + + #ifdef __CUDACC__ __device__ inline T tex2D(int u, int v) { return ::tex2D<T>(texobj_, u, v); 
}
 	__device__ inline T tex2D(float u, float v) { return ::tex2D<T>(texobj_, u, v); }
-	
+	#endif
+	
 	__host__ __device__ inline const T &operator()(int u, int v) const { return ptr_[u+v*pitch2_]; }
 	__host__ __device__ inline T &operator()(int u, int v) { return ptr_[u+v*pitch2_]; }
@@ -83,7 +86,7 @@ TextureObject<T>::TextureObject(const cv::cuda::PtrStepSz<T> &d) {
 	texDesc.readMode = cudaReadModeElementType;
 	cudaTextureObject_t tex = 0;
-	cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+	cudaSafeCall(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
 	texobj_ = tex;
 	pitch_ = d.step;
 	pitch2_ = pitch_ / sizeof(T);
diff --git a/components/common/cpp/src/configuration.cpp b/components/common/cpp/src/configuration.cpp
index 45b40da01e6d2fd6d720ca1f19fea6ceda6ac564..a302d606bc32d8d2cd1ee8cc18b66387beab54bf 100644
--- a/components/common/cpp/src/configuration.cpp
+++ b/components/common/cpp/src/configuration.cpp
@@ -111,9 +111,9 @@ optional<string> ftl::locateFile(const string &name) {
 /**
  * Combine one json config with another patch json config.
  */
-static bool mergeConfig(const string &path) {
-	ifstream i;
-	i.open(path);
+static bool mergeConfig(const string path) {
+	ifstream i(path.c_str());
+	//i.open(path);
 	if (i.is_open()) {
 		try {
 			json t;
@@ -123,6 +123,9 @@ static bool mergeConfig(const string &path) {
 		} catch (json::parse_error& e) {
 			LOG(ERROR) << "Parse error in loading config: " << e.what();
 			return false;
+		} catch (...) {
+			LOG(ERROR) << "Unknown error opening config file";
+			return false;
 		}
 	} else {
 		return false;
@@ -137,26 +140,8 @@ static bool findConfiguration(const string &file, const vector<string> &paths,
 	bool f = false;
 	bool found = false;
-	f = mergeConfig(FTL_GLOBAL_CONFIG_ROOT "/config.json");
-	found |= f;
-	if (f) LOG(INFO) << "Loaded config: " << FTL_GLOBAL_CONFIG_ROOT "/config.json";
-	f = mergeConfig(FTL_LOCAL_CONFIG_ROOT "/config.json");
-	found |= f;
-	if (f) LOG(INFO) << "Loaded config: " << FTL_LOCAL_CONFIG_ROOT "/config.json";
-	f = mergeConfig("./config.json");
-	found |= f;
-	if (f) LOG(INFO) << "Loaded config: " << "./config.json";
-
-	for (auto p : paths) {
-		if (is_directory(p)) {
-			f = mergeConfig(p+"/config.json");
-			found |= f;
-			if (f) LOG(INFO) << "Loaded config: " << p << "/config.json";
-		}
-	}
-
-	if (file != "") {
-		f = mergeConfig(file);
+	if (file.length() > 0) {
+		f = mergeConfig(file.substr(1,file.length()-2));
 		found |= f;
 		if (!f) {
@@ -164,6 +149,24 @@ static bool findConfiguration(const string &file, const vector<string> &paths,
 		} else {
 			LOG(INFO) << "Loaded config: " << file;
 		}
+	} else {
+		f = mergeConfig(FTL_GLOBAL_CONFIG_ROOT "/config.json");
+		found |= f;
+		if (f) LOG(INFO) << "Loaded config: " << FTL_GLOBAL_CONFIG_ROOT "/config.json";
+		f = mergeConfig(FTL_LOCAL_CONFIG_ROOT "/config.json");
+		found |= f;
+		if (f) LOG(INFO) << "Loaded config: " << FTL_LOCAL_CONFIG_ROOT "/config.json";
+		f = mergeConfig("./config.json");
+		found |= f;
+		if (f) LOG(INFO) << "Loaded config: " << "./config.json";
+
+		for (auto p : paths) {
+			if (is_directory(p)) {
+				f = mergeConfig(p+"/config.json");
+				found |= f;
+				if (f) LOG(INFO) << "Loaded config: " << p << "/config.json";
+			}
+		}
 	}
 
 	if (found) {
diff --git a/components/net/cpp/include/ftl/net/universe.hpp b/components/net/cpp/include/ftl/net/universe.hpp
index ccb6e12156fd02a7aece1ed4c84d44edf12f7719..fe376971c5c644093ea037132eef2e7c21cd6d9a 100644
--- a/components/net/cpp/include/ftl/net/universe.hpp
+++ b/components/net/cpp/include/ftl/net/universe.hpp
@@ -241,7 +241,9 @@ void Universe::publish(const 
std::string &res, ARGS... args) { template <typename... ARGS> void Universe::publish(const ftl::URI &res, ARGS... args) { + std::unique_lock<std::mutex> lk(net_mutex_); auto subs = subscribers_[res.getBaseURI()]; + lk.unlock(); for (auto p : subs) { auto peer = getPeer(p); if (peer) { diff --git a/components/net/cpp/src/universe.cpp b/components/net/cpp/src/universe.cpp index b78af39903a14d71a24cbc6aafc8820314efca43..f98d33b8eb86979b16f9a1ae0c256c626682e9bf 100644 --- a/components/net/cpp/src/universe.cpp +++ b/components/net/cpp/src/universe.cpp @@ -135,6 +135,7 @@ void Universe::_installBindings(Peer *p) { void Universe::_installBindings() { bind("__subscribe__", [this](const UUID &id, const string &uri) -> bool { LOG(INFO) << "Subscription to " << uri << " by " << id.to_string(); + unique_lock<mutex> lk(net_mutex_); subscribers_[ftl::URI(uri).to_string()].push_back(id); return true; }); diff --git a/components/renderers/cpp/include/ftl/display.hpp b/components/renderers/cpp/include/ftl/display.hpp index 75d0f785843392be10d3722b89728645dfd30d0a..7856c3ee924d009f547009dc48613afced465c2f 100644 --- a/components/renderers/cpp/include/ftl/display.hpp +++ b/components/renderers/cpp/include/ftl/display.hpp @@ -6,7 +6,7 @@ #define _FTL_DISPLAY_HPP_ #include <ftl/config.h> -#include "../../../rgbd-sources/include/ftl/rgbd_source.hpp" +#include "../../../rgbd-sources/include/ftl/camera_params.hpp" #include <nlohmann/json.hpp> #include <opencv2/opencv.hpp> @@ -45,6 +45,8 @@ class Display { void wait(int ms); + void onKey(std::function<void(int)> h) { key_handlers_.push_back(h); } + private: nlohmann::json config_; @@ -57,6 +59,7 @@ class Display { #endif // HAVE_PCL bool active_; + std::vector<std::function<void(int)>> key_handlers_; }; }; diff --git a/components/renderers/cpp/src/display.cpp b/components/renderers/cpp/src/display.cpp index 9760b4a7241da5e48924522058df855b04adff07..88ad2b18344c75d5609a1817bd1d7daf6d99befa 100644 --- a/components/renderers/cpp/src/display.cpp +++ b/components/renderers/cpp/src/display.cpp @@ -222,6 +222,7 @@ bool Display::render(pcl::PointCloud<pcl::PointXYZRGB>::ConstPtr pc) { #endif // HAVE_PCL bool Display::render(const cv::Mat &img, style_t s) { if (s == STYLE_NORMAL) { + cv::namedWindow("Image", cv::WINDOW_KEEPRATIO); cv::imshow("Image", img); } else if (s = STYLE_DISPARITY) { Mat idepth; @@ -251,10 +252,21 @@ void Display::wait(int ms) { } if (config_["depth"] || config_["left"] || config_["right"]) { - if(cv::waitKey(ms) == 27) { - // exit if ESC is pressed - active_ = false; - } + while (true) { + int key = cv::waitKey(ms); + + if(key == 27) { + // exit if ESC is pressed + active_ = false; + } else if (key == -1) { + return; + } else { + ms = 1; + for (auto &h : key_handlers_) { + h(key); + } + } + } } } diff --git a/components/rgbd-sources/include/ftl/camera_params.hpp b/components/rgbd-sources/include/ftl/camera_params.hpp new file mode 100644 index 0000000000000000000000000000000000000000..40f64a1ed35b87480f57f3f099a3fbf44c66491e --- /dev/null +++ b/components/rgbd-sources/include/ftl/camera_params.hpp @@ -0,0 +1,22 @@ +#pragma once +#ifndef _FTL_RGBD_CAMERA_PARAMS_HPP_ +#define _FTL_RGBD_CAMERA_PARAMS_HPP_ + +namespace ftl{ +namespace rgbd { + +struct CameraParameters { + double fx; + double fy; + double cx; + double cy; + unsigned int width; + unsigned int height; + double minDepth; + double maxDepth; +}; + +}; +}; + +#endif diff --git a/components/rgbd-sources/include/ftl/rgbd_source.hpp b/components/rgbd-sources/include/ftl/rgbd_source.hpp index 
a00a035490dc929ea3f3441ff82b988450abb5ea..5032aa3c59fb1a60e4bf31fa99b3671476ce6378 100644 --- a/components/rgbd-sources/include/ftl/rgbd_source.hpp +++ b/components/rgbd-sources/include/ftl/rgbd_source.hpp @@ -4,6 +4,7 @@ #include <ftl/config.h> #include <ftl/configurable.hpp> +#include <ftl/camera_params.hpp> #include <ftl/net/universe.hpp> #include <opencv2/opencv.hpp> #include <Eigen/Eigen> @@ -11,17 +12,6 @@ namespace ftl { namespace rgbd { -struct CameraParameters { - double fx; - double fy; - double cx; - double cy; - unsigned int width; - unsigned int height; - double minDepth; - double maxDepth; -}; - /** * Abstract class for any generic RGB-Depth data source. It can also store pose * information, although this must be provided by an external source. diff --git a/components/rgbd-sources/src/local.cpp b/components/rgbd-sources/src/local.cpp index 6faa4649bff2e49368ea508a77c2e212e49f866e..80190115fda19bc7ad749ccafa6dfb743af035e8 100644 --- a/components/rgbd-sources/src/local.cpp +++ b/components/rgbd-sources/src/local.cpp @@ -76,6 +76,8 @@ LocalSource::LocalSource(nlohmann::json &config) camera_a_->grab(); camera_a_->retrieve(frame); LOG(INFO) << "Video size : " << frame.cols << "x" << frame.rows; + width_ = frame.cols; + height_ = frame.rows; stereo_ = true; } @@ -125,9 +127,13 @@ LocalSource::LocalSource(const string &vid, nlohmann::json &config) if (frame.cols >= 2*frame.rows) { LOG(INFO) << "Video size : " << frame.cols/2 << "x" << frame.rows; + width_ = frame.cols / 2; + height_ = frame.rows; stereo_ = true; } else { LOG(INFO) << "Video size : " << frame.cols << "x" << frame.rows; + width_ = frame.cols; + height_ = frame.rows; stereo_ = false; } diff --git a/components/rgbd-sources/src/local.hpp b/components/rgbd-sources/src/local.hpp index ad79b9bf68dd4d5f5ae5589b2b9f6eed76b6a08a..d483ea185cf0a6db28d9da09fd55c2f1e17822c2 100644 --- a/components/rgbd-sources/src/local.hpp +++ b/components/rgbd-sources/src/local.hpp @@ -19,6 +19,9 @@ class LocalSource : public Configurable { bool left(cv::Mat &m); bool right(cv::Mat &m); bool get(cv::Mat &l, cv::Mat &r); + + unsigned int width() const { return width_; } + unsigned int height() const { return height_; } //void setFramerate(float fps); //float getFramerate() const; @@ -38,6 +41,8 @@ class LocalSource : public Configurable { float downsize_; cv::VideoCapture *camera_a_; cv::VideoCapture *camera_b_; + unsigned int width_; + unsigned int height_; }; }; diff --git a/components/rgbd-sources/src/stereovideo_source.cpp b/components/rgbd-sources/src/stereovideo_source.cpp index 55d4735ecde4cc18f115ca81a008d7cded9ad784..bf6d585be76975e5fa4a6feb8b195fb836bbed7f 100644 --- a/components/rgbd-sources/src/stereovideo_source.cpp +++ b/components/rgbd-sources/src/stereovideo_source.cpp @@ -53,8 +53,8 @@ StereoVideoSource::StereoVideoSource(nlohmann::json &config, const string &file) q.at<double>(1,1), // Fy -q.at<double>(0,2), // Cx -q.at<double>(1,2), // Cy - (unsigned int)left_.cols, // TODO (Nick) - (unsigned int)left_.rows, + (unsigned int)lsrc_->width(), // TODO (Nick) + (unsigned int)lsrc_->height(), 0.0f, // 0m min 15.0f // 15m max }; diff --git a/config/config_voxhash.json b/config/config_voxhash.json new file mode 100644 index 0000000000000000000000000000000000000000..d8b4f69954f4f00a6940a27fd0bb71938a518306 --- /dev/null +++ b/config/config_voxhash.json @@ -0,0 +1,117 @@ +{ + "vision": { + "type": "stereovideo", + "middlebury": { + "dataset": "", + "threshold": 2.0, + "scale": 0.5 + }, + "source": { + "flip": false, + "nostereo": false, 
+ "scale": 1.0, + "flip_vert": false, + "max_fps": 25, + "width": 640, + "height": 480, + "crosshair": false + }, + "calibrate": false, + "calibration": { + "board_size": [9,6], + "square_size": 50, + "frame_delay": 1.0, + "num_frames": 35, + "assume_zero_tangential_distortion": true, + "fix_aspect_ratio": true, + "fix_principal_point_at_center": true, + "use_fisheye_model": false, + "fix_k1": false, + "fix_k2": false, + "fix_k3": false, + "fix_k4": true, + "fix_k5": true, + "save": true, + "use_intrinsics": true, + "use_extrinsics": true, + "flip_vertical": false + }, + "camera": { + "name": "Panasonic Lumix DMC-FZ300", + "focal_length": 25, + "sensor_width": 6.17, + "base_line": 0.1 + }, + "disparity": { + "algorithm": "libsgm", + "use_cuda": true, + "minimum": 0, + "maximum": 256, + "tau": 0.0, + "gamma": 0.0, + "window_size": 5, + "sigma": 1.5, + "lambda": 8000.0, + "filter": true + }, + "display": { + "flip_vert": false, + "disparity": false, + "points": false, + "depth": false, + "left": false, + "right": false + }, + "net": { + "listen": "tcp://*:9001", + "peers": [] + }, + "stream": { + "name": "dummy" + } + }, + "reconstruction": { + "net": { + "peers": ["tcp://localhost:9001"] + }, + "sources": [{"type": "net", "uri": "ftl://utu.fi/dummy/rgb-d"}], + "display": { + "flip_vert": false, + "disparity": false, + "points": true, + "depth": false, + "left": false, + "right": false + }, + "voxelhash": { + "adapterWidth": 640, + "adapterHeight": 480, + "sensorDepthMax": 20.0, + "sensorDepthMin": 1.0, + "SDFRayIncrementFactor": 0.8, + "SDFRayThresSampleDistFactor": 50.5, + "SDFRayThresDistFactor": 50.0, + "SDFUseGradients": false, + "hashNumBuckets": 50000, + "hashMaxCollisionLinkedListSize": 7, + "hashNumSDFBlocks": 100000, + "SDFVoxelSize": 0.01, + "SDFMaxIntegrationDistance": 8.0, + "SDFTruncation": 0.1, + "SDFTruncationScale": 0.01, + "SDFIntegrationWeightSample": 10, + "SDFIntegrationWeightMax": 255 + }, + "registration": { + "reference-source" : "ftl://utu.fi/dummy/rgb-d", + "calibration" : { + "run": false, + "max_error": 25, + "iterations" : 10, + "delay" : 1000, + "patternsize" : [9, 6] + } + } + } +} + diff --git a/reconstruct/src/voxel_hash.cu b/reconstruct/src/voxel_hash.cu new file mode 100644 index 0000000000000000000000000000000000000000..6dfe838a71bb81e1407bc02ba48e06db0b39e70e --- /dev/null +++ b/reconstruct/src/voxel_hash.cu @@ -0,0 +1,127 @@ +#include <ftl/voxel_hash.hpp> + +using ftl::voxhash::VoxelHash; +using ftl::voxhash::hash; +using ftl::voxhash::HashBucket; +using ftl::voxhash::HashEntry; + +__device__ HashEntry *ftl::voxhash::cuda::lookupBlock(HashBucket *table, int3 vox) { + uint32_t hashcode = hash<HASH_SIZE>(vox.x / 8, vox.y / 8, vox.z / 8); + HashBucket *bucket = table[hashcode]; + + while (bucket) { + for (int i=0; i<ENTRIES_PER_BUCKET; i++) { + HashEntry *entry = &bucket->entries[i]; + if (entry->position[0] == vox.x && entry->position[1] == vox.y && entry->position[2] == vox.z) { + return entry; + } + } + if (bucket->entries[ENTRIES_PER_BUCKET-1].offset) { + bucket += bucket->entries[ENTRIES_PER_BUCKET-1].offset; + } else { + break; + } + } + + return nullptr; +} + +__device__ HashEntry *ftl::voxhash::cuda::insertBlock(HashBucket *table, int3 vox) { + uint32_t hashcode = hash<HASH_SIZE>(vox.x / 8, vox.y / 8, vox.z / 8); + HashBucket *bucket = table[hashcode]; + + while (bucket) { + for (int i=0; i<ENTRIES_PER_BUCKET; i++) { + HashEntry *entry = &bucket->entries[i]; + if (entry->position[0] == vox.x && entry->position[1] == vox.y && entry->position[2] == 
vox.z) { + return entry; + } else if (entry->pointer == 0) { + // Allocate block. + entry->position[0] = vox.x; + entry->position[1] = vox.y; + entry->position[2] = vox.z; + } + } + if (bucket->entries[ENTRIES_PER_BUCKET-1].offset) { + bucket += bucket->entries[ENTRIES_PER_BUCKET-1].offset; + } else { + // Find a new bucket to append to this list + } + } + + return nullptr; +} + +__device__ Voxel *ftl::voxhash::cuda::insert(HashBucket *table, VoxelBlocks *blocks, int3 vox) { + HashEntry *entry = insertBlock(table, vox); + VoxelBlock *block = blocks[entry->pointer]; + return &block->voxels[vox.x % 8][vox.y % 8][vox.z % 8]; +} + +__device__ Voxel *ftl::voxhash::cuda::lookup(HashBucket *table, int3 vox) { + HashEntry *entry = lookupBlock(table, vox); + if (entry == nullptr) return nullptr; + VoxelBlock *block = blocks[entry->pointer]; + return &block->voxels[vox.x % 8][vox.y % 8][vox.z % 8]; +} + +VoxelHash::VoxelHash(float resolution) : resolution_(resolution) { + // Allocate CPU memory + table_cpu_.resize(HASH_SIZE); + + // Allocate and init GPU memory + table_gpu_ = table_cpu_; +} + +VoxelHash::~VoxelHash() { + +} + +void VoxelHash::nextFrame() { + // Clear the hash for now... +} + +__global__ static void merge_view_kernel(PtrStepSz<cv::Point3f> cloud, PtrStepSz<uchar3> rgb, PtrStepSz<double> pose, ftl::voxhash::cuda::VoxelHash vhash) { + for (STRIDE_Y(v,cloud.rows) { + for (STRIDE_X(u,cloud.cols) { + cv::Point3f p = cloud.at<cv::Point3f>(v,u); + // TODO Lock hash block on insert. + Voxel *voxel = ftl::voxhash::cuda::insert(vhash.table, make_int3(p.x,p.y,p.z)); + uchar3 colour = rgb.at<uchar3>(v,u); + voxel->colorRGB[0] = colour[0]; + voxel->colorRGB[1] = colour[1]; + voxel->colorRGB[2] = colour[2]; + voxel->weight = 1; + } + } +} + +static void merge_view_call(const PtrStepSz<cv::Point3f> &cloud, const PtrStepSz<uchar3> &rgb, const PtrStepSz<double> &pose, ftl::voxhash::cuda::VoxelHash vhash) { + dim3 grid(1,1,1); + dim3 threads(128, 1, 1); + grid.x = cv::cuda::device::divUp(l.cols, 128); + grid.y = cv::cuda::device::divUp(l.rows, 21); + + merge_view_kernel<<<grid,threads>>>(cloud, rgb, pose, vhash); + cudaSafeCall( cudaGetLastError() ); +} + +void VoxelHash::mergeView(const cv::Mat &cloud, const cv::Mat &rgb, const cv::Mat &pose) { + // Upload GPU Mat + gpu_cloud_.upload(cloud); + gpu_rgb_.upload(rgb); + gpu_pose_.upload(pose); + + // Call kernel to insert each point in hash table + merge_view_call(gpu_cloud_, gpu_rgb_, gpu_pose_, getCUDAStructure()); +} + +// Called per voxel +__global__ static void render_kernel(PtrStepSz<uchar3> out, PtrStepSz<double> cam, ftl::voxhash::cuda::VoxelHash vhash) { + +} + +void VoxelHash::render(cv::Mat &output, const cv::Mat &cam) { + // Call kernel to process every voxel and add to image + // Download gpu mat into output. +}
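
Note on the indexing scheme in voxel_hash.cu: a world-space point is first scaled into voxel units, voxels are grouped into 8x8x8 blocks for hashing, and the remainder selects the voxel within its block. The standalone sketch below is illustrative only and not part of the patch; Int3, worldToVoxel, voxelToBlock and voxelInBlock are hypothetical names, and voxelSize corresponds to "SDFVoxelSize" in config_voxhash.json. It uses floor/mask arithmetic, which stays correct for negative coordinates where truncating `/ 8` and `% 8` do not (e.g. voxels -7..7 would all truncate into block 0).

#include <cmath>

// Illustrative helpers: map a world point to the block/voxel coordinates
// used by lookupBlock/insert above.
struct Int3 { int x, y, z; };

inline Int3 worldToVoxel(float px, float py, float pz, float voxelSize) {
	// floor keeps negative coordinates consistent; truncating division
	// would mis-bin points on the negative side of each axis.
	return { (int)std::floor(px / voxelSize),
	         (int)std::floor(py / voxelSize),
	         (int)std::floor(pz / voxelSize) };
}

inline Int3 voxelToBlock(Int3 v) {
	// Arithmetic right shift is floor division by 8 on two's-complement ints.
	return { v.x >> 3, v.y >> 3, v.z >> 3 };
}

inline Int3 voxelInBlock(Int3 v) {
	// Masking yields the 0..7 intra-block offset, also safe for negatives.
	return { v.x & 7, v.y & 7, v.z & 7 };
}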
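For completeness, a minimal host-side sketch of driving the VoxelHash API introduced above. The frame dimensions and identity pose are placeholders; in the pipeline the cloud and colour mats would come from an RGB-D source rather than being allocated empty, and the pose is currently unused by the insert kernel.

#include <opencv2/opencv.hpp>
#include <ftl/voxel_hash.hpp>

int main() {
	// 0.01 matches "SDFVoxelSize" in config_voxhash.json.
	ftl::voxhash::VoxelHash vhash(0.01f);

	cv::Mat cloud(480, 640, CV_32FC3);          // one world-space point per pixel
	cv::Mat rgb(480, 640, CV_8UC3);             // matching colour image
	cv::Mat pose = cv::Mat::eye(4, 4, CV_64F);  // camera-to-world transform

	vhash.mergeView(cloud, rgb, pose);  // uploads the mats and runs the insert kernel
	vhash.nextFrame();                  // per-frame reset hook (still a stub)
	return 0;
}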