Compare revisions: nicolaspope/ftl
Commits on Source (11)
Showing with 7147 additions and 60 deletions
@@ -14,15 +14,14 @@ stages:
 # paths:
 # - build/

-docker:
+linux:
   stage: all
   tags:
-    - docker
-  image: ubuntu:18.04
-  before_script:
-    - export DEBIAN_FRONTEND=noninteractive
-    - apt-get update -qq && apt-get install -y -qq g++ cmake git
-    - apt-get install -y -qq libopencv-dev libgoogle-glog-dev liburiparser-dev libreadline-dev libmsgpack-dev uuid-dev libpcl-dev
+# before_script:
+# - export DEBIAN_FRONTEND=noninteractive
+# - apt-get update -qq && apt-get install -y -qq g++ cmake git
+# - apt-get install -y -qq libopencv-dev libgoogle-glog-dev liburiparser-dev libreadline-dev libmsgpack-dev uuid-dev libpcl-dev
   script:
     - mkdir build
     - cd build
...
 # Need to include staged files and libs
-include_directories(${PROJECT_SOURCE_DIR}/applications/reconstruct/include)
+#include_directories(${PROJECT_SOURCE_DIR}/reconstruct/include)
 #include_directories(${PROJECT_BINARY_DIR})

 set(REPSRC
   src/main.cpp
+  src/scene_rep_hash_sdf.cu
+  src/ray_cast_sdf.cu
+  src/camera_util.cu
+  src/ray_cast_sdf.cpp
   src/registration.cpp
 )

@@ -11,6 +15,15 @@ add_executable(ftl-reconstruct ${REPSRC})
 add_dependencies(ftl-reconstruct ftlnet)
 add_dependencies(ftl-reconstruct ftlcommon)

+target_include_directories(ftl-reconstruct PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+  PRIVATE src)
+
+if (CUDA_FOUND)
+  set_property(TARGET ftl-reconstruct PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
 #target_include_directories(cv-node PUBLIC ${PROJECT_SOURCE_DIR}/include)
 target_link_libraries(ftl-reconstruct ftlcommon ftlrgbd Threads::Threads ${OpenCV_LIBS} glog::glog ftlnet ftlrender)
...
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/cuda_SimpleMatrixUtil.h
#pragma once
#ifndef _CUDA_SIMPLE_MATRIX_UTIL_
#define _CUDA_SIMPLE_MATRIX_UTIL_
#ifndef __CUDACC__
inline float __int_as_float(int a) {
union {int a; float b;} u;
u.a = a;
return u.b;
}
#endif
#define MINF __int_as_float(0xff800000)
#define INF __int_as_float(0x7f800000)
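// 0xff800000 and 0x7f800000 are the IEEE-754 single-precision bit patterns for
// -infinity and +infinity; reinterpreting them via __int_as_float gives
// infinity constants usable in both host and device code.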
#include <iostream>
#include <ftl/cuda_util.hpp>
//////////////////////////////
// float2x2
//////////////////////////////
class float2x2
{
public:
inline __device__ __host__ float2x2()
{
}
inline __device__ __host__ float2x2(const float values[4])
{
m11 = values[0]; m12 = values[1];
m21 = values[2]; m22 = values[3];
}
inline __device__ __host__ float2x2(const float2x2& other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
}
inline __device__ __host__ void setZero()
{
m11 = 0.0f; m12 = 0.0f;
m21 = 0.0f; m22 = 0.0f;
}
static inline __device__ __host__ float2x2 getIdentity()
{
float2x2 res;
res.setZero();
res.m11 = res.m22 = 1.0f;
return res;
}
inline __device__ __host__ float2x2& operator=(const float2x2 &other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
return *this;
}
inline __device__ __host__ float2x2 getInverse()
{
float2x2 res;
res.m11 = m22; res.m12 = -m12;
res.m21 = -m21; res.m22 = m11;
return res*(1.0f/det());
}
inline __device__ __host__ float det()
{
return m11*m22-m21*m12;
}
inline __device__ __host__ float2 operator*(const float2& v) const
{
return make_float2(m11*v.x + m12*v.y, m21*v.x + m22*v.y);
}
//! matrix scalar multiplication
inline __device__ __host__ float2x2 operator*(const float t) const
{
float2x2 res;
res.m11 = m11 * t; res.m12 = m12 * t;
res.m21 = m21 * t; res.m22 = m22 * t;
return res;
}
//! matrix matrix multiplication
inline __device__ __host__ float2x2 operator*(const float2x2& other) const
{
float2x2 res;
res.m11 = m11 * other.m11 + m12 * other.m21;
res.m12 = m11 * other.m12 + m12 * other.m22;
res.m21 = m21 * other.m11 + m22 * other.m21;
res.m22 = m21 * other.m12 + m22 * other.m22;
return res;
}
//! matrix matrix addition
inline __device__ __host__ float2x2 operator+(const float2x2& other) const
{
float2x2 res;
res.m11 = m11 + other.m11;
res.m12 = m12 + other.m12;
res.m21 = m21 + other.m21;
res.m22 = m22 + other.m22;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12;
float m21; float m22;
};
float entries[4];
float entries2[2][2];
};
};
//////////////////////////////
// float2x3
//////////////////////////////
class float2x3
{
public:
inline __device__ __host__ float2x3()
{
}
inline __device__ __host__ float2x3(const float values[6])
{
m11 = values[0]; m12 = values[1]; m13 = values[2];
m21 = values[3]; m22 = values[4]; m23 = values[5];
}
inline __device__ __host__ float2x3(const float2x3& other)
{
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
}
inline __device__ __host__ float2x3& operator=(const float2x3 &other)
{
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
return *this;
}
inline __device__ __host__ float2 operator*(const float3 &v) const
{
return make_float2(m11*v.x + m12*v.y + m13*v.z, m21*v.x + m22*v.y + m23*v.z);
}
//! matrix scalar multiplication
inline __device__ __host__ float2x3 operator*(const float t) const
{
float2x3 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t;
return res;
}
//! matrix scalar division
inline __device__ __host__ float2x3 operator/(const float t) const
{
float2x3 res;
res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t;
res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12; float m13;
float m21; float m22; float m23;
};
float entries[6];
float entries2[2][3];
};
};
//////////////////////////////
// float3x2
//////////////////////////////
class float3x2
{
public:
inline __device__ __host__ float3x2()
{
}
inline __device__ __host__ float3x2(const float values[6])
{
m11 = values[0]; m12 = values[1];
m21 = values[2]; m22 = values[3];
m31 = values[4]; m32 = values[5];
}
inline __device__ __host__ float3x2& operator=(const float3x2& other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
m31 = other.m31; m32 = other.m32;
return *this;
}
inline __device__ __host__ float3 operator*(const float2& v) const
{
return make_float3(m11*v.x + m12*v.y, m21*v.x + m22*v.y, m31*v.x + m32*v.y);
}
inline __device__ __host__ float3x2 operator*(const float t) const
{
float3x2 res;
res.m11 = m11 * t; res.m12 = m12 * t;
res.m21 = m21 * t; res.m22 = m22 * t;
res.m31 = m31 * t; res.m32 = m32 * t;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ float2x3 getTranspose()
{
float2x3 res;
res.m11 = m11; res.m12 = m21; res.m13 = m31;
res.m21 = m12; res.m22 = m22; res.m23 = m32;
return res;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12;
float m21; float m22;
float m31; float m32;
};
float entries[6];
float entries2[3][2];
};
};
inline __device__ __host__ float2x2 matMul(const float2x3& m0, const float3x2& m1)
{
float2x2 res;
res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31;
res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32;
res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31;
res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32;
return res;
}
class float3x3 {
public:
inline __device__ __host__ float3x3() {
}
inline __device__ __host__ float3x3(const float values[9]) {
m11 = values[0]; m12 = values[1]; m13 = values[2];
m21 = values[3]; m22 = values[4]; m23 = values[5];
m31 = values[6]; m32 = values[7]; m33 = values[8];
}
inline __device__ __host__ float3x3(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
explicit inline __device__ __host__ float3x3(const float2x2& other) {
m11 = other.m11; m12 = other.m12; m13 = 0.0;
m21 = other.m21; m22 = other.m22; m23 = 0.0;
m31 = 0.0; m32 = 0.0; m33 = 0.0;
}
inline __device__ __host__ float3x3& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
return *this;
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
static inline __device__ __host__ void swap(float& v0, float& v1) {
float tmp = v0;
v0 = v1;
v1 = tmp;
}
inline __device__ __host__ void transpose() {
swap(m12, m21);
swap(m13, m31);
swap(m23, m32);
}
inline __device__ __host__ float3x3 getTranspose() const {
float3x3 ret = *this;
ret.transpose();
return ret;
}
//! inverts the matrix
inline __device__ __host__ void invert() {
*this = getInverse();
}
//! computes the inverse of the matrix; the result is returned
inline __device__ __host__ float3x3 getInverse() const {
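// Adjugate (transposed cofactor) matrix, scaled by 1/det() below.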
float3x3 res;
res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7];
res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7];
res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4];
res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6];
res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6];
res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3];
res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6];
res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6];
res.entries[8] = entries[0]*entries[4] - entries[1]*entries[3];
float invDet = 1.0f/det();
return res * invDet;
}
inline __device__ __host__ void setZero(float value = 0.0f) {
m11 = m12 = m13 = value;
m21 = m22 = m23 = value;
m31 = m32 = m33 = value;
}
inline __device__ __host__ float det() const {
return
+ m11*m22*m33
+ m12*m23*m31
+ m13*m21*m32
- m31*m22*m13
- m32*m23*m11
- m33*m21*m12;
}
inline __device__ __host__ float3 getRow(unsigned int i) {
return make_float3(entries[3*i+0], entries[3*i+1], entries[3*i+2]);
}
inline __device__ __host__ void setRow(unsigned int i, float3& r) {
entries[3*i+0] = r.x;
entries[3*i+1] = r.y;
entries[3*i+2] = r.z;
}
inline __device__ __host__ void normalizeRows()
{
//#pragma unroll 3
for(unsigned int i = 0; i<3; i++)
{
float3 r = getRow(i);
float l = length(r);
r.x /= l;
r.y /= l;
r.z /= l;
setRow(i, r);
}
}
//! computes the product of two matrices (result stored in this)
inline __device__ __host__ void mult(const float3x3 &other) {
float3x3 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33;
*this = res;
}
//! computes the sum of two matrices (result stored in this)
inline __device__ __host__ void add(const float3x3 &other) {
m11 += other.m11; m12 += other.m12; m13 += other.m13;
m21 += other.m21; m22 += other.m22; m23 += other.m23;
m31 += other.m31; m32 += other.m32; m33 += other.m33;
}
//! standard matrix matrix multiplication
inline __device__ __host__ float3x3 operator*(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33;
return res;
}
//! standard matrix matrix multiplication
inline __device__ __host__ float3x2 operator*(const float3x2 &other) const {
float3x2 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
return res;
}
inline __device__ __host__ float3 operator*(const float3 &v) const {
return make_float3(
m11*v.x + m12*v.y + m13*v.z,
m21*v.x + m22*v.y + m23*v.z,
m31*v.x + m32*v.y + m33*v.z
);
}
inline __device__ __host__ float3x3 operator*(const float t) const {
float3x3 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t;
res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t;
return res;
}
inline __device__ __host__ float3x3 operator+(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 + other.m11; res.m12 = m12 + other.m12; res.m13 = m13 + other.m13;
res.m21 = m21 + other.m21; res.m22 = m22 + other.m22; res.m23 = m23 + other.m23;
res.m31 = m31 + other.m31; res.m32 = m32 + other.m32; res.m33 = m33 + other.m33;
return res;
}
inline __device__ __host__ float3x3 operator-(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 - other.m11; res.m12 = m12 - other.m12; res.m13 = m13 - other.m13;
res.m21 = m21 - other.m21; res.m22 = m22 - other.m22; res.m23 = m23 - other.m23;
res.m31 = m31 - other.m31; res.m32 = m32 - other.m32; res.m33 = m33 - other.m33;
return res;
}
static inline __device__ __host__ float3x3 getIdentity() {
float3x3 res;
res.setZero();
res.m11 = res.m22 = res.m33 = 1.0f;
return res;
}
static inline __device__ __host__ float3x3 getZeroMatrix() {
float3x3 res;
res.setZero();
return res;
}
static inline __device__ __host__ float3x3 getDiagonalMatrix(float diag = 1.0f) {
float3x3 res;
res.m11 = diag; res.m12 = 0.0f; res.m13 = 0.0f;
res.m21 = 0.0f; res.m22 = diag; res.m23 = 0.0f;
res.m31 = 0.0f; res.m32 = 0.0f; res.m33 = diag;
return res;
}
static inline __device__ __host__ float3x3 tensorProduct(const float3 &v, const float3 &vt) {
float3x3 res;
res.m11 = v.x * vt.x; res.m12 = v.x * vt.y; res.m13 = v.x * vt.z;
res.m21 = v.y * vt.x; res.m22 = v.y * vt.y; res.m23 = v.y * vt.z;
res.m31 = v.z * vt.x; res.m32 = v.z * vt.y; res.m33 = v.z * vt.z;
return res;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13;
float m21; float m22; float m23;
float m31; float m32; float m33;
};
float entries[9];
float entries2[3][3];
};
};
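// Example (a sketch, not from the original header): composing and applying a
// float3x3 using the operators defined above.
//   float3x3 R = float3x3::getIdentity();
//   float3 p = make_float3(1.0f, 2.0f, 3.0f);
//   float3 q = R * p;                 // identity transform: q == p
//   float3x3 Rinv = R.getInverse();   // valid only when det() != 0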
inline __device__ __host__ float2x3 matMul(const float2x3& m0, const float3x3& m1)
{
float2x3 res;
res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31;
res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32;
res.m13 = m0.m11*m1.m13+m0.m12*m1.m23+m0.m13*m1.m33;
res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31;
res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32;
res.m23 = m0.m21*m1.m13+m0.m22*m1.m23+m0.m23*m1.m33;
return res;
}
// (1x2) row matrix as float2
inline __device__ __host__ float3 matMul(const float2& m0, const float2x3& m1)
{
float3 res;
res.x = m0.x*m1.m11+m0.y*m1.m21;
res.y = m0.x*m1.m12+m0.y*m1.m22;
res.z = m0.x*m1.m13+m0.y*m1.m23;
return res;
}
class float3x4 {
public:
inline __device__ __host__ float3x4() {
}
inline __device__ __host__ float3x4(const float values[12]) {
m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3];
m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7];
m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11];
}
inline __device__ __host__ float3x4(const float3x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
}
inline __device__ __host__ float3x4(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
}
inline __device__ __host__ float3x4& operator=(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
return *this;
}
inline __device__ __host__ float3x4& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
return *this;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1)
inline __device__ __host__ float4 operator*(const float4 &v) const {
return make_float4(
m11*v.x + m12*v.y + m13*v.z + m14*v.w,
m21*v.x + m22*v.y + m23*v.z + m24*v.w,
m31*v.x + m32*v.y + m33*v.z + m34*v.w,
v.w
);
}
//! assumes an implicit 1 in w component of the input vector
inline __device__ __host__ float3 operator*(const float3 &v) const {
return make_float3(
m11*v.x + m12*v.y + m13*v.z + m14,
m21*v.x + m22*v.y + m23*v.z + m24,
m31*v.x + m32*v.y + m33*v.z + m34
);
}
//! matrix scalar multiplication
inline __device__ __host__ float3x4 operator*(const float t) const {
float3x4 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t; res.m14 = m14 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t; res.m24 = m24 * t;
res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t; res.m34 = m34 * t;
return res;
}
inline __device__ __host__ float3x4& operator*=(const float t) {
*this = *this * t;
return *this;
}
//! matrix scalar division
inline __device__ __host__ float3x4 operator/(const float t) const {
float3x4 res;
res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t; res.m14 = m14 / t;
res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t; res.m24 = m24 / t;
res.m31 = m31 / t; res.m32 = m32 / t; res.m33 = m33 / t; res.m34 = m34 / t;
return res;
}
inline __device__ __host__ float3x4& operator/=(const float t) {
*this = *this / t;
return *this;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1)
inline __device__ __host__ float3x4 operator*(const float3x4 &other) const {
float3x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33;
res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33;
res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33;
res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34;
//res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41;
//res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42;
//res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43;
//res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44;
return res;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1); and a (0,0,0) translation of other
inline __device__ __host__ float3x4 operator*(const float3x3 &other) const {
float3x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33;
res.m14 = m14;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33;
res.m24 = m24;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33;
res.m34 = m34;
return res;
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
//! returns the translation part of the matrix
inline __device__ __host__ float3 getTranslation() {
return make_float3(m14, m24, m34);
}
//! sets only the translation part of the matrix (other values remain unchanged)
inline __device__ __host__ void setTranslation(const float3 &t) {
m14 = t.x;
m24 = t.y;
m34 = t.z;
}
//! returns the 3x3 part of the matrix
inline __device__ __host__ float3x3 getFloat3x3() {
float3x3 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33;
return ret;
}
//! sets the 3x3 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x3(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
//! inverts the matrix
inline __device__ __host__ void inverse() {
*this = getInverse();
}
//! computes the inverse of the matrix
inline __device__ __host__ float3x4 getInverse() {
float3x3 A = getFloat3x3();
A.invert();
float3 t = getTranslation();
t = A*t;
float3x4 ret;
ret.setFloat3x3(A);
ret.setTranslation(make_float3(-t.x, -t.y, -t.z)); //float3 doesn't have unary '-'... thank you cuda
return ret;
}
//! prints the matrix; only host
__host__ void print() {
std::cout <<
m11 << " " << m12 << " " << m13 << " " << m14 << std::endl <<
m21 << " " << m22 << " " << m23 << " " << m24 << std::endl <<
m31 << " " << m32 << " " << m33 << " " << m34 << std::endl <<
std::endl;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13; float m14;
float m21; float m22; float m23; float m24;
float m31; float m32; float m33; float m34;
};
float entries[12];
float entries2[3][4];
};
};
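// Note: float3x4 models an affine transform whose last row is implicitly
// (0,0,0,1). getInverse() inverts the 3x3 part and remaps the translation
// accordingly, and assumes that 3x3 part is invertible.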
class float4x4 {
public:
inline __device__ __host__ float4x4() {
}
inline __device__ __host__ float4x4(const float values[16]) {
m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3];
m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7];
m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11];
m41 = values[12]; m42 = values[13]; m43 = values[14]; m44 = values[15];
}
inline __device__ __host__ float4x4(const float4x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
}
// implicitly assumes the last row to be (0,0,0,1)
inline __device__ __host__ float4x4(const float3x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
inline __device__ __host__ float4x4(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
inline __device__ __host__ float4x4& operator=(const float4x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
return *this;
}
inline __device__ __host__ float4x4& operator=(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
return *this;
}
inline __device__ __host__ float4x4& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
return *this;
}
//! not tested
inline __device__ __host__ float4x4 operator*(const float4x4 &other) const {
float4x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31 + m14*other.m41;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32 + m14*other.m42;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33 + m14*other.m43;
res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14*other.m44;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31 + m24*other.m41;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32 + m24*other.m42;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33 + m24*other.m43;
res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24*other.m44;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31 + m34*other.m41;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32 + m34*other.m42;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33 + m34*other.m43;
res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34*other.m44;
res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41;
res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42;
res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43;
res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44;
return res;
}
// untested
inline __device__ __host__ float4 operator*(const float4& v) const
{
return make_float4(
m11*v.x + m12*v.y + m13*v.z + m14*v.w,
m21*v.x + m22*v.y + m23*v.z + m24*v.w,
m31*v.x + m32*v.y + m33*v.z + m34*v.w,
m41*v.x + m42*v.y + m43*v.z + m44*v.w
);
}
// untested
//implicitly assumes w to be 1
inline __device__ __host__ float3 operator*(const float3& v) const
{
return make_float3(
m11*v.x + m12*v.y + m13*v.z + m14*1.0f,
m21*v.x + m22*v.y + m23*v.z + m24*1.0f,
m31*v.x + m32*v.y + m33*v.z + m34*1.0f
);
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
static inline __device__ __host__ void swap(float& v0, float& v1) {
float tmp = v0;
v0 = v1;
v1 = tmp;
}
inline __device__ __host__ void transpose() {
swap(m12, m21);
swap(m13, m31);
swap(m23, m32);
swap(m41, m14);
swap(m42, m24);
swap(m43, m34);
}
inline __device__ __host__ float4x4 getTranspose() const {
float4x4 ret = *this;
ret.transpose();
return ret;
}
inline __device__ __host__ void invert() {
*this = getInverse();
}
//! return the inverse matrix; but does not change the current matrix
inline __device__ __host__ float4x4 getInverse() const {
float inv[16];
inv[0] = entries[5] * entries[10] * entries[15] -
entries[5] * entries[11] * entries[14] -
entries[9] * entries[6] * entries[15] +
entries[9] * entries[7] * entries[14] +
entries[13] * entries[6] * entries[11] -
entries[13] * entries[7] * entries[10];
inv[4] = -entries[4] * entries[10] * entries[15] +
entries[4] * entries[11] * entries[14] +
entries[8] * entries[6] * entries[15] -
entries[8] * entries[7] * entries[14] -
entries[12] * entries[6] * entries[11] +
entries[12] * entries[7] * entries[10];
inv[8] = entries[4] * entries[9] * entries[15] -
entries[4] * entries[11] * entries[13] -
entries[8] * entries[5] * entries[15] +
entries[8] * entries[7] * entries[13] +
entries[12] * entries[5] * entries[11] -
entries[12] * entries[7] * entries[9];
inv[12] = -entries[4] * entries[9] * entries[14] +
entries[4] * entries[10] * entries[13] +
entries[8] * entries[5] * entries[14] -
entries[8] * entries[6] * entries[13] -
entries[12] * entries[5] * entries[10] +
entries[12] * entries[6] * entries[9];
inv[1] = -entries[1] * entries[10] * entries[15] +
entries[1] * entries[11] * entries[14] +
entries[9] * entries[2] * entries[15] -
entries[9] * entries[3] * entries[14] -
entries[13] * entries[2] * entries[11] +
entries[13] * entries[3] * entries[10];
inv[5] = entries[0] * entries[10] * entries[15] -
entries[0] * entries[11] * entries[14] -
entries[8] * entries[2] * entries[15] +
entries[8] * entries[3] * entries[14] +
entries[12] * entries[2] * entries[11] -
entries[12] * entries[3] * entries[10];
inv[9] = -entries[0] * entries[9] * entries[15] +
entries[0] * entries[11] * entries[13] +
entries[8] * entries[1] * entries[15] -
entries[8] * entries[3] * entries[13] -
entries[12] * entries[1] * entries[11] +
entries[12] * entries[3] * entries[9];
inv[13] = entries[0] * entries[9] * entries[14] -
entries[0] * entries[10] * entries[13] -
entries[8] * entries[1] * entries[14] +
entries[8] * entries[2] * entries[13] +
entries[12] * entries[1] * entries[10] -
entries[12] * entries[2] * entries[9];
inv[2] = entries[1] * entries[6] * entries[15] -
entries[1] * entries[7] * entries[14] -
entries[5] * entries[2] * entries[15] +
entries[5] * entries[3] * entries[14] +
entries[13] * entries[2] * entries[7] -
entries[13] * entries[3] * entries[6];
inv[6] = -entries[0] * entries[6] * entries[15] +
entries[0] * entries[7] * entries[14] +
entries[4] * entries[2] * entries[15] -
entries[4] * entries[3] * entries[14] -
entries[12] * entries[2] * entries[7] +
entries[12] * entries[3] * entries[6];
inv[10] = entries[0] * entries[5] * entries[15] -
entries[0] * entries[7] * entries[13] -
entries[4] * entries[1] * entries[15] +
entries[4] * entries[3] * entries[13] +
entries[12] * entries[1] * entries[7] -
entries[12] * entries[3] * entries[5];
inv[14] = -entries[0] * entries[5] * entries[14] +
entries[0] * entries[6] * entries[13] +
entries[4] * entries[1] * entries[14] -
entries[4] * entries[2] * entries[13] -
entries[12] * entries[1] * entries[6] +
entries[12] * entries[2] * entries[5];
inv[3] = -entries[1] * entries[6] * entries[11] +
entries[1] * entries[7] * entries[10] +
entries[5] * entries[2] * entries[11] -
entries[5] * entries[3] * entries[10] -
entries[9] * entries[2] * entries[7] +
entries[9] * entries[3] * entries[6];
inv[7] = entries[0] * entries[6] * entries[11] -
entries[0] * entries[7] * entries[10] -
entries[4] * entries[2] * entries[11] +
entries[4] * entries[3] * entries[10] +
entries[8] * entries[2] * entries[7] -
entries[8] * entries[3] * entries[6];
inv[11] = -entries[0] * entries[5] * entries[11] +
entries[0] * entries[7] * entries[9] +
entries[4] * entries[1] * entries[11] -
entries[4] * entries[3] * entries[9] -
entries[8] * entries[1] * entries[7] +
entries[8] * entries[3] * entries[5];
inv[15] = entries[0] * entries[5] * entries[10] -
entries[0] * entries[6] * entries[9] -
entries[4] * entries[1] * entries[10] +
entries[4] * entries[2] * entries[9] +
entries[8] * entries[1] * entries[6] -
entries[8] * entries[2] * entries[5];
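// Laplace expansion along the first row, reusing the cofactors already
// computed in inv[0], inv[4], inv[8] and inv[12].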
float matrixDet = entries[0] * inv[0] + entries[1] * inv[4] + entries[2] * inv[8] + entries[3] * inv[12];
float matrixDetr = 1.0f / matrixDet;
float4x4 res;
for (unsigned int i = 0; i < 16; i++) {
res.entries[i] = inv[i] * matrixDetr;
}
return res;
}
//! returns the 3x3 part of the matrix
inline __device__ __host__ float3x3 getFloat3x3() {
float3x3 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33;
return ret;
}
//! sets the 3x3 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x3(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
//! sets the matrix to identity
inline __device__ __host__ void setIdentity()
{
m11 = 1.0f; m12 = 0.0f; m13 = 0.0f; m14 = 0.0f;
m21 = 0.0f; m22 = 1.0f; m23 = 0.0f; m24 = 0.0f;
m31 = 0.0f; m32 = 0.0f; m33 = 1.0f; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
//! sets all entries of the matrix to the given value
inline __device__ __host__ void setValue(float v)
{
m11 = v; m12 = v; m13 = v; m14 = v;
m21 = v; m22 = v; m23 = v; m24 = v;
m31 = v; m32 = v; m33 = v; m34 = v;
m41 = v; m42 = v; m43 = v; m44 = v;
}
//! returns the 3x4 part of the matrix
inline __device__ __host__ float3x4 getFloat3x4() {
float3x4 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; ret.m14 = m14;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; ret.m24 = m24;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33; ret.m34 = m34;
return ret;
}
//! sets the 3x4 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x4(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13; float m14;
float m21; float m22; float m23; float m24;
float m31; float m32; float m33; float m34;
float m41; float m42; float m43; float m44;
};
float entries[16];
float entries2[4][4];
};
};
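// Example (a sketch, not from the original header): a float4x4 built from a
// 3x3 rotation part, applied with an implicit w = 1.
//   float4x4 T(float3x3::getIdentity());   // last row becomes (0,0,0,1)
//   float3 p = make_float3(1.0f, 2.0f, 3.0f);
//   float3 q = T * p;                      // w implicitly 1
//   float4x4 Tinv = T.getInverse();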
//////////////////////////////
// matNxM
//////////////////////////////
template<unsigned int N, unsigned int M>
class matNxM
{
public:
//////////////////////////////
// Initialization
//////////////////////////////
inline __device__ __host__ matNxM()
{
}
inline __device__ __host__ matNxM(float* values)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i];
}
inline __device__ __host__ matNxM(const float* values)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i];
}
inline __device__ __host__ matNxM(const matNxM& other)
{
(*this) = other;
}
inline __device__ __host__ matNxM<N,M>& operator=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = other.entries[i];
return *this;
}
inline __device__ __host__ void setZero()
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = 0.0f;
}
inline __device__ __host__ void setIdentity()
{
setZero();
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<min(N, M); i++) entries2D[i][i] = 1.0f;
}
static inline __device__ __host__ matNxM<N, M> getIdentity()
{
matNxM<N, M> R; R.setIdentity();
return R;
}
//////////////////////////////
// Conversion
//////////////////////////////
// declare generic constructors for compile time checking of matrix size
template<class B>
explicit inline __device__ __host__ matNxM(const B& other);
template<class B>
explicit inline __device__ __host__ matNxM(const B& other0, const B& other1);
// declare generic casts for compile time checking of matrix size
inline __device__ __host__ operator float();
inline __device__ __host__ operator float2();
inline __device__ __host__ operator float3();
inline __device__ __host__ operator float4();
inline __device__ __host__ operator float2x2();
inline __device__ __host__ operator float3x3();
inline __device__ __host__ operator float4x4();
//////////////////////////////
// Matrix - Matrix Multiplication
//////////////////////////////
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ matNxM<N,MOther> operator*(const matNxM<NOther,MOther>& other) const
{
cudaAssert(M == NOther);
matNxM<N,MOther> res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
float sum = 0.0f;
__CONDITIONAL_UNROLL__
for(unsigned int k = 0; k<M; k++)
{
sum += (*this)(i, k)*other(k, j);
}
res(i, j) = sum;
}
}
return res;
}
//////////////////////////////
// Matrix - Inversion
//////////////////////////////
inline __device__ __host__ float det() const;
inline __device__ __host__ matNxM<N, M> getInverse() const;
//////////////////////////////
// Matrix - Transpose
//////////////////////////////
inline __device__ __host__ matNxM<M,N> getTranspose() const
{
matNxM<M,N> res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<M; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<N; j++)
{
res(i, j) = (*this)(j, i);
}
}
return res;
}
inline __device__ void printCUDA() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
printf("%f ", (*this)(i, j));
}
printf("\n");
}
}
inline __device__ bool checkMINF() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
if((*this)(i, j) == MINF) return true;
}
}
return false;
}
inline __device__ bool checkINF() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
if((*this)(i, j) == INF) return true;
}
}
return false;
}
inline __device__ bool checkQNAN() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
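// NaN is the only IEEE-754 value that compares unequal to itself.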
if((*this)(i, j) != (*this)(i, j)) return true;
}
}
return false;
}
//////////////////////////////
// Matrix - Matrix Addition
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator+(const matNxM<N,M>& other) const
{
matNxM<N,M> res = (*this);
res+=other;
return res;
}
inline __device__ __host__ matNxM<N,M>& operator+=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] += other.entries[i];
return (*this);
}
//////////////////////////////
// Matrix - Negation
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator-() const
{
matNxM<N,M> res = (*this)*(-1.0f);
return res;
}
//////////////////////////////
// Matrix - Matrix Subtraction
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator-(const matNxM<N,M>& other) const
{
matNxM<N,M> res = (*this);
res-=other;
return res;
}
inline __device__ __host__ matNxM<N,M>& operator-=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] -= other.entries[i];
return (*this);
}
//////////////////////////////
// Matrix - Scalar Multiplication
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator*(const float t) const
{
matNxM<N,M> res = (*this);
res*=t;
return res;
}
inline __device__ __host__ matNxM<N, M>& operator*=(const float t)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] *= t;
return (*this);
}
//////////////////////////////
// Matrix - Scalar Division
//////////////////////////////
inline __device__ __host__ matNxM<N, M> operator/(const float t) const
{
matNxM<N, M> res = (*this);
res/=t;
return res;
}
inline __device__ __host__ matNxM<N, M>& operator/=(const float t)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] /= t;
return (*this);
}
//////////////////////////
// Element Access
//////////////////////////
inline __device__ __host__ unsigned int nRows()
{
return N;
}
inline __device__ __host__ unsigned int nCols()
{
return M;
}
inline __device__ __host__ float& operator()(unsigned int i, unsigned int j)
{
cudaAssert(i<N && j<M);
return entries2D[i][j];
}
inline __device__ __host__ float operator()(unsigned int i, unsigned int j) const
{
cudaAssert(i<N && j<M);
return entries2D[i][j];
}
inline __device__ __host__ float& operator()(unsigned int i)
{
cudaAssert(i<N*M);
return entries[i];
}
inline __device__ __host__ float operator()(unsigned int i) const
{
cudaAssert(i<N*M);
return entries[i];
}
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ void getBlock(unsigned int xStart, unsigned int yStart, matNxM<NOther, MOther>& res) const
{
cudaAssert(xStart+NOther <= N && yStart+MOther <= M);
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<NOther; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
res(i, j) = (*this)(xStart+i, yStart+j);
}
}
}
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ void setBlock(matNxM<NOther, MOther>& input, unsigned int xStart, unsigned int yStart)
{
cudaAssert(xStart+NOther <= N && yStart+MOther <= M);
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<NOther; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
(*this)(xStart+i, yStart+j) = input(i, j);
}
}
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
// Operators
inline __device__ __host__ float norm1DSquared() const
{
cudaAssert(M==1 || N==1);
float sum = 0.0f;
for(unsigned int i = 0; i<max(N, M); i++) sum += entries[i]*entries[i];
return sum;
}
inline __device__ __host__ float norm1D() const
{
return sqrt(norm1DSquared());
}
private:
union
{
float entries[N*M];
float entries2D[N][M];
};
};
//////////////////////////////
// Scalar - Matrix Multiplication
//////////////////////////////
template<unsigned int N, unsigned int M>
inline __device__ __host__ matNxM<N,M> operator*(const float t, const matNxM<N, M>& mat)
{
matNxM<N,M> res = mat;
res*=t;
return res;
}
//////////////////////////////
// Matrix Inversion
//////////////////////////////
template<>
inline __device__ __host__ float matNxM<3, 3>::det() const
{
const float& m11 = entries2D[0][0];
const float& m12 = entries2D[0][1];
const float& m13 = entries2D[0][2];
const float& m21 = entries2D[1][0];
const float& m22 = entries2D[1][1];
const float& m23 = entries2D[1][2];
const float& m31 = entries2D[2][0];
const float& m32 = entries2D[2][1];
const float& m33 = entries2D[2][2];
return m11*m22*m33 + m12*m23*m31 + m13*m21*m32 - m31*m22*m13 - m32*m23*m11 - m33*m21*m12;
}
template<>
inline __device__ __host__ matNxM<3, 3> matNxM<3, 3>::getInverse() const
{
matNxM<3, 3> res;
res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7];
res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7];
res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4];
res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6];
res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6];
res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3];
res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6];
res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6];
res.entries[8] = entries[0]*entries[4] - entries[1]*entries[3];
return res*(1.0f/det());
}
template<>
inline __device__ __host__ float matNxM<2, 2>::det() const
{
return (*this)(0, 0)*(*this)(1, 1)-(*this)(1, 0)*(*this)(0, 1);
}
template<>
inline __device__ __host__ matNxM<2, 2> matNxM<2, 2>::getInverse() const
{
matNxM<2, 2> res;
res(0, 0) = (*this)(1, 1); res(0, 1) = -(*this)(0, 1);
res(1, 0) = -(*this)(1, 0); res(1, 1) = (*this)(0, 0);
return res*(1.0f/det());
}
//////////////////////////////
// Conversion
//////////////////////////////
// To Matrix from floatNxN
template<>
template<>
inline __device__ __host__ matNxM<1, 1>::matNxM(const float& other)
{
entries[0] = other;
}
// To Matrix from floatNxN
template<>
template<>
inline __device__ __host__ matNxM<2, 2>::matNxM(const float2x2& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<4; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<3, 3>::matNxM(const float3x3& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<9; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<4, 4>::matNxM(const float4x4& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<16; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<3, 2>::matNxM(const float3& col0, const float3& col1)
{
entries2D[0][0] = col0.x; entries2D[0][1] = col1.x;
entries2D[1][0] = col0.y; entries2D[1][1] = col1.y;
entries2D[2][0] = col0.z; entries2D[2][1] = col1.z;
}
// To floatNxM from Matrix
template<>
inline __device__ __host__ matNxM<4, 4>::operator float4x4()
{
float4x4 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<16; i++) res.entries[i] = entries[i];
return res;
}
template<>
inline __device__ __host__ matNxM<3, 3>::operator float3x3()
{
float3x3 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<9; i++) res.entries[i] = entries[i];
return res;
}
template<>
inline __device__ __host__ matNxM<2, 2>::operator float2x2()
{
float2x2 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<4; i++) res.entries[i] = entries[i];
return res;
}
// To Matrix from floatN
template<>
template<>
inline __device__ __host__ matNxM<2, 1>::matNxM(const float2& other)
{
entries[0] = other.x;
entries[1] = other.y;
}
template<>
template<>
inline __device__ __host__ matNxM<3, 1>::matNxM(const float3& other)
{
entries[0] = other.x;
entries[1] = other.y;
entries[2] = other.z;
}
template<>
template<>
inline __device__ __host__ matNxM<4, 1>::matNxM(const float4& other)
{
entries[0] = other.x;
entries[1] = other.y;
entries[2] = other.z;
entries[3] = other.w;
}
// To floatN from Matrix
template<>
inline __device__ __host__ matNxM<1, 1>::operator float()
{
return entries[0];
}
template<>
inline __device__ __host__ matNxM<2, 1>::operator float2()
{
return make_float2(entries[0], entries[1]);
}
template<>
inline __device__ __host__ matNxM<3, 1>::operator float3()
{
return make_float3(entries[0], entries[1], entries[2]);
}
template<>
inline __device__ __host__ matNxM<4, 1>::operator float4()
{
return make_float4(entries[0], entries[1], entries[2], entries[3]);
}
//////////////////////////////
// Typedefs
//////////////////////////////
typedef matNxM<9, 3> mat9x3;
typedef matNxM<3, 9> mat3x9;
typedef matNxM<9, 1> mat9x1;
typedef matNxM<1, 9> mat1x9;
typedef matNxM<6, 6> mat6x6;
typedef matNxM<6, 1> mat6x1;
typedef matNxM<1, 6> mat1x6;
typedef matNxM<3, 6> mat3x6;
typedef matNxM<6, 3> mat6x3;
typedef matNxM<4, 4> mat4x4;
typedef matNxM<4, 1> mat4x1;
typedef matNxM<1, 4> mat1x4;
typedef matNxM<3, 3> mat3x3;
typedef matNxM<2, 3> mat2x3;
typedef matNxM<3, 2> mat3x2;
typedef matNxM<2, 2> mat2x2;
typedef matNxM<1, 2> mat1x2;
typedef matNxM<2, 1> mat2x1;
typedef matNxM<1, 3> mat1x3;
typedef matNxM<3, 1> mat3x1;
typedef matNxM<1, 1> mat1x1;
typedef matNxM<4, 4> mat4f;
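// Example (a sketch, not from the original header): the typedefs above combined
// with the generic matNxM operators.
//   mat3x3 A = mat3x3::getIdentity();
//   mat3x1 v(make_float3(1.0f, 2.0f, 3.0f));   // float3 conversion constructor
//   mat3x1 w = A * v;                          // (3x3)*(3x1) -> (3x1)
//   mat1x3 vt = v.getTranspose();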
#endif
\ No newline at end of file
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation and
* any modifications thereto. Any use, reproduction, disclosure, or distribution
* of this software and related documentation without an express license
* agreement from NVIDIA Corporation is strictly prohibited.
*
*/
/*
This file implements common mathematical operations on vector types
(float3, float4 etc.) since these are not provided as standard by CUDA.
The syntax is modelled on the Cg standard library.
*/
#ifndef _FTL_CUDA_OPERATORS_HPP_
#define _FTL_CUDA_OPERATORS_HPP_
#include <cuda_runtime.h>
////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;
#ifndef __CUDACC__
#include <math.h>
inline float fminf(float a, float b)
{
return a < b ? a : b;
}
inline float fmaxf(float a, float b)
{
return a > b ? a : b;
}
inline int max(int a, int b)
{
return a > b ? a : b;
}
inline int min(int a, int b)
{
return a < b ? a : b;
}
inline float rsqrtf(float x)
{
return 1.0f / sqrtf(x);
}
#endif
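// The fallbacks above mirror CUDA built-ins so this header also compiles in
// plain C++ (host-only) translation units.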
// float functions
////////////////////////////////////////////////////////////////////////////////
// lerp
inline __device__ __host__ float lerp(float a, float b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float clamp(float f, float a, float b)
{
return fmaxf(a, fminf(f, b));
}
inline __device__ __host__ int sign(float x) {
int t = x<0 ? -1 : 0;
return x > 0 ? 1 : t;
}
// int2 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ int2 make_int2(float2 f)
{
int2 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); return t;
}
inline __host__ __device__ uint2 make_uint2(int2 i)
{
uint2 t; t.x = static_cast<uint>(i.x); t.y = static_cast<uint>(i.y); return t;
}
// negate
inline __host__ __device__ int2 operator-(int2 &a)
{
return make_int2(-a.x, -a.y);
}
// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
a.x *= s; a.y *= s;
}
// float2 functions
////////////////////////////////////////////////////////////////////////////////
// negate
inline __host__ __device__ float2 operator-(float2 &a)
{
return make_float2(-a.x, -a.y);
}
// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
a.x *= s; a.y *= s;
}
// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float2 operator/(float s, float2 a)
{
return make_float2(s / a.x, s / a.y);
}
inline __host__ __device__ void operator/=(float2 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
return a.x * b.x + a.y * b.y;
}
// length
inline __host__ __device__ float length(float2 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float2 normalize(float2 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float2 floor(const float2 v)
{
return make_float2(floor(v.x), floor(v.y));
}
// reflect
inline __host__ __device__ float2 reflect(float2 i, float2 n)
{
return i - 2.0f * n * dot(n,i);
}
// absolute value
inline __host__ __device__ float2 fabs(float2 v)
{
return make_float2(fabs(v.x), fabs(v.y));
}
inline __device__ __host__ int2 sign(float2 f) {
return make_int2(sign(f.x), sign(f.y));
}
// float3 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ float3 make_float3(int3 i)
{
float3 t; t.x = static_cast<float>(i.x); t.y = static_cast<float>(i.y); t.z = static_cast<float>(i.z); return t;
}
inline __host__ __device__ float3 make_float3(float4 f)
{
return make_float3(f.x,f.y,f.z);
}
inline __host__ __device__ float3 make_float3(uchar3 c)
{
return make_float3(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z));
}
inline __host__ __device__ uchar3 make_uchar3(float3 f)
{
return make_uchar3(static_cast<unsigned char>(f.x), static_cast<unsigned char>(f.y), static_cast<unsigned char>(f.z));
}
inline __host__ __device__ float3 make_float3(float f)
{
return make_float3(f,f,f);
}
// negate
inline __host__ __device__ float3 operator-(const float3 &a)
{
return make_float3(-a.x, -a.y, -a.z);
}
// min
static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
{
return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}
// max
static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}
// addition
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float s)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ float3 operator*(float s, float3 a)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(float3 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float3 operator/(float s, float3 a)
{
return make_float3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(float3 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
// cross product
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
// length
inline __host__ __device__ float length(float3 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float3 normalize(float3 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float3 floor(const float3 v)
{
return make_float3(floor(v.x), floor(v.y), floor(v.z));
}
// reflect
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
return i - 2.0f * n * dot(n,i);
}
// absolute value
inline __host__ __device__ float3 fabs(float3 v)
{
return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
}
inline __device__ __host__ int3 sign(float3 f) {
return make_int3(sign(f.x), sign(f.y), sign(f.z));
}
// float4 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ float4 make_float4(float a)
{
return make_float4(a,a,a,a);
}
inline __host__ __device__ float4 make_float4(float3 f, float a)
{
return make_float4(f.x,f.y,f.z,a);
}
// negate
inline __host__ __device__ float4 operator-(const float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
// min
static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
{
return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
}
// max
static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
}
// addition
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
// subtract
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
// multiply
inline __host__ __device__ float4 operator*(float4 a, float s)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ void operator*=(float4 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s; a.w *= s;
}
// divide
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float4 operator/(float s, float4 a)
{
return make_float4(s / a.x, s / a.y, s / a.z, s / a.w);
}
inline __host__ __device__ void operator/=(float4 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}
// dot product
inline __host__ __device__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
// length
inline __host__ __device__ float length(float4 r)
{
return sqrtf(dot(r, r));
}
// normalize
inline __host__ __device__ float4 normalize(float4 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float4 floor(const float4 v)
{
return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
}
// absolute value
inline __host__ __device__ float4 fabs(float4 v)
{
return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
}
// int3 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ int3 make_int3(float3 f)
{
int3 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); t.z = static_cast<int>(f.z); return t;
}
inline __host__ __device__ int3 make_int3(uint3 i)
{
int3 t; t.x = static_cast<int>(i.x); t.y = static_cast<int>(i.y); t.z = static_cast<int>(i.z); return t;
}
inline __host__ __device__ int3 make_int3(int i)
{
int3 t; t.x = i; t.y = i; t.z = i; return t;
}
// negate
inline __host__ __device__ int3 operator-(const int3 &a)
{
return make_int3(-a.x, -a.y, -a.z);
}
// min
inline __host__ __device__ int3 min(int3 a, int3 b)
{
return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ int3 max(int3 a, int3 b)
{
return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int s)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 a)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(int3 &a, int s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ int3 operator/(int3 a, int3 b)
{
return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ int3 operator/(int3 a, int s)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ int3 operator/(int s, int3 a)
{
return make_int3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(int3 &a, int s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ int clamp(int f, int a, int b)
{
return max(a, min(f, b));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// uint3 functions
////////////////////////////////////////////////////////////////////////////////
// min
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint s)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 a)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(uint3 &a, uint s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
{
return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ uint3 operator/(uint3 a, uint s)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
return make_uint3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(uint3 &a, uint s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
return max(a, min(f, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
#endif // _FTL_CUDA_OPERATORS_HPP_
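// Usage sketch (illustrative only, not part of the original header): with the
// overloads above, CUDA vector types compose like small value types.
//   float3 a = make_float3(1.0f, 2.0f, 3.0f);
//   float3 n = normalize(a);                              // unit-length copy of a
//   float3 r = reflect(n, make_float3(0.0f, 1.0f, 0.0f)); // mirror about +Y
//   float  d = dot(a, clamp(r, -1.0f, 1.0f));             // componentwise clamp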
#pragma once
#ifndef _CUDA_UTIL_
#define _CUDA_UTIL_
#undef max
#undef min
#include <cuda_runtime.h>
#include <ftl/cuda_operators.hpp>
// Enable run time assertion checking in kernel code
#define cudaAssert(condition) do { if (!(condition)) { printf("ASSERT: %s %s\n", #condition, __FILE__); } } while (0)
//#define cudaAssert(condition)
#if defined(__CUDA_ARCH__)
// _Pragma is required here; a bare #pragma cannot appear inside a macro definition.
#define __CONDITIONAL_UNROLL__ _Pragma("unroll")
#else
#define __CONDITIONAL_UNROLL__
#endif
#endif
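// Illustrative sketch (an assumption, not the project's actual definition):
// the cudaSafeCall used throughout these headers is provided elsewhere
// (ftl/cuda_common.hpp); a minimal equivalent checks the CUDA runtime return
// code and aborts with a readable message.
#include <cstdio>
#include <cstdlib>
inline void cudaSafeCallSketch(cudaError_t err, const char *file, int line) {
	// Fail loudly on any runtime error, naming the offending call site.
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error '%s' at %s:%d\n", cudaGetErrorString(err), file, line);
		abort();
	}
}
#define CUDA_SAFE_CALL_SKETCH(call) cudaSafeCallSketch((call), __FILE__, __LINE__)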
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/DepthCameraUtil.h
#pragma once
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/cuda_common.hpp>
#include <ftl/depth_camera_params.hpp>
extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params);
extern __constant__ DepthCameraParams c_depthCameraParams;
struct DepthCameraData {
///////////////
// Host part //
///////////////
__device__ __host__
DepthCameraData() {
/*d_depthData = NULL;
d_colorData = NULL;
d_depthArray = NULL;
d_colorArray = NULL;*/
depth_mat_ = nullptr;
colour_mat_ = nullptr;
depth_tex_ = nullptr;
colour_tex_ = nullptr;
}
__host__
void alloc(const DepthCameraParams& params) { //! todo resizing???
/*cudaSafeCall(cudaMalloc(&d_depthData, sizeof(float) * params.m_imageWidth * params.m_imageHeight));
cudaSafeCall(cudaMalloc(&d_colorData, sizeof(float4) * params.m_imageWidth * params.m_imageHeight));
h_depthChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaSafeCall(cudaMallocArray(&d_depthArray, &h_depthChannelDesc, params.m_imageWidth, params.m_imageHeight));
h_colorChannelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
cudaSafeCall(cudaMallocArray(&d_colorArray, &h_colorChannelDesc, params.m_imageWidth, params.m_imageHeight));*/
std::cout << "Create texture objects: " << params.m_imageWidth << "," << params.m_imageHeight << std::endl;
depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1);
colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4);
depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_);
colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_);
depth_obj_ = depth_tex_->cudaTexture();
colour_obj_ = colour_tex_->cudaTexture();
}
__host__
void updateParams(const DepthCameraParams& params) {
updateConstantDepthCameraParams(params);
}
__host__
void updateData(const cv::Mat &depth, const cv::Mat &rgb) {
depth_mat_->upload(depth);
colour_mat_->upload(rgb);
}
__host__
void free() {
/*if (d_depthData) cudaSafeCall(cudaFree(d_depthData));
if (d_colorData) cudaSafeCall(cudaFree(d_colorData));
if (d_depthArray) cudaSafeCall(cudaFreeArray(d_depthArray));
if (d_colorArray) cudaSafeCall(cudaFreeArray(d_colorArray));*/
/*d_depthData = NULL;
d_colorData = NULL;
d_depthArray = NULL;
d_colorArray = NULL;*/
if (depth_mat_) delete depth_mat_;
if (colour_mat_) delete colour_mat_;
delete depth_tex_;
delete colour_tex_;
}
/////////////////
// Device part //
/////////////////
static inline const DepthCameraParams& params() {
return c_depthCameraParams;
}
///////////////////////////////////////////////////////////////
// Camera to Screen
///////////////////////////////////////////////////////////////
__device__
static inline float2 cameraToKinectScreenFloat(const float3& pos) {
//return make_float2(pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx, c_depthCameraParams.my - pos.y*c_depthCameraParams.fy/pos.z);
return make_float2(
pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx,
pos.y*c_depthCameraParams.fy/pos.z + c_depthCameraParams.my);
}
__device__
static inline int2 cameraToKinectScreenInt(const float3& pos) {
float2 pImage = cameraToKinectScreenFloat(pos);
return make_int2(pImage + make_float2(0.5f, 0.5f));
}
__device__
static inline uint2 cameraToKinectScreen(const float3& pos) {
int2 p = cameraToKinectScreenInt(pos);
return make_uint2(p.x, p.y);
}
__device__
static inline float cameraToKinectProjZ(float z) {
return (z - c_depthCameraParams.m_sensorDepthWorldMin)/(c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin);
}
__device__
static inline float3 cameraToKinectProj(const float3& pos) {
float2 proj = cameraToKinectScreenFloat(pos);
float3 pImage = make_float3(proj.x, proj.y, pos.z);
pImage.x = (2.0f*pImage.x - (c_depthCameraParams.m_imageWidth- 1.0f))/(c_depthCameraParams.m_imageWidth- 1.0f);
//pImage.y = (2.0f*pImage.y - (c_depthCameraParams.m_imageHeight-1.0f))/(c_depthCameraParams.m_imageHeight-1.0f);
pImage.y = ((c_depthCameraParams.m_imageHeight-1.0f) - 2.0f*pImage.y)/(c_depthCameraParams.m_imageHeight-1.0f);
pImage.z = cameraToKinectProjZ(pImage.z);
return pImage;
}
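// Worked example (illustrative intrinsics, not from any config): with
// fx = fy = 500, mx = 320, my = 240, the camera-space point (0.5, 0.25, 2)
// projects to pixel (0.5*500/2 + 320, 0.25*500/2 + 240) = (445.0, 302.5).
// cameraToKinectProj then rescales x into [-1,1], flips and rescales y into
// [-1,1], and maps depth linearly onto [0,1] between the sensor limits.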
///////////////////////////////////////////////////////////////
// Screen to Camera (depth in meters)
///////////////////////////////////////////////////////////////
__device__
static inline float3 kinectDepthToSkeleton(uint ux, uint uy, float depth) {
const float x = ((float)ux-c_depthCameraParams.mx) / c_depthCameraParams.fx;
const float y = ((float)uy-c_depthCameraParams.my) / c_depthCameraParams.fy;
//const float y = (c_depthCameraParams.my-(float)uy) / c_depthCameraParams.fy;
return make_float3(depth*x, depth*y, depth);
}
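// Continuing the example above: pixel (445, 302.5) at depth 2 back-projects
// to ((445-320)/500*2, (302.5-240)/500*2, 2) = (0.5, 0.25, 2.0), i.e. this
// function inverts cameraToKinectScreenFloat exactly.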
///////////////////////////////////////////////////////////////
// RenderScreen to Camera -- ATTENTION: assumes a [0,1] Z range!
///////////////////////////////////////////////////////////////
__device__
static inline float kinectProjToCameraZ(float z) {
return z * (c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin) + c_depthCameraParams.m_sensorDepthWorldMin;
}
// z has to be in [0, 1]
__device__
static inline float3 kinectProjToCamera(uint ux, uint uy, float z) {
float fSkeletonZ = kinectProjToCameraZ(z);
return kinectDepthToSkeleton(ux, uy, fSkeletonZ);
}
__device__
static inline bool isInCameraFrustumApprox(const float4x4& viewMatrixInverse, const float3& pos) {
float3 pCamera = viewMatrixInverse * pos;
float3 pProj = cameraToKinectProj(pCamera);
//pProj *= 1.5f; //TODO THIS IS A HACK FIX IT :)
pProj *= 0.95f;
return !(pProj.x < -1.0f || pProj.x > 1.0f || pProj.y < -1.0f || pProj.y > 1.0f || pProj.z < 0.0f || pProj.z > 1.0f);
}
//float* d_depthData; //depth data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor
//float4* d_colorData;
//uchar4* d_colorData; //color data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor
cv::cuda::GpuMat *depth_mat_;
cv::cuda::GpuMat *colour_mat_;
ftl::cuda::TextureObject<float> *depth_tex_;
ftl::cuda::TextureObject<uchar4> *colour_tex_;
cudaTextureObject_t depth_obj_;
cudaTextureObject_t colour_obj_;
// cuda arrays for texture access
/*cudaArray* d_depthArray;
cudaArray* d_colorArray;
cudaChannelFormatDesc h_depthChannelDesc;
cudaChannelFormatDesc h_colorChannelDesc;*/
};
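// Host-side usage sketch (hypothetical flow; variable names are placeholders
// and error handling is omitted):
//   DepthCameraParams params = ...;   // intrinsics and image size
//   DepthCameraData cam;
//   cam.alloc(params);                // create GpuMats and texture objects
//   cam.updateParams(params);         // push params into constant memory
//   cam.updateData(depth, rgb);       // upload one frame (CV_32FC1 / CV_8UC4)
//   ...                               // integrate / ray cast using cam
//   cam.free();                       // release GPU resources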
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDADepthCameraParams.h
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/camera_params.hpp>
struct __align__(16) DepthCameraParams {
float fx;
float fy;
float mx;
float my;
unsigned short m_imageWidth;
unsigned short m_imageHeight;
unsigned int flags;
float m_sensorDepthWorldMin;
float m_sensorDepthWorldMax;
};
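// Example values (illustrative only) for a typical 640x480 depth sensor:
//   DepthCameraParams p;
//   p.fx = 525.0f; p.fy = 525.0f;           // focal lengths in pixels
//   p.mx = 319.5f; p.my = 239.5f;           // principal point
//   p.m_imageWidth = 640; p.m_imageHeight = 480;
//   p.flags = 0;
//   p.m_sensorDepthWorldMin = 0.1f;         // metres
//   p.m_sensorDepthWorldMax = 8.0f;
//   updateConstantDepthCameraParams(p);     // declared in ftl/depth_camera.hpp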
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/MatrixConversion.h
#pragma once
#include <eigen3/Eigen/Eigen>
// #include "d3dx9math.h"
#include <ftl/cuda_matrix_util.hpp>
namespace MatrixConversion
{
//static D3DXMATRIX EigToMat(const Eigen::Matrix4f& mat)
//{
// D3DXMATRIX m(mat.data());
// D3DXMATRIX res; D3DXMatrixTranspose(&res, &m);
// return res;
//}
//static Eigen::Matrix4f MatToEig(const D3DXMATRIX& mat)
//{
// return Eigen::Matrix4f((float*)mat.m).transpose();
//}
/*static mat4f EigToMat(const Eigen::Matrix4f& mat)
{
return mat4f(mat.data()).getTranspose();
}
static Eigen::Matrix4f MatToEig(const mat4f& mat)
{
return Eigen::Matrix4f(mat.ptr()).transpose();
}*/
static Eigen::Vector4f VecH(const Eigen::Vector3f& v)
{
return Eigen::Vector4f(v[0], v[1], v[2], 1.0);
}
static Eigen::Vector3f VecDH(const Eigen::Vector4f& v)
{
return Eigen::Vector3f(v[0]/v[3], v[1]/v[3], v[2]/v[3]);
}
/*static Eigen::Vector3f VecToEig(const vec3f& v)
{
return Eigen::Vector3f(v[0], v[1], v[2]);
}
static vec3f EigToVec(const Eigen::Vector3f& v)
{
return vec3f(v[0], v[1], v[2]);
}
static vec3f EigToVec(const D3DXMATRIX v)
{
return vec3f(v[0], v[1], v[2]);
}*/
// dx/cuda conversion
/*static vec3f toMlib(const D3DXVECTOR3& v) {
return vec3f(v.x, v.y, v.z);
}
static vec4f toMlib(const D3DXVECTOR4& v) {
return vec4f(v.x, v.y, v.z, v.w);
}
static mat4f toMlib(const D3DXMATRIX& m) {
mat4f c((const float*)&m);
return c.getTranspose();
}
static D3DXVECTOR3 toDX(const vec3f& v) {
return D3DXVECTOR3(v.x, v.y, v.z);
}
static D3DXVECTOR4 toDX(const vec4f& v) {
return D3DXVECTOR4(v.x, v.y, v.z, v.w);
}
static D3DXMATRIX toDX(const mat4f& m) {
D3DXMATRIX c((const float*)m.ptr());
D3DXMatrixTranspose(&c, &c);
return c;
}*/
/*static mat4f toMlib(const float4x4& m) {
return mat4f(m.ptr());
}
static vec4f toMlib(const float4& v) {
return vec4f(v.x, v.y, v.z, v.w);
}
static vec3f toMlib(const float3& v) {
return vec3f(v.x, v.y, v.z);
}
static vec4i toMlib(const int4& v) {
return vec4i(v.x, v.y, v.z, v.w);
}
static vec3i toMlib(const int3& v) {
return vec3i(v.x, v.y, v.z);
}
static float4x4 toCUDA(const mat4f& m) {
return float4x4(m.ptr());
}*/
static float4x4 toCUDA(const Eigen::Matrix4f& mat) {
return float4x4(mat.data()).getTranspose();
}
static Eigen::Matrix4f toEigen(const float4x4& m) {
return Eigen::Matrix4f(m.ptr()).transpose();
}
/*static float4 toCUDA(const vec4f& v) {
return make_float4(v.x, v.y, v.z, v.w);
}
static float3 toCUDA(const vec3f& v) {
return make_float3(v.x, v.y, v.z);
}
static int4 toCUDA(const vec4i& v) {
return make_int4(v.x, v.y, v.z, v.w);
}
static int3 toCUDA(const vec3i& v) {
return make_int3(v.x, v.y, v.z);
}*/
// doesn't really work
//template<class FloatType>
//point3d<FloatType> eulerAngles(const Matrix3x3<FloatType>& r) {
// point3d<FloatType> res;
// //check for gimbal lock
// if (r(0,2) == (FloatType)-1.0) {
// FloatType x = 0; //gimbal lock, value of x doesn't matter
// FloatType y = (FloatType)M_PI / 2;
// FloatType z = x + atan2(r(1,0), r(2,0));
// res = point3d<FloatType>(x, y, z);
// } else if (r(0,2) == (FloatType)1.0) {
// FloatType x = 0;
// FloatType y = -(FloatType)M_PI / 2;
// FloatType z = -x + atan2(-r(1,0), -r(2,0));
// res = point3d<FloatType>(x, y, z);
// } else { //two solutions exist
// FloatType x1 = -asin(r(0,2));
// FloatType x2 = (FloatType)M_PI - x1;
// FloatType y1 = atan2(r(1,2) / cos(x1), r(2,2) / cos(x1));
// FloatType y2 = atan2(r(1,2) / cos(x2), r(2,2) / cos(x2));
// FloatType z2 = atan2(r(0,1) / cos(x2), r(0,0) / cos(x2));
// FloatType z1 = atan2(r(0,1) / cos(x1), r(0,0) / cos(x1));
// //choose one solution to return
// //for example the "shortest" rotation
// if ((std::abs(x1) + std::abs(y1) + std::abs(z1)) <= (std::abs(x2) + std::abs(y2) + std::abs(z2))) {
// res = point3d<FloatType>(x1, y1, z1);
// } else {
// res = point3d<FloatType>(x2, y2, z2);
// }
// }
// res.x = math::radiansToDegrees(-res.x);
// res.y = math::radiansToDegrees(-res.y);
// res.z = math::radiansToDegrees(-res.z);
// return res;
//}
}
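// Usage sketch: Eigen matrices are column-major while float4x4 here is
// row-major, hence the transpose in both conversions; a round trip should
// reproduce the input pose.
//   Eigen::Matrix4f pose = Eigen::Matrix4f::Identity();
//   float4x4 cuPose = MatrixConversion::toCUDA(pose);
//   Eigen::Matrix4f back = MatrixConversion::toEigen(cuPose); // back == pose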
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
struct __align__(16) RayCastParams {
float4x4 m_viewMatrix;
float4x4 m_viewMatrixInverse;
float4x4 m_intrinsics;
float4x4 m_intrinsicsInverse;
unsigned int m_width;
unsigned int m_height;
unsigned int m_numOccupiedSDFBlocks;
unsigned int m_maxNumVertices;
int m_splatMinimum;
float m_minDepth;
float m_maxDepth;
float m_rayIncrement;
float m_thresSampleDist;
float m_thresDist;
bool m_useGradients;
uint dummy0;
};
#pragma once
#include <ftl/configurable.hpp>
#include <ftl/matrix_conversion.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/ray_cast_util.hpp>
#include <nlohmann/json.hpp>
// #include "DX11RayIntervalSplatting.h"
class CUDARayCastSDF : public ftl::Configurable
{
public:
CUDARayCastSDF(nlohmann::json& config) : ftl::Configurable(config) {
create(parametersFromConfig(config));
hash_render_ = config.value("hash_renderer", false);
}
~CUDARayCastSDF(void) {
destroy();
}
static RayCastParams parametersFromConfig(const nlohmann::json& gas) {
RayCastParams params;
params.m_width = gas["adapterWidth"].get<unsigned int>();
params.m_height = gas["adapterHeight"].get<unsigned int>();
params.m_intrinsics = MatrixConversion::toCUDA(Eigen::Matrix4f());
params.m_intrinsicsInverse = MatrixConversion::toCUDA(Eigen::Matrix4f());
params.m_minDepth = gas["sensorDepthMin"].get<float>();
params.m_maxDepth = gas["sensorDepthMax"].get<float>();
params.m_rayIncrement = gas["SDFRayIncrementFactor"].get<float>() * gas["SDFTruncation"].get<float>();
params.m_thresSampleDist = gas["SDFRayThresSampleDistFactor"].get<float>() * params.m_rayIncrement;
params.m_thresDist = gas["SDFRayThresDistFactor"].get<float>() * params.m_rayIncrement;
params.m_useGradients = gas["SDFUseGradients"].get<bool>();
params.m_maxNumVertices = gas["hashNumSDFBlocks"].get<unsigned int>() * 6;
return params;
}
void render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform);
const RayCastData& getRayCastData(void) {
return m_data;
}
const RayCastParams& getRayCastParams() const {
return m_params;
}
// debugging
void convertToCameraSpace(const DepthCameraData& cameraData);
private:
void create(const RayCastParams& params);
void destroy(void);
void rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform); // rasterize
RayCastParams m_params;
RayCastData m_data;
bool hash_render_;
// DX11RayIntervalSplatting m_rayIntervalSplatting;
};
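// Usage sketch (hypothetical; `config` must supply the JSON keys read by
// parametersFromConfig above, and `scene`, `cam` and `pose` are placeholders):
//   CUDARayCastSDF raycaster(config);
//   raycaster.render(scene.getHashData(), scene.getHashParams(), cam, pose);
//   const RayCastData &out = raycaster.getRayCastData(); // depth/normals/colours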
#pragma once
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/ray_cast_params.hpp>
struct RayCastSample
{
float sdf;
float alpha;
uint weight;
//uint3 color;
};
#ifndef MINF
#define MINF __int_as_float(0xff800000)
#endif
extern __constant__ RayCastParams c_rayCastParams;
extern "C" void updateConstantRayCastParams(const RayCastParams& params);
struct RayCastData {
///////////////
// Host part //
///////////////
__device__ __host__
RayCastData() {
d_depth = NULL;
d_depth3 = NULL;
d_normals = NULL;
d_colors = NULL;
//d_vertexBuffer = NULL;
point_cloud_ = nullptr;
d_rayIntervalSplatMinArray = NULL;
d_rayIntervalSplatMaxArray = NULL;
}
#ifndef __CUDACC__
__host__
void allocate(const RayCastParams& params) {
//point_cloud_ = new cv::cuda::GpuMat(params.m_width*params.m_height, 1, CV_32FC3);
cudaSafeCall(cudaMalloc(&d_depth, sizeof(float) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_depth3, sizeof(float3) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_normals, sizeof(float4) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_colors, sizeof(uchar3) * params.m_width * params.m_height));
//cudaSafeCall(cudaMalloc(&point_cloud_, sizeof(float3) * params.m_width * params.m_height));
//printf("Allocate ray cast data: %lld \n", (unsigned long long)point_cloud_);
}
__host__
void updateParams(const RayCastParams& params) {
updateConstantRayCastParams(params);
}
__host__ void download(float3 *points, uchar3 *colours, const RayCastParams& params) const {
//printf("Download: %d,%d\n", params.m_width, params.m_height);
//cudaSafeCall(cudaMemcpy(points, d_depth3, sizeof(float3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(colours, d_colors, sizeof(uchar3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost));
}
__host__
void free() {
//cudaFree(point_cloud_);
cudaFree(d_depth);
cudaFree(d_depth3);
cudaFree(d_normals);
cudaFree(d_colors);
}
#endif
/////////////////
// Device part //
/////////////////
#ifdef __CUDACC__
__device__
const RayCastParams& params() const {
return c_rayCastParams;
}
__device__
float frac(float val) const {
return (val - floorf(val));
}
__device__
float3 frac(const float3& val) const {
return make_float3(frac(val.x), frac(val.y), frac(val.z));
}
__device__
bool trilinearInterpolationSimpleFastFast(const ftl::voxhash::HashData& hash, const float3& pos, float& dist, uchar3& color) const {
const float oSet = c_hashParams.m_virtualVoxelSize;
const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f);
float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos));
dist = 0.0f;
float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
ftl::voxhash::Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; float3 vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y * weight.z *v.sdf; colorFloat+= weight.x * weight.y * weight.z *vColor;
color = make_uchar3(colorFloat.x, colorFloat.y, colorFloat.z);//v.color;
return true;
}
//__device__
//bool trilinearInterpolationSimpleFastFast(const HashData& hash, const float3& pos, float& dist, uchar3& color) const {
// const float oSet = c_hashParams.m_virtualVoxelSize;
// const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f);
// float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos));
// dist = 0.0f;
// Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; dist+= weight.x * weight.y * weight.z *v.sdf;
// color = v.color;
// return true;
//}
__device__
float findIntersectionLinear(float tNear, float tFar, float dNear, float dFar) const
{
return tNear+(dNear/(dNear-dFar))*(tFar-tNear);
}
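// Derivation note: the SDF is assumed linear between the two samples, so the
// zero crossing along the ray satisfies
//   d(t) = dNear + (t - tNear) / (tFar - tNear) * (dFar - dNear) = 0,
// which rearranges to the expression above. For example tNear=1.0, tFar=1.1,
// dNear=0.02, dFar=-0.02 gives t = 1.0 + 0.5 * 0.1 = 1.05.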
static const unsigned int nIterationsBisection = 3;
// d0 near, d1 far
__device__
bool findIntersectionBisection(const ftl::voxhash::HashData& hash, const float3& worldCamPos, const float3& worldDir, float d0, float r0, float d1, float r1, float& alpha, uchar3& color) const
{
float a = r0; float aDist = d0;
float b = r1; float bDist = d1;
float c = 0.0f;
#pragma unroll 1
for(uint i = 0; i<nIterationsBisection; i++)
{
c = findIntersectionLinear(a, b, aDist, bDist);
float cDist;
if(!trilinearInterpolationSimpleFastFast(hash, worldCamPos+c*worldDir, cDist, color)) return false;
if(aDist*cDist > 0.0) { a = c; aDist = cDist; }
else { b = c; bDist = cDist; }
}
alpha = c;
return true;
}
__device__
float3 gradientForPoint(const ftl::voxhash::HashData& hash, const float3& pos) const
{
const float voxelSize = c_hashParams.m_virtualVoxelSize;
float3 offset = make_float3(voxelSize, voxelSize, voxelSize);
float distp00; uchar3 colorp00; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.5f*offset.x, 0.0f, 0.0f), distp00, colorp00);
float dist0p0; uchar3 color0p0; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.5f*offset.y, 0.0f), dist0p0, color0p0);
float dist00p; uchar3 color00p; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.0f, 0.5f*offset.z), dist00p, color00p);
float dist100; uchar3 color100; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.5f*offset.x, 0.0f, 0.0f), dist100, color100);
float dist010; uchar3 color010; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.5f*offset.y, 0.0f), dist010, color010);
float dist001; uchar3 color001; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.0f, 0.5f*offset.z), dist001, color001);
float3 grad = make_float3((distp00-dist100)/offset.x, (dist0p0-dist010)/offset.y, (dist00p-dist001)/offset.z);
float l = length(grad);
if(l == 0.0f) {
return make_float3(0.0f, 0.0f, 0.0f);
}
return -grad/l;
}
__device__
void traverseCoarseGridSimpleSampleAll(const ftl::voxhash::HashData& hash, const DepthCameraData& cameraData, const float3& worldCamPos, const float3& worldDir, const float3& camDir, const int3& dTid, float minInterval, float maxInterval) const
{
const RayCastParams& rayCastParams = c_rayCastParams;
// Last Sample
RayCastSample lastSample; lastSample.sdf = 0.0f; lastSample.alpha = 0.0f; lastSample.weight = 0; // lastSample.color = int3(0, 0, 0);
const float depthToRayLength = 1.0f/camDir.z; // scale factor to convert from depth to ray length
float rayCurrent = depthToRayLength * max(rayCastParams.m_minDepth, minInterval); // Convert depth to raylength
float rayEnd = depthToRayLength * min(rayCastParams.m_maxDepth, maxInterval); // Convert depth to raylength
//float rayCurrent = depthToRayLength * rayCastParams.m_minDepth; // Convert depth to raylength
//float rayEnd = depthToRayLength * rayCastParams.m_maxDepth; // Convert depth to raylength
//printf("MINMAX: %f,%f\n", minInterval, maxInterval);
#pragma unroll 1
while(rayCurrent < rayEnd)
{
float3 currentPosWorld = worldCamPos+rayCurrent*worldDir;
float dist; uchar3 color;
//printf("RAY LOOP: %f,%f,%f\n", currentPosWorld.x, currentPosWorld.y, currentPosWorld.z);
if(trilinearInterpolationSimpleFastFast(hash, currentPosWorld, dist, color))
{
if(lastSample.weight > 0 && lastSample.sdf > 0.0f && dist < 0.0f) // current sample is always valid here
{
float alpha; // = findIntersectionLinear(lastSample.alpha, rayCurrent, lastSample.sdf, dist);
uchar3 color2;
bool b = findIntersectionBisection(hash, worldCamPos, worldDir, lastSample.sdf, lastSample.alpha, dist, rayCurrent, alpha, color2);
float3 currentIso = worldCamPos+alpha*worldDir;
if(b && abs(lastSample.sdf - dist) < rayCastParams.m_thresSampleDist)
{
if(abs(dist) < rayCastParams.m_thresDist)
{
float depth = alpha / depthToRayLength; // Convert ray length back to depth
d_depth[dTid.y*rayCastParams.m_width+dTid.x] = depth;
d_depth3[dTid.y*rayCastParams.m_width+dTid.x] = cameraData.kinectDepthToSkeleton(dTid.x, dTid.y, depth);
d_colors[dTid.y*rayCastParams.m_width+dTid.x] = make_uchar3(color2.x, color2.y, color2.z);
if(rayCastParams.m_useGradients)
{
float3 normal = -gradientForPoint(hash, currentIso);
float4 n = rayCastParams.m_viewMatrix * make_float4(normal, 0.0f);
d_normals[dTid.y*rayCastParams.m_width+dTid.x] = make_float4(n.x, n.y, n.z, 1.0f);
}
return;
}
}
}
lastSample.sdf = dist;
lastSample.alpha = rayCurrent;
// lastSample.color = color;
lastSample.weight = 1;
rayCurrent += rayCastParams.m_rayIncrement;
} else {
lastSample.weight = 0;
rayCurrent += rayCastParams.m_rayIncrement;
}
}
}
#endif // __CUDACC__
union {
float* d_depth;
int* d_depth_i;
};
float3* d_depth3;
float4* d_normals;
uchar3* d_colors;
//float4* d_vertexBuffer; // ray interval splatting triangles, mapped from directx (memory lives there)
float3 *point_cloud_;
cudaArray* d_rayIntervalSplatMinArray;
cudaArray* d_rayIntervalSplatMaxArray;
};
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDASceneRepHashSDF.h
#pragma once
#include <cuda_runtime.h>
#include <ftl/configurable.hpp>
#include <ftl/matrix_conversion.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/depth_camera.hpp>
#include <unordered_set>
//#include "CUDAScan.h"
// #include "CUDATimer.h"
// #include "GlobalAppState.h"
// #include "TimingLog.h"
#define SAFE_DELETE_ARRAY(a) do { delete [] (a); (a) = NULL; } while (0)
extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask);
extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData);
extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams);
extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData);
extern "C" void starveVoxelsKernelCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void garbageCollectIdentifyCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void garbageCollectFreeCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
namespace ftl {
namespace voxhash {
class SceneRep : public ftl::Configurable {
public:
SceneRep(nlohmann::json &config) : Configurable(config) {
REQUIRED({
{"hashNumBuckets", "Desired hash entries divide bucket size", "number"},
{"hashMaxCollisionLinkedListSize", "", "number"},
{"hashNumSDFBlocks", "", "number"},
{"SDFVoxelSize", "Size in meters of one voxel", "number"},
{"SDFMaxIntegrationDistance", "", "number"},
{"SDFTruncation", "Base error size", "number"},
{"SDFTruncationScale", "Error size scale with depth", "number"},
{"SDFIntegrationWeightSample", "", "number"},
{"SDFIntegrationWeightMax", "", "number"}
});
create(parametersFromConfig(config));
}
~SceneRep() {
destroy();
}
static HashParams parametersFromConfig(nlohmann::json &config) {
HashParams params;
// First camera view is set to identity pose to be at the centre of
// the virtual coordinate space.
params.m_rigidTransform.setIdentity();
params.m_rigidTransformInverse.setIdentity();
params.m_hashNumBuckets = config["hashNumBuckets"];
params.m_hashBucketSize = HASH_BUCKET_SIZE;
params.m_hashMaxCollisionLinkedListSize = config["hashMaxCollisionLinkedListSize"];
params.m_SDFBlockSize = SDF_BLOCK_SIZE;
params.m_numSDFBlocks = config["hashNumSDFBlocks"];
params.m_virtualVoxelSize = config["SDFVoxelSize"];
params.m_maxIntegrationDistance = config["SDFMaxIntegrationDistance"];
params.m_truncation = config["SDFTruncation"];
params.m_truncScale = config["SDFTruncationScale"];
params.m_integrationWeightSample = config["SDFIntegrationWeightSample"];
params.m_integrationWeightMax = config["SDFIntegrationWeightMax"];
// Note (Nick): We are not streaming voxels in/out of GPU
//params.m_streamingVoxelExtents = MatrixConversion::toCUDA(gas.s_streamingVoxelExtents);
//params.m_streamingGridDimensions = MatrixConversion::toCUDA(gas.s_streamingGridDimensions);
//params.m_streamingMinGridPos = MatrixConversion::toCUDA(gas.s_streamingMinGridPos);
//params.m_streamingInitialChunkListSize = gas.s_streamingInitialChunkListSize;
return params;
}
void bindDepthCameraTextures(const DepthCameraData& depthCameraData) {
//bindInputDepthColorTextures(depthCameraData);
}
/**
* Note: lastRigidTransform appears to be the estimated camera pose.
* Note: bitMask can be nullptr if not streaming out voxels from GPU
*/
void integrate(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, unsigned int* d_bitMask) {
setLastRigidTransform(lastRigidTransform);
//make the rigid transform available on the GPU
m_hashData.updateParams(m_hashParams);
//allocate all hash blocks which are corresponding to depth map entries
alloc(depthCameraData, depthCameraParams, d_bitMask);
//generate a linear hash array with only occupied entries
compactifyHashEntries(depthCameraData);
//volumetrically integrate the depth data into the depth SDFBlocks
integrateDepthMap(depthCameraData, depthCameraParams);
garbageCollect(depthCameraData);
m_numIntegratedFrames++;
}
void setLastRigidTransform(const Eigen::Matrix4f& lastRigidTransform) {
m_hashParams.m_rigidTransform = MatrixConversion::toCUDA(lastRigidTransform);
m_hashParams.m_rigidTransformInverse = m_hashParams.m_rigidTransform.getInverse();
}
void setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) {
setLastRigidTransform(lastRigidTransform);
compactifyHashEntries(depthCameraData);
}
const Eigen::Matrix4f getLastRigidTransform() const {
return MatrixConversion::toEigen(m_hashParams.m_rigidTransform);
}
/* Nick: To reduce weights between frames */
void nextFrame() {
starveVoxelsKernelCUDA(m_hashData, m_hashParams);
m_numIntegratedFrames = 0;
}
//! resets the hash to the initial state (i.e., clears all data)
void reset() {
m_numIntegratedFrames = 0;
m_hashParams.m_rigidTransform.setIdentity();
m_hashParams.m_rigidTransformInverse.setIdentity();
m_hashParams.m_numOccupiedBlocks = 0;
m_hashData.updateParams(m_hashParams);
resetCUDA(m_hashData, m_hashParams);
}
ftl::voxhash::HashData& getHashData() {
return m_hashData;
}
const HashParams& getHashParams() const {
return m_hashParams;
}
//! debug only!
unsigned int getHeapFreeCount() {
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count+1; //the heap counter indexes the last free entry, so there is one more free block than the value suggests (0 would also be a valid address)
}
//! debug only!
void debugHash() {
HashEntry* hashCPU = new HashEntry[m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets];
unsigned int* heapCPU = new unsigned int[m_hashParams.m_numSDFBlocks];
unsigned int heapCounterCPU;
cudaSafeCall(cudaMemcpy(&heapCounterCPU, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
heapCounterCPU++; //points to the first free entry: number of blocks is one more
cudaSafeCall(cudaMemcpy(heapCPU, m_hashData.d_heap, sizeof(unsigned int)*m_hashParams.m_numSDFBlocks, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashCPU, m_hashData.d_hash, sizeof(HashEntry)*m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets, cudaMemcpyDeviceToHost));
//Check for duplicates
class myint3Voxel {
public:
myint3Voxel() {}
~myint3Voxel() {}
bool operator<(const myint3Voxel& other) const {
if (x == other.x) {
if (y == other.y) {
return z < other.z;
}
return y < other.y;
}
return x < other.x;
}
bool operator==(const myint3Voxel& other) const {
return x == other.x && y == other.y && z == other.z;
}
int x,y,z, i;
int offset;
int ptr;
};
std::unordered_set<unsigned int> pointersFreeHash;
std::vector<unsigned int> pointersFreeVec(m_hashParams.m_numSDFBlocks, 0);
for (unsigned int i = 0; i < heapCounterCPU; i++) {
pointersFreeHash.insert(heapCPU[i]);
pointersFreeVec[heapCPU[i]] = FREE_ENTRY;
}
if (pointersFreeHash.size() != heapCounterCPU) {
throw std::runtime_error("ERROR: duplicate free pointers in heap array");
}
unsigned int numOccupied = 0;
unsigned int numMinusOne = 0;
unsigned int listOverallFound = 0;
std::list<myint3Voxel> l;
//std::vector<myint3Voxel> v;
for (unsigned int i = 0; i < m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets; i++) {
if (hashCPU[i].ptr == -1) {
numMinusOne++;
}
if (hashCPU[i].ptr != -2) {
numOccupied++; // != FREE_ENTRY
myint3Voxel a;
a.x = hashCPU[i].pos.x;
a.y = hashCPU[i].pos.y;
a.z = hashCPU[i].pos.z;
l.push_back(a);
//v.push_back(a);
unsigned int linearBlockSize = m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize;
if (pointersFreeHash.find(hashCPU[i].ptr / linearBlockSize) != pointersFreeHash.end()) {
throw std::runtime_error("ERROR: ptr is on free heap, but also marked as an allocated entry");
}
pointersFreeVec[hashCPU[i].ptr / linearBlockSize] = LOCK_ENTRY;
}
}
unsigned int numHeapFree = 0;
unsigned int numHeapOccupied = 0;
for (unsigned int i = 0; i < m_hashParams.m_numSDFBlocks; i++) {
if (pointersFreeVec[i] == FREE_ENTRY) numHeapFree++;
else if (pointersFreeVec[i] == LOCK_ENTRY) numHeapOccupied++;
else {
throw std::runtime_error("memory leak detected: neither free nor allocated");
}
}
if (numHeapFree + numHeapOccupied == m_hashParams.m_numSDFBlocks) std::cout << "HEAP OK!" << std::endl;
else throw std::runtime_error("HEAP CORRUPTED");
l.sort();
size_t sizeBefore = l.size();
l.unique();
size_t sizeAfter = l.size();
std::cout << "diff: " << sizeBefore - sizeAfter << std::endl;
std::cout << "minOne: " << numMinusOne << std::endl;
std::cout << "numOccupied: " << numOccupied << "\t numFree: " << getHeapFreeCount() << std::endl;
std::cout << "numOccupied + free: " << numOccupied + getHeapFreeCount() << std::endl;
std::cout << "numInFrustum: " << m_hashParams.m_numOccupiedBlocks << std::endl;
SAFE_DELETE_ARRAY(heapCPU);
SAFE_DELETE_ARRAY(hashCPU);
//getchar();
}
private:
void create(const HashParams& params) {
m_hashParams = params;
m_hashData.allocate(m_hashParams);
reset();
}
void destroy() {
m_hashData.free();
}
void alloc(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask) {
//Start Timing
//if (GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
// NOTE (nick): We might want this later...
if (true) {
//allocate until all blocks are allocated
unsigned int prevFree = getHeapFreeCount();
while (1) {
resetHashBucketMutexCUDA(m_hashData, m_hashParams);
allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask);
unsigned int currFree = getHeapFreeCount();
if (prevFree != currFree) {
prevFree = currFree;
}
else {
break;
}
}
}
else {
//this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame)
resetHashBucketMutexCUDA(m_hashData, m_hashParams);
allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask);
}
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeAlloc += m_timer.getElapsedTimeMS(); TimingLog::countTimeAlloc++; }
}
void compactifyHashEntries(const DepthCameraData& depthCameraData) {
//Start Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
//CUDATimer t;
//t.startEvent("fillDecisionArray");
//fillDecisionArrayCUDA(m_hashData, m_hashParams, depthCameraData);
//t.endEvent();
//t.startEvent("prefixSum");
//m_hashParams.m_numOccupiedBlocks =
// m_cudaScan.prefixSum(
// m_hashParams.m_hashNumBuckets*m_hashParams.m_hashBucketSize,
// m_hashData.d_hashDecision,
// m_hashData.d_hashDecisionPrefix);
//t.endEvent();
//t.startEvent("compactifyHash");
//m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
//compactifyHashCUDA(m_hashData, m_hashParams);
//t.endEvent();
//t.startEvent("compactifyAllInOne");
m_hashParams.m_numOccupiedBlocks = compactifyHashAllInOneCUDA(m_hashData, m_hashParams); //this version uses atomics instead of prefix sums, which performs much better
std::cout << "Occ blocks = " << m_hashParams.m_numOccupiedBlocks << std::endl;
m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
//t.endEvent();
//t.evaluate();
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeCompactifyHash += m_timer.getElapsedTimeMS(); TimingLog::countTimeCompactifyHash++; }
//std::cout << "numOccupiedBlocks: " << m_hashParams.m_numOccupiedBlocks << std::endl;
}
void integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) {
//Start Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
integrateDepthMapCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams);
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeIntegrate += m_timer.getElapsedTimeMS(); TimingLog::countTimeIntegrate++; }
}
void garbageCollect(const DepthCameraData& depthCameraData) {
//only perform if enabled by global app state
//if (GlobalAppState::get().s_garbageCollectionEnabled) {
garbageCollectIdentifyCUDA(m_hashData, m_hashParams);
resetHashBucketMutexCUDA(m_hashData, m_hashParams); //needed if linked lists are enabled -> for memory deletion
garbageCollectFreeCUDA(m_hashData, m_hashParams);
//}
}
HashParams m_hashParams;
ftl::voxhash::HashData m_hashData;
//CUDAScan m_cudaScan;
unsigned int m_numIntegratedFrames; //used for garbage collect
// static Timer m_timer;
};
}; // namespace voxhash
}; // namespace ftl
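// Per-frame usage sketch (hypothetical flow; `config`, `cam`, `camParams`,
// `depth`, `rgb`, `pose` and nextFrameAvailable() are placeholders):
//   ftl::voxhash::SceneRep scene(config);               // config supplies the REQUIRED keys
//   while (nextFrameAvailable()) {
//       cam.updateData(depth, rgb);                     // upload new depth + colour
//       scene.integrate(pose, cam, camParams, nullptr); // fuse into the voxel hash
//       scene.nextFrame();                              // decay voxel weights
//   }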
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/VoxelUtilHashSDF.h
#pragma once
#ifndef sint
typedef signed int sint;
#endif
#ifndef uint
typedef unsigned int uint;
#endif
#ifndef slong
typedef signed long slong;
#endif
#ifndef ulong
typedef unsigned long ulong;
#endif
#ifndef uchar
typedef unsigned char uchar;
#endif
#ifndef schar
typedef signed char schar;
#endif
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/voxel_hash_params.hpp>
#include <ftl/depth_camera.hpp>
#define HANDLE_COLLISIONS
#define SDF_BLOCK_SIZE 8
#define HASH_BUCKET_SIZE 10
#ifndef MINF
#define MINF __int_as_float(0xff800000)
#endif
#ifndef PINF
#define PINF __int_as_float(0x7f800000)
#endif
extern __constant__ ftl::voxhash::HashParams c_hashParams;
extern "C" void updateConstantHashParams(const ftl::voxhash::HashParams& hashParams);
namespace ftl {
namespace voxhash {
//status flags for hash entries
static const int LOCK_ENTRY = -1;
static const int FREE_ENTRY = -2;
static const int NO_OFFSET = 0;
struct __align__(16) HashEntry
{
int3 pos; //hash position (lower left corner of SDFBlock)
int ptr; //pointer into heap to SDFBlock
uint offset; //offset for collisions
uint flags; //for interframe checks (Nick)
__device__ void operator=(const struct HashEntry& e) {
((long long*)this)[0] = ((const long long*)&e)[0];
((long long*)this)[1] = ((const long long*)&e)[1];
//((int*)this)[4] = ((const int*)&e)[4];
((long long*)this)[2] = ((const long long*)&e)[2];
}
};
struct __align__(8) Voxel {
float sdf; //signed distance function
uchar3 color; //color
uchar weight; //accumulated sdf weight
__device__ void operator=(const struct Voxel& v) {
((long long*)this)[0] = ((const long long*)&v)[0];
}
};
/**
* Voxel Hash Table structure and operations. Works on both CPU and GPU with
* host <-> device transfer included.
*/
struct HashData {
///////////////
// Host part //
///////////////
__device__ __host__
HashData() {
d_heap = NULL;
d_heapCounter = NULL;
d_hash = NULL;
d_hashDecision = NULL;
d_hashDecisionPrefix = NULL;
d_hashCompactified = NULL;
d_hashCompactifiedCounter = NULL;
d_SDFBlocks = NULL;
d_hashBucketMutex = NULL;
m_bIsOnGPU = false;
}
__host__
void allocate(const HashParams& params, bool dataOnGPU = true) {
m_bIsOnGPU = dataOnGPU;
if (m_bIsOnGPU) {
cudaSafeCall(cudaMalloc(&d_heap, sizeof(unsigned int) * params.m_numSDFBlocks));
cudaSafeCall(cudaMalloc(&d_heapCounter, sizeof(unsigned int)));
cudaSafeCall(cudaMalloc(&d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashDecision, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashDecisionPrefix, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashCompactifiedCounter, sizeof(int)));
cudaSafeCall(cudaMalloc(&d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize));
cudaSafeCall(cudaMalloc(&d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets));
} else {
d_heap = new unsigned int[params.m_numSDFBlocks];
d_heapCounter = new unsigned int[1];
d_hash = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashDecision = new int[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashDecisionPrefix = new int[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashCompactified = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashCompactifiedCounter = new int[1];
d_SDFBlocks = new Voxel[params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize];
d_hashBucketMutex = new int[params.m_hashNumBuckets];
}
updateParams(params);
}
__host__
void updateParams(const HashParams& params) {
if (m_bIsOnGPU) {
updateConstantHashParams(params);
}
}
__host__
void free() {
if (m_bIsOnGPU) {
cudaSafeCall(cudaFree(d_heap));
cudaSafeCall(cudaFree(d_heapCounter));
cudaSafeCall(cudaFree(d_hash));
cudaSafeCall(cudaFree(d_hashDecision));
cudaSafeCall(cudaFree(d_hashDecisionPrefix));
cudaSafeCall(cudaFree(d_hashCompactified));
cudaSafeCall(cudaFree(d_hashCompactifiedCounter));
cudaSafeCall(cudaFree(d_SDFBlocks));
cudaSafeCall(cudaFree(d_hashBucketMutex));
} else {
if (d_heap) delete[] d_heap;
if (d_heapCounter) delete[] d_heapCounter;
if (d_hash) delete[] d_hash;
if (d_hashDecision) delete[] d_hashDecision;
if (d_hashDecisionPrefix) delete[] d_hashDecisionPrefix;
if (d_hashCompactified) delete[] d_hashCompactified;
if (d_hashCompactifiedCounter) delete[] d_hashCompactifiedCounter;
if (d_SDFBlocks) delete[] d_SDFBlocks;
if (d_hashBucketMutex) delete[] d_hashBucketMutex;
}
d_hash = NULL;
d_heap = NULL;
d_heapCounter = NULL;
d_hashDecision = NULL;
d_hashDecisionPrefix = NULL;
d_hashCompactified = NULL;
d_hashCompactifiedCounter = NULL;
d_SDFBlocks = NULL;
d_hashBucketMutex = NULL;
}
__host__
HashData copyToCPU() const {
HashParams params;
HashData hashData;
hashData.allocate(params, false); //allocate the data on the CPU
cudaSafeCall(cudaMemcpy(hashData.d_heap, d_heap, sizeof(unsigned int) * params.m_numSDFBlocks, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_heapCounter, d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hash, d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashDecision, d_hashDecision, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashDecisionPrefix, d_hashDecisionPrefix, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashCompactifiedCounter, d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_SDFBlocks, d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashBucketMutex, d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost));
return hashData; //TODO MATTHIAS look at this (i.e., when does memory get destroyed; if it's in the destructor it would kill everything here)
}
/////////////////
// Device part //
/////////////////
//#define __CUDACC__
#ifdef __CUDACC__
__device__
const HashParams& params() const {
return c_hashParams;
}
//! see Teschner et al. (but with correct prime values)
__device__
uint computeHashPos(const int3& virtualVoxelPos) const {
const int p0 = 73856093;
const int p1 = 19349669;
const int p2 = 83492791;
int res = ((virtualVoxelPos.x * p0) ^ (virtualVoxelPos.y * p1) ^ (virtualVoxelPos.z * p2)) % c_hashParams.m_hashNumBuckets;
if (res < 0) res += c_hashParams.m_hashNumBuckets;
return (uint)res;
}
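// Example: an SDF block at position (1, 2, 3) hashes to
// ((1*73856093) ^ (2*19349669) ^ (3*83492791)) mod m_hashNumBuckets; the
// final branch folds the negative results that C++ % can produce back into
// [0, m_hashNumBuckets).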
//merges two voxels (v0 is the currently stored voxel, v1 is the input voxel)
__device__
void combineVoxel(const Voxel &v0, const Voxel& v1, Voxel &out) const {
//v.color = (10*v0.weight * v0.color + v1.weight * v1.color)/(10*v0.weight + v1.weight); //give the currently observed color more weight
//v.color = (v0.weight * v0.color + v1.weight * v1.color)/(v0.weight + v1.weight);
//out.color = 0.5f * (v0.color + v1.color); //exponential running average
float3 c0 = make_float3(v0.color.x, v0.color.y, v0.color.z);
float3 c1 = make_float3(v1.color.x, v1.color.y, v1.color.z);
float3 res = (c0.x+c0.y+c0.z == 0) ? c1 : 0.5f*c0 + 0.5f*c1;
//float3 res = (c0+c1)/2;
//float3 res = (c0 * (float)v0.weight + c1 * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
//float3 res = c1;
out.color.x = (uchar)(res.x+0.5f); out.color.y = (uchar)(res.y+0.5f); out.color.z = (uchar)(res.z+0.5f);
out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight);
}
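// Worked example: v0 = {sdf 0.10, weight 2} merged with v1 = {sdf -0.02,
// weight 1} gives out.sdf = (0.10*2 - 0.02*1) / 3 = 0.06 and out.weight =
// min(m_integrationWeightMax, 3); colour is a 50/50 blend unless the stored
// colour is still black, in which case the incoming colour wins outright.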
//! returns the truncation of the SDF for a given distance value
__device__
float getTruncation(float z) const {
return c_hashParams.m_truncation + c_hashParams.m_truncScale * z;
}
__device__
float3 worldToVirtualVoxelPosFloat(const float3& pos) const {
return pos / c_hashParams.m_virtualVoxelSize;
}
__device__
int3 worldToVirtualVoxelPos(const float3& pos) const {
//const float3 p = pos*g_VirtualVoxelResolutionScalar;
const float3 p = pos / c_hashParams.m_virtualVoxelSize;
return make_int3(p+make_float3(sign(p))*0.5f);
}
__device__
int3 virtualVoxelPosToSDFBlock(int3 virtualVoxelPos) const {
if (virtualVoxelPos.x < 0) virtualVoxelPos.x -= SDF_BLOCK_SIZE-1;
if (virtualVoxelPos.y < 0) virtualVoxelPos.y -= SDF_BLOCK_SIZE-1;
if (virtualVoxelPos.z < 0) virtualVoxelPos.z -= SDF_BLOCK_SIZE-1;
return make_int3(
virtualVoxelPos.x/SDF_BLOCK_SIZE,
virtualVoxelPos.y/SDF_BLOCK_SIZE,
virtualVoxelPos.z/SDF_BLOCK_SIZE);
}
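//The offsets above make the integer division round towards negative infinity
//(floor division): e.g. with SDF_BLOCK_SIZE == 8, voxel x == -1 becomes
//(-1 - 7)/8 == -1, i.e. block -1, whereas plain truncation would wrongly give block 0.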
// Computes virtual voxel position of corner sample position
__device__
int3 SDFBlockToVirtualVoxelPos(const int3& sdfBlock) const {
return sdfBlock*SDF_BLOCK_SIZE;
}
__device__
float3 virtualVoxelPosToWorld(const int3& pos) const {
return make_float3(pos)*c_hashParams.m_virtualVoxelSize;
}
__device__
float3 SDFBlockToWorld(const int3& sdfBlock) const {
return virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock));
}
__device__
int3 worldToSDFBlock(const float3& worldPos) const {
return virtualVoxelPosToSDFBlock(worldToVirtualVoxelPos(worldPos));
}
__device__
bool isSDFBlockInCameraFrustumApprox(const int3& sdfBlock) {
// NOTE (Nick): Changed, just assume all voxels are potentially in frustum
//float3 posWorld = virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock)) + c_hashParams.m_virtualVoxelSize * 0.5f * (SDF_BLOCK_SIZE - 1.0f);
//return DepthCameraData::isInCameraFrustumApprox(c_hashParams.m_rigidTransformInverse, posWorld);
return true;
}
//! computes the (local) virtual voxel pos of an index; idx in [0;511]
__device__
uint3 delinearizeVoxelIndex(uint idx) const {
uint x = idx % SDF_BLOCK_SIZE;
uint y = (idx % (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE)) / SDF_BLOCK_SIZE;
uint z = idx / (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE);
return make_uint3(x,y,z);
}
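//Example: with SDF_BLOCK_SIZE == 8, idx 511 decodes to (7,7,7) and idx 8
//decodes to (0,1,0); linearizeVoxelPos below is the exact inverse mapping.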
//! computes the linearized index of a local virtual voxel pos; pos in [0;7]^3
__device__
uint linearizeVoxelPos(const int3& virtualVoxelPos) const {
return
virtualVoxelPos.z * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE +
virtualVoxelPos.y * SDF_BLOCK_SIZE +
virtualVoxelPos.x;
}
__device__
int virtualVoxelPosToLocalSDFBlockIndex(const int3& virtualVoxelPos) const {
int3 localVoxelPos = make_int3(
virtualVoxelPos.x % SDF_BLOCK_SIZE,
virtualVoxelPos.y % SDF_BLOCK_SIZE,
virtualVoxelPos.z % SDF_BLOCK_SIZE);
if (localVoxelPos.x < 0) localVoxelPos.x += SDF_BLOCK_SIZE;
if (localVoxelPos.y < 0) localVoxelPos.y += SDF_BLOCK_SIZE;
if (localVoxelPos.z < 0) localVoxelPos.z += SDF_BLOCK_SIZE;
return linearizeVoxelPos(localVoxelPos);
}
__device__
int worldToLocalSDFBlockIndex(const float3& world) const {
int3 virtualVoxelPos = worldToVirtualVoxelPos(world);
return virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos);
}
//! returns the hash entry for a given worldPos; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set
__device__
HashEntry getHashEntry(const float3& worldPos) const {
//int3 blockID = worldToSDFVirtualVoxelPos(worldPos)/SDF_BLOCK_SIZE; //position of sdf block
int3 blockID = worldToSDFBlock(worldPos);
return getHashEntryForSDFBlockPos(blockID);
}
__device__
void deleteHashEntry(uint id) {
deleteHashEntry(d_hash[id]);
}
__device__
void deleteHashEntry(HashEntry& hashEntry) {
hashEntry.pos = make_int3(0);
hashEntry.offset = 0;
hashEntry.ptr = FREE_ENTRY;
}
__device__
bool voxelExists(const float3& worldPos) const {
HashEntry hashEntry = getHashEntry(worldPos);
return (hashEntry.ptr != FREE_ENTRY);
}
__device__
void deleteVoxel(Voxel& v) const {
v.color = make_uchar3(0,0,0);
v.weight = 0;
v.sdf = 0.0f;
}
__device__
void deleteVoxel(uint id) {
deleteVoxel(d_SDFBlocks[id]);
}
__device__
Voxel getVoxel(const float3& worldPos) const {
HashEntry hashEntry = getHashEntry(worldPos);
Voxel v;
if (hashEntry.ptr == FREE_ENTRY) {
deleteVoxel(v);
} else {
int3 virtualVoxelPos = worldToVirtualVoxelPos(worldPos);
v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)];
}
return v;
}
__device__
Voxel getVoxel(const int3& virtualVoxelPos) const {
HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos));
Voxel v;
if (hashEntry.ptr == FREE_ENTRY) {
deleteVoxel(v);
} else {
v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)];
}
return v;
}
__device__
void setVoxel(const int3& virtualVoxelPos, Voxel& voxelInput) const {
HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos));
if (hashEntry.ptr != FREE_ENTRY) {
d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)] = voxelInput;
}
}
//! returns the hash entry for a given sdf block id; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set
__device__
HashEntry getHashEntryForSDFBlockPos(const int3& sdfBlock) const
{
uint h = computeHashPos(sdfBlock); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
HashEntry entry;
entry.pos = sdfBlock;
entry.offset = 0;
entry.ptr = FREE_ENTRY;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
HashEntry curr = d_hash[i];
if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) {
return curr;
}
}
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;
int i = idxLastEntryInBucket; //start with the last entry of the current bucket
HashEntry curr;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
curr = d_hash[i];
if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) {
return curr;
}
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif
return entry;
}
//for histogram (no collision traversal)
__device__
unsigned int getNumHashEntriesPerBucket(unsigned int bucketID) {
unsigned int h = 0;
for (uint i = 0; i < HASH_BUCKET_SIZE; i++) {
if (d_hash[bucketID*HASH_BUCKET_SIZE+i].ptr != FREE_ENTRY) {
h++;
}
}
return h;
}
//for histogram (collisions traversal only)
__device__
unsigned int getNumHashLinkedList(unsigned int bucketID) {
unsigned int listLen = 0;
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (bucketID+1)*HASH_BUCKET_SIZE - 1;
unsigned int i = idxLastEntryInBucket; //start with the last entry of the current bucket
//int offset = 0;
HashEntry curr; curr.offset = 0;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
//offset = curr.offset;
//curr = getHashEntry(g_Hash, i);
curr = d_hash[i];
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
listLen++;
maxIter++;
}
#endif
return listLen;
}
__device__
uint consumeHeap() {
uint addr = atomicSub(&d_heapCounter[0], 1);
//TODO MATTHIAS check some error handling?
return d_heap[addr];
}
__device__
void appendHeap(uint ptr) {
uint addr = atomicAdd(&d_heapCounter[0], 1);
//TODO MATTHIAS check some error handling?
d_heap[addr+1] = ptr;
}
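//The heap is a free list used as a stack: d_heapCounter holds the index of the
//topmost valid entry. consumeHeap reads at the pre-decrement counter value and
//appendHeap writes one past the pre-increment value, so the two operations stay
//symmetric under concurrent access (no underflow/overflow checks, as the TODOs note).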
//pos in SDF block coordinates
__device__
void allocBlock(const int3& pos, const uchar frame) {
uint h = computeHashPos(pos); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
int firstEmpty = -1;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
HashEntry& curr = d_hash[i];
//in that case the SDF-block is already allocated and corresponds to the current position -> exit thread
if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
//curr.flags = frame; // Flag block as valid in this frame (Nick)
return;
}
//store the first FREE_ENTRY hash entry
if (firstEmpty == -1 && curr.ptr == FREE_ENTRY) {
firstEmpty = i;
}
}
#ifdef HANDLE_COLLISIONS
//updated variables as after the loop
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; //get last index of bucket
uint i = idxLastEntryInBucket; //start with the last entry of the current bucket
//int offset = 0;
HashEntry curr; curr.offset = 0;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
//int k = 0;
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
//offset = curr.offset;
curr = d_hash[i]; //TODO MATTHIAS do by reference
if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
//curr.flags = frame; // Flag block as valid in this frame (Nick)
return;
}
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif
if (firstEmpty != -1) { //if there is an empty entry and we haven't allocated the current entry before
//int prevValue = 0;
//InterlockedExchange(d_hashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked
HashEntry& entry = d_hash[firstEmpty];
entry.pos = pos;
entry.offset = NO_OFFSET;
entry.flags = 0; // Flag block as valid in this frame (Nick)
entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc
}
return;
}
#ifdef HANDLE_COLLISIONS
//if (i != idxLastEntryInBucket) return;
int offset = 0;
//linear search for free entry
maxIter = 0;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
offset++;
i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //go to next hash element
if ((offset % HASH_BUCKET_SIZE) == 0) continue; //cannot insert into a last bucket element (would conflict with other linked lists)
curr = d_hash[i];
//if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
// return;
//}
if (curr.ptr == FREE_ENTRY) { //this is the first free entry
//int prevValue = 0;
//InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the original hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) {
HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket];
h = i / HASH_BUCKET_SIZE;
//InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket where we have found a free entry
prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked
HashEntry& entry = d_hash[i];
entry.pos = pos;
entry.offset = lastEntryInBucket.offset;
entry.flags = 0; // Flag block as valid in this frame (Nick)
entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc
lastEntryInBucket.offset = offset;
d_hash[idxLastEntryInBucket] = lastEntryInBucket;
//setHashEntry(g_Hash, idxLastEntryInBucket, lastEntryInBucket);
}
}
return; //bucket was already locked
}
maxIter++;
}
#endif
}
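//Allocation strategy above, in summary: (1) scan the home bucket for an existing
//entry or the first free slot; (2) with HANDLE_COLLISIONS, also walk the bucket's
//collision linked list; (3) claim a slot under the per-bucket mutex via atomicExch.
//If the mutex is already held, the allocation silently fails for this call and the
//caller is expected to retry (e.g. when the block is seen again next frame).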
//!inserts a hash entry without allocating any memory: used by streaming: TODO MATTHIAS check the atomics in this function
__device__
bool insertHashEntry(HashEntry entry)
{
uint h = computeHashPos(entry.pos);
uint hp = h * HASH_BUCKET_SIZE;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
//const HashEntry& curr = d_hash[i];
int prevWeight = 0;
//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight);
prevWeight = atomicCAS(&d_hash[i].ptr, FREE_ENTRY, LOCK_ENTRY);
if (prevWeight == FREE_ENTRY) {
d_hash[i] = entry;
//setHashEntry(hash, i, entry);
return true;
}
}
#ifdef HANDLE_COLLISIONS
//updated variables as after the loop
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; //get last index of bucket
uint i = idxLastEntryInBucket; //start with the last entry of the current bucket
HashEntry curr;
unsigned int maxIter = 0;
//[allow_uav_condition]
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) { //traverse list until end //why walk to the end when the new entry could be inserted at the start?
//curr = getHashEntry(hash, i);
curr = d_hash[i]; //TODO MATTHIAS do by reference
if (curr.offset == 0) break; //we have found the end of the list
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
maxIter = 0;
int offset = 0;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) { //linear search for free entry
offset++;
uint i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //go to next hash element
if ((offset % HASH_BUCKET_SIZE) == 0) continue; //cannot insert into a last bucket element (would conflict with other linked lists)
int prevWeight = 0;
//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight); //check for a free entry
uint* d_hashUI = (uint*)d_hash;
prevWeight = atomicCAS(&d_hashUI[3*idxLastEntryInBucket+1], (uint)FREE_ENTRY, (uint)LOCK_ENTRY);
if (prevWeight == FREE_ENTRY) { //if free entry found set prev->next = curr & curr->next = prev->next
//[allow_uav_condition]
//while(hash[3*idxLastEntryInBucket+2] == LOCK_ENTRY); // expects setHashEntry to set the ptr last, required because pos.z is packed into the same value -> prev->next = curr -> might corrupt pos.z
HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket]; //get prev (= lastEntry in Bucket)
int newOffsetPrev = (offset << 16) | (lastEntryInBucket.pos.z & 0x0000ffff); //prev->next = curr (maintain old z-pos)
int oldOffsetPrev = 0;
//InterlockedExchange(hash[3*idxLastEntryInBucket+1], newOffsetPrev, oldOffsetPrev); //set prev offset atomically
oldOffsetPrev = atomicExch(&d_hashUI[3*idxLastEntryInBucket+1], newOffsetPrev); //set prev offset atomically
entry.offset = oldOffsetPrev >> 16; //remove prev z-pos from old offset
//setHashEntry(hash, i, entry); //sets the current hashEntry with: curr->next = prev->next
d_hash[i] = entry;
return true;
}
maxIter++;
}
#endif
return false;
}
//! deletes a hash entry position for a given sdfBlock index (returns true upon successful deletion; otherwise returns false)
__device__
bool deleteHashEntryElement(const int3& sdfBlock) {
uint h = computeHashPos(sdfBlock); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
const HashEntry& curr = d_hash[i];
if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
#ifndef HANDLE_COLLISIONS
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
return true;
#endif
#ifdef HANDLE_COLLISIONS
if (curr.offset != 0) { //if there was a pointer set it to the next list element
//int prevValue = 0;
//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue == LOCK_ENTRY) return false;
if (prevValue != LOCK_ENTRY) {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
int nextIdx = (i + curr.offset) % (HASH_BUCKET_SIZE*c_hashParams.m_hashNumBuckets);
//setHashEntry(hash, i, getHashEntry(hash, nextIdx));
d_hash[i] = d_hash[nextIdx];
deleteHashEntry(nextIdx);
return true;
}
} else {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
return true;
}
#endif //HANDLE_COLLISIONS
}
}
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;
int i = idxLastEntryInBucket;
HashEntry curr;
curr = d_hash[i];
int prevIdx = i;
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
curr = d_hash[i];
//found the entry we want to delete
if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
//int prevValue = 0;
//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue == LOCK_ENTRY) return false;
if (prevValue != LOCK_ENTRY) {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
HashEntry prev = d_hash[prevIdx];
prev.offset = curr.offset;
//setHashEntry(hash, prevIdx, prev);
d_hash[prevIdx] = prev;
return true;
}
}
if (curr.offset == 0) { //we have found the end of the list
return false; //should never happen: the entry should have been found above
}
prevIdx = i;
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif // HANDLE_COLLISIONS
return false;
}
#endif //__CUDACC__
uint* d_heap; //heap that manages free memory
uint* d_heapCounter; //single element; used as an atomic counter (points to the next free block)
int* d_hashDecision; //
int* d_hashDecisionPrefix; //
HashEntry* d_hash; //hash that stores pointers to sdf blocks
HashEntry* d_hashCompactified; //same as before except that only valid pointers are there
int* d_hashCompactifiedCounter; //atomic counter to add compactified entries atomically
Voxel* d_SDFBlocks; //sub-blocks that contain 8x8x8 voxels (linearized); are allocated by heap
int* d_hashBucketMutex; //binary flag per hash bucket; used for allocation to atomically lock a bucket
bool m_bIsOnGPU; //the class can be used on both CPU and GPU
};
} // namespace voxhash
} // namespace ftl
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDAHashParams.h
#pragma once
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
namespace ftl {
namespace voxhash {
//TODO might have to be split into static and dynamic parts
struct __align__(16) HashParams {
HashParams() {
}
float4x4 m_rigidTransform;
float4x4 m_rigidTransformInverse;
unsigned int m_hashNumBuckets;
unsigned int m_hashBucketSize;
unsigned int m_hashMaxCollisionLinkedListSize;
unsigned int m_numSDFBlocks;
int m_SDFBlockSize;
float m_virtualVoxelSize;
unsigned int m_numOccupiedBlocks; //occupied blocks in the viewing frustum
float m_maxIntegrationDistance;
float m_truncScale;
float m_truncation;
unsigned int m_integrationWeightSample;
unsigned int m_integrationWeightMax;
float3 m_streamingVoxelExtents;
int3 m_streamingGridDimensions;
int3 m_streamingMinGridPos;
unsigned int m_streamingInitialChunkListSize;
uint2 m_dummy;
};
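//A minimal sketch of filling these parameters, assuming values in the spirit
//of the original VoxelHashing defaults (illustrative only; not taken from this
//repository's configuration):
//
//  HashParams params;
//  params.m_hashNumBuckets = 500000;                //hash table buckets
//  params.m_hashBucketSize = 10;                    //entries per bucket (HASH_BUCKET_SIZE)
//  params.m_hashMaxCollisionLinkedListSize = 7;     //bound on collision-list traversal
//  params.m_SDFBlockSize = 8;                       //8x8x8 voxels per block (SDF_BLOCK_SIZE)
//  params.m_virtualVoxelSize = 0.004f;              //4 mm voxels
//  params.m_truncation = 0.02f;                     //base SDF truncation band in metres
//  params.m_truncScale = 0.01f;                     //truncation grows linearly with depth
//  params.m_integrationWeightMax = 255;             //cap for combineVoxel's weight sum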
} // namespace voxhash
} // namespace ftl
#ifndef _CAMERA_UTIL_
#define _CAMERA_UTIL_
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#ifndef BYTE
#define BYTE unsigned char
#endif
#define T_PER_BLOCK 16
#define MINF __int_as_float(0xff800000)
#ifndef CUDART_PI_F
#define CUDART_PI_F 3.141592654f
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Copy Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void copyFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = d_input[y*width+x];
}
extern "C" void copyFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
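//The launch configuration above is the pattern used by every wrapper in this
//file: one thread per pixel, with the grid ceil-divided by the block size.
//For example, a 640x480 image with T_PER_BLOCK == 16 launches a 40x30 grid of
//16x16 blocks; the in-kernel bounds check discards any overhang threads when
//the image size is not a multiple of 16.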
__global__ void copyDepthFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float depth = d_input[y*width+x];
if (depth >= minDepth && depth <= maxDepth)
d_output[y*width+x] = depth;
else
d_output[y*width+x] = MINF;
}
extern "C" void copyDepthFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyDepthFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height,minDepth, maxDepth);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize Optimizer Maps
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void initializeOptimizerMapsDevice(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float depth = d_input[y*width+x];
if(d_mask[y*width+x] != MINF) { d_output[y*width+x] = depth; }
else { d_output[y*width+x] = MINF; d_input[y*width+x] = MINF; d_input2[y*width+x] = MINF; }
}
extern "C" void initializeOptimizerMaps(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
initializeOptimizerMapsDevice<<<gridSize, blockSize>>>(d_output, d_input, d_input2, d_mask, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copy Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void copyFloat4MapDevice(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = d_input[y*width+x];
}
extern "C" void copyFloat4Map(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Raw Color to float
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorRawToFloatDevice(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
//uchar4 c = make_uchar4(d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+3]); //note the flip from BGRW to RGBW
uchar4 c = make_uchar4(d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+3]);
if (c.x == 0 && c.y == 0 && c.z == 0) {
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
} else {
d_output[y*width+x] = make_float4(c.x/255.0f, c.y/255.0f, c.z/255.0f, c.w/255.0f);
}
}
extern "C" void convertColorRawToFloat4(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorRawToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Float4 Color to UCHAR4
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorFloat4ToUCHAR4Device(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
float4 color = d_input[y*width+x];
d_output[y*width+x] = make_uchar4(color.x*255.0f, color.y*255.0f, color.z*255.0f, color.w*255.0f);
}
extern "C" void convertColorFloat4ToUCHAR4(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorFloat4ToUCHAR4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mask Color Map using Depth
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void maskColorMapFloat4MapDevice(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
float4 color = d_inputColor[y*width+x];
if(d_inputDepth[y*width+x].x != MINF) d_inputColor[y*width+x] = color;
else d_inputColor[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
}
extern "C" void maskColorMapFloat4Map(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
maskColorMapFloat4MapDevice<<<gridSize, blockSize>>>(d_inputColor, d_inputDepth, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Color to Intensity Float4
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorToIntensityFloat4Device(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float4 color = d_input[y*width+x];
const float I = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
d_output[y*width+x] = make_float4(I, I, I, 1.0f);
}
extern "C" void convertColorToIntensityFloat4(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorToIntensityFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Color to Intensity
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorToIntensityFloatDevice(float* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float4 color = d_input[y*width+x];
d_output[y*width+x] = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
}
extern "C" void convertColorToIntensityFloat(float* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorToIntensityFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert depth map to color map view
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertDepthToColorSpaceDevice(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < depthWidth && y < depthHeight)
{
const float depth = d_input[y*depthWidth+x];
if(depth != MINF && depth < 1.0f)
{
// Cam space depth
float4 depthCamSpace = depthIntrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth);
depthCamSpace = make_float4(depthCamSpace.x, depthCamSpace.y, depthCamSpace.w, 1.0f);
// World Space
const float4 worldSpace = depthExtrinsicsInv*depthCamSpace;
// Cam space color
float4 colorCamSpace = colorExtrinsics*worldSpace;
//colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, 0.0f, colorCamSpace.z);
colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, colorCamSpace.z, 1.0f);
// Get coordinates in color image and set pixel to new depth
const float4 screenSpaceColor = colorIntrinsics*colorCamSpace;
//const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.w + 0.5f);
//const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.w + 0.5f);
const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.z + 0.5f);
const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.z + 0.5f);
//if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.w; // Check for minimum !!!
if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.z; // Check for minimum !!!
}
}
}
extern "C" void convertDepthToColorSpace(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight)
{
const dim3 gridSize((depthWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertDepthToColorSpaceDevice<<<gridSize, blockSize>>>(d_output, d_input, depthIntrinsicsInv, colorIntrinsics, depthExtrinsicsInv, colorExtrinsics, depthWidth, depthHeight, colorWidth, colorHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Set invalid float map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void setInvalidFloatMapDevice(float* d_output, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = MINF;
}
extern "C" void setInvalidFloatMap(float* d_output, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
setInvalidFloatMapDevice<<<gridSize, blockSize>>>(d_output, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Set invalid float4 map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void setInvalidFloat4MapDevice(float4* d_output, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF ,MINF);
}
extern "C" void setInvalidFloat4Map(float4* d_output, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
setInvalidFloat4MapDevice<<<gridSize, blockSize>>>(d_output, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Depth to Camera Space Positions
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertDepthFloatToCameraSpaceFloat3Device(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, DepthCameraData depthCameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < width && y < height) {
d_output[y*width+x] = make_float3(MINF, MINF, MINF);
float depth = d_input[y*width+x];
if(depth != MINF)
{
//float4 cameraSpace(intrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth));
//d_output[y*width+x] = make_float4(cameraSpace.x, cameraSpace.y, cameraSpace.w, 1.0f);
d_output[y*width+x] = depthCameraData.kinectDepthToSkeleton(x, y, depth);
}
}
}
extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertDepthFloatToCameraSpaceFloat3Device<<<gridSize, blockSize>>>(d_output, d_input, intrinsicsInv, width, height, depthCameraData);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral Filter Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float gaussR(float sigma, float dist)
{
return exp(-(dist*dist)/(2.0*sigma*sigma));
}
inline __device__ float linearR(float sigma, float dist)
{
return min(1.0f, max(0.0f, 1.0f-(dist*dist)/(2.0f*sigma*sigma))); //clamped linear falloff in [0,1]
}
inline __device__ float gaussD(float sigma, int x, int y)
{
return exp(-((x*x+y*y)/(2.0f*sigma*sigma)));
}
inline __device__ float gaussD(float sigma, int x)
{
return exp(-((x*x)/(2.0f*sigma*sigma)));
}
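//These weights implement a classic bilateral filter: gaussD is the spatial
//(domain) Gaussian over the pixel offset and gaussR the range Gaussian over
//the depth difference, so the combined weight gaussD(sigmaD,dx,dy)*gaussR(sigmaR,dz)
//smooths depth while preserving discontinuities much larger than sigmaR.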
__global__ void bilateralFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
d_output[y*width+x] = MINF;
float sum = 0.0f;
float sumWeight = 0.0f;
const float depthCenter = d_input[y*width+x];
if(depthCenter != MINF)
{
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float currentDepth = d_input[n*width+m];
if (currentDepth != MINF) {
const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, currentDepth-depthCenter);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
}
extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
bilateralFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral Filter Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void bilateralFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
//d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float sumWeight = 0.0f;
const float4 depthCenter = d_input[y*width+x];
if (depthCenter.x != MINF) {
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float4 currentDepth = d_input[n*width+m];
if (currentDepth.x != MINF) {
const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, length(currentDepth-depthCenter));
sum += weight*currentDepth;
sumWeight += weight;
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void bilateralFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
bilateralFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Gauss Filter Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gaussFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
d_output[y*width+x] = MINF;
float sum = 0.0f;
float sumWeight = 0.0f;
const float depthCenter = d_input[y*width+x];
if(depthCenter != MINF)
{
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float currentDepth = d_input[n*width+m];
if(currentDepth != MINF && fabs(depthCenter-currentDepth) < sigmaR)
{
const float weight = gaussD(sigmaD, m-x, n-y);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void gaussFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
gaussFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Gauss Filter Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gaussFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
//d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float sumWeight = 0.0f;
const float4 depthCenter = d_input[y*width+x];
if (depthCenter.x != MINF) {
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float4 currentDepth = d_input[n*width+m];
if (currentDepth.x != MINF) {
if(length(depthCenter-currentDepth) < sigmaR)
{
const float weight = gaussD(sigmaD, m-x, n-y);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void gaussFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
gaussFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Normal Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeNormalsDevice(float4* d_output, float3* d_input, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < width-1 && y > 0 && y < height-1)
{
const float3 CC = d_input[(y+0)*width+(x+0)];
const float3 PC = d_input[(y+1)*width+(x+0)];
const float3 CP = d_input[(y+0)*width+(x+1)];
const float3 MC = d_input[(y-1)*width+(x+0)];
const float3 CM = d_input[(y+0)*width+(x-1)];
if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF)
{
const float3 n = cross(PC-MC, CP-CM);
const float l = length(n);
if(l > 0.0f)
{
d_output[y*width+x] = make_float4(n/-l, 1.0f);
}
}
}
}
extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeNormalsDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
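//computeNormalsDevice above estimates the surface normal by central differences,
//n = (P(y+1)-P(y-1)) x (P(x+1)-P(x-1)), normalised and negated to face the camera;
//pixels with any invalid neighbour stay MINF. The variant below does the same with
//one-sided differences using only the (y-1) and (x-1) neighbours.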
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Normal Map 2
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeNormalsDevice2(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < width-1 && y > 0 && y < height-1)
{
const float4 CC = d_input[(y+0)*width+(x+0)];
const float4 MC = d_input[(y-1)*width+(x+0)];
const float4 CM = d_input[(y+0)*width+(x-1)];
if(CC.x != MINF && MC.x != MINF && CM.x != MINF)
{
const float3 n = cross(make_float3(MC)-make_float3(CC), make_float3(CM)-make_float3(CC));
const float l = length(n);
if(l > 0.0f)
{
d_output[y*width+x] = make_float4(n/-l, 1.0f);
}
}
}
}
extern "C" void computeNormals2(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeNormalsDevice2<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Shading Value
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ void evaluateLightingModelTerms(float* d_out, float4 n)
{
d_out[0] = 1.0;
d_out[1] = n.y;
d_out[2] = n.z;
d_out[3] = n.x;
d_out[4] = n.x*n.y;
d_out[5] = n.y*n.z;
d_out[6] = 3*n.z*n.z - 1;
d_out[7] = n.z*n.x;
d_out[8] = n.x*n.x-n.y*n.y;
}
inline __device__ float evaluateLightingModel(float* d_lit, float4 n)
{
float tmp[9];
evaluateLightingModelTerms(tmp, n);
float sum = 0.0f;
for(unsigned int i = 0; i<9; i++) sum += tmp[i]*d_lit[i];
return sum;
}
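//The nine terms above are the second-order spherical-harmonics basis evaluated
//at the normal direction, so the shading model is s(n) = sum_i lit[i]*H_i(n);
//computeShadingValueDevice below scales this by a per-cluster albedo estimate
//and clamps the result to [0,1].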
__global__ void computeShadingValueDevice(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 Intrinsic, float* d_litcoeff, unsigned int width, unsigned int height)
{
const unsigned int posx = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int posy = blockIdx.y*blockDim.y + threadIdx.y;
if(posx >= width || posy >= height) return;
d_outShading[posy*width+posx] = 0;
if(posx > 0 && posx < width-1 && posy > 0 && posy < height-1)
{
float4 n = d_normals[posy*width+posx];
if(n.x != MINF)
{
n.x = -n.x; // Change handedness
n.z = -n.z;
float albedo = d_albedoEstimates[(unsigned int)(d_clusterIDs[posy*width+posx]+0.5f)];
float shadingval = albedo*evaluateLightingModel(d_litcoeff, n);
if(shadingval<0.0f) shadingval = 0.0f;
if(shadingval>1.0f) shadingval = 1.0f;
d_outShading[posy*width+posx] = shadingval;
}
}
}
extern "C" void computeShadingValue(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 &Intrinsic, float* d_lighting, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeShadingValueDevice<<<gridSize, blockSize>>>(d_outShading, d_indepth, d_normals, d_clusterIDs, d_albedoEstimates, Intrinsic, d_lighting, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Simple Segmentation
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeSimpleSegmentationDevice(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float inputDepth = d_input[y*width+x];
if(inputDepth != MINF && inputDepth < depthThres) d_output[y*width+x] = inputDepth;
else d_output[y*width+x] = MINF;
}
extern "C" void computeSimpleSegmentation(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeSimpleSegmentationDevice<<<gridSize, blockSize>>>(d_output, d_input, depthThres, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Edge Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeMaskEdgeMapFloat4Device(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = 1;
d_output[width*height+y*width+x] = 1;
const float thre = threshold *threshold *3.0f;
if(x > 0 && y > 0 && x < width-1 && y < height-1)
{
if(d_indepth[y*width+x] == MINF)
{
d_output[y*width+x] = 0;
d_output[y*width+x-1] = 0;
d_output[width*height+y*width+x] = 0;
d_output[width*height+(y-1)*width+x] = 0;
return;
}
const float4& p0 = d_input[(y+0)*width+(x+0)];
const float4& p1 = d_input[(y+0)*width+(x+1)];
const float4& p2 = d_input[(y+1)*width+(x+0)];
float dU = sqrt(((p1.x-p0.x)*(p1.x-p0.x) + (p1.y-p0.y) * (p1.y-p0.y) + (p1.z-p0.z)*(p1.z-p0.z))/3.0f);
float dV = sqrt(((p2.x-p0.x)*(p2.x-p0.x) + (p2.y-p0.y) * (p2.y-p0.y) + (p2.z-p0.z)*(p2.z-p0.z))/3.0f);
//float dgradx = abs(d_indepth[y*width+x-1] + d_indepth[y*width+x+1] - 2.0f * d_indepth[y*width+x]);
//float dgrady = abs(d_indepth[y*width+x-width] + d_indepth[y*width+x+width] - 2.0f * d_indepth[y*width+x]);
if(dU > thre ) d_output[y*width+x] = 0;
if(dV > thre ) d_output[width*height+y*width+x] = 0;
//remove depth discontinuities
const int r = 1;
const float thres = 0.01f;
const float pCC = d_indepth[y*width+x];
for(int i = -r; i<=r; i++)
{
for(int j = -r; j<=r; j++)
{
int currentX = x+j;
int currentY = y+i;
if(currentX >= 0 && currentX < width && currentY >= 0 && currentY < height)
{
float d = d_indepth[currentY*width+currentX];
if(d != MINF && abs(pCC-d) > thres)
{
d_output[y*width+x] = 0;
d_output[width*height+y*width+x] = 0;
return;
}
}
}
}
}
}
extern "C" void computeMaskEdgeMapFloat4(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeMaskEdgeMapFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, d_indepth, threshold,width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Clear Decision Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void clearDecissionArrayPatchDepthMaskDevice(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight) d_output[y*inputWidth+x] = 0;
}
extern "C" void clearDecissionArrayPatchDepthMask(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
clearDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Decision Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeDecissionArrayPatchDepthMaskDevice(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight)
{
const int patchId_x = x/patchSize;
const int patchId_y = y/patchSize;
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const float d = d_input[y*inputWidth+x];
if(d != MINF) atomicMax(&d_output[patchId_y*nPatchesWidth+patchId_x], 1);
}
}
extern "C" void computeDecissionArrayPatchDepthMask(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, patchSize, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Remapping Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeRemappingArrayPatchDepthMaskDevice(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight)
{
const int patchId_x = x/patchSize;
const int patchId_y = y/patchSize;
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const float d = d_input[y*inputWidth+x];
if(d != MINF) d_output[d_prefixSum[patchId_y*nPatchesWidth+patchId_x]-1] = patchId_y*nPatchesWidth+patchId_x;
}
}
extern "C" void computeRemappingArrayPatchDepthMask(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeRemappingArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, d_prefixSum, patchSize, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Debug Patch Remap Array
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void DebugPatchRemapArrayDevice(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
if(x < numElements)
{
int patchID = d_remapArray[x];
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const int patchId_x = patchID%nPatchesWidth;
const int patchId_y = patchID/nPatchesWidth;
for(unsigned int i = 0; i<patchSize; i++)
{
for(unsigned int j = 0; j<patchSize; j++)
{
const int pixel_x = patchId_x*patchSize;
const int pixel_y = patchId_y*patchSize;
d_mask[(pixel_y+i)*inputWidth+(pixel_x+j)] = 3.0f;
}
}
}
}
extern "C" void DebugPatchRemapArray(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((numElements + T_PER_BLOCK*T_PER_BLOCK - 1)/(T_PER_BLOCK*T_PER_BLOCK));
const dim3 blockSize(T_PER_BLOCK*T_PER_BLOCK);
DebugPatchRemapArrayDevice<<<gridSize, blockSize>>>(d_mask, d_remapArray, patchSize, numElements, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float bilinearInterpolationFloat(float x, float y, float* d_input, unsigned int imageWidth, unsigned int imageHeight)
{
const int2 p00 = make_int2((int)floor(x), (int)floor(y));
const int2 p01 = p00 + make_int2(0, 1);
const int2 p10 = p00 + make_int2(1, 0);
const int2 p11 = p00 + make_int2(1, 1);
const float alpha = x - p00.x;
const float beta = y - p00.y;
float s0 = 0.0f; float w0 = 0.0f;
if(p00.x < imageWidth && p00.y < imageHeight) { float v00 = d_input[p00.y*imageWidth + p00.x]; if(v00 != MINF) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } }
if(p10.x < imageWidth && p10.y < imageHeight) { float v10 = d_input[p10.y*imageWidth + p10.x]; if(v10 != MINF) { s0 += alpha *v10; w0 += alpha ; } }
float s1 = 0.0f; float w1 = 0.0f;
if(p01.x < imageWidth && p01.y < imageHeight) { float v01 = d_input[p01.y*imageWidth + p01.x]; if(v01 != MINF) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} }
if(p11.x < imageWidth && p11.y < imageHeight) { float v11 = d_input[p11.y*imageWidth + p11.x]; if(v11 != MINF) { s1 += alpha *v11; w1 += alpha ;} }
const float p0 = s0/w0;
const float p1 = s1/w1;
float ss = 0.0f; float ww = 0.0f;
if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); }
if(w1 > 0.0f) { ss += beta *p1; ww += beta ; }
if(ww > 0.0f) return ss/ww;
else return MINF;
}
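// Note: the lookup above drops invalid (MINF) samples and renormalises the
// remaining bilinear weights, returning sum(w_i * v_i) / sum(w_i) over the
// valid neighbours only, so a single missing pixel does not invalidate the
// whole interpolation.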
__global__ void resampleFloatMapDevice(float* d_colorMapResampledFloat, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < outputWidth && y < outputHeight)
{
const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1);
const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1);
const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f);
const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f);
if(xInput < inputWidth && yInput < inputHeight)
{
d_colorMapResampledFloat[y*outputWidth+x] = bilinearInterpolationFloat(x*scaleWidth, y*scaleHeight, d_colorMapFloat, inputWidth, inputHeight);
}
}
}
extern "C" void resampleFloatMap(float* d_colorMapResampledFloat, unsigned int outputWidth, unsigned int outputHeight, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
resampleFloatMapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat, d_colorMapFloat, inputWidth, inputHeight, outputWidth, outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
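// Example (host-side sketch, not from the original file; buffer names are
// illustrative only): halving a 640x480 depth map.
//   float *d_depthIn = nullptr, *d_depthOut = nullptr;
//   cudaMalloc(&d_depthIn,  640 * 480 * sizeof(float));
//   cudaMalloc(&d_depthOut, 320 * 240 * sizeof(float));
//   // ... upload depth into d_depthIn ...
//   resampleFloatMap(d_depthOut, 320, 240, d_depthIn, 640, 480);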
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float4 bilinearInterpolationFloat4(float x, float y, float4* d_input, unsigned int imageWidth, unsigned int imageHeight)
{
const int2 p00 = make_int2((int)floor(x), (int)floor(y));
const int2 p01 = p00 + make_int2(0, 1);
const int2 p10 = p00 + make_int2(1, 0);
const int2 p11 = p00 + make_int2(1, 1);
const float alpha = x - p00.x;
const float beta = y - p00.y;
//const float INVALID = 0.0f;
const float INVALID = MINF;
float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f;
if(p00.x < imageWidth && p00.y < imageHeight) { float4 v00 = d_input[p00.y*imageWidth + p00.x]; if(v00.x != INVALID && v00.y != INVALID && v00.z != INVALID) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } }
if(p10.x < imageWidth && p10.y < imageHeight) { float4 v10 = d_input[p10.y*imageWidth + p10.x]; if(v10.x != INVALID && v10.y != INVALID && v10.z != INVALID) { s0 += alpha *v10; w0 += alpha ; } }
float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f;
if(p01.x < imageWidth && p01.y < imageHeight) { float4 v01 = d_input[p01.y*imageWidth + p01.x]; if(v01.x != INVALID && v01.y != INVALID && v01.z != INVALID) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} }
if(p11.x < imageWidth && p11.y < imageHeight) { float4 v11 = d_input[p11.y*imageWidth + p11.x]; if(v11.x != INVALID && v11.y != INVALID && v11.z != INVALID) { s1 += alpha *v11; w1 += alpha ;} }
const float4 p0 = s0/w0;
const float4 p1 = s1/w1;
float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f;
if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); }
if(w1 > 0.0f) { ss += beta *p1; ww += beta ; }
if(ww > 0.0f) return ss/ww;
else return make_float4(MINF, MINF, MINF, MINF);
}
__global__ void resampleFloat4MapDevice(float4* d_colorMapResampledFloat4, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < outputWidth && y < outputHeight)
{
const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1);
const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1);
const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f);
const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f);
if(xInput < inputWidth && yInput < inputHeight)
{
d_colorMapResampledFloat4[y*outputWidth+x] = bilinearInterpolationFloat4(x*scaleWidth, y*scaleHeight, d_colorMapFloat4, inputWidth, inputHeight);
}
}
}
extern "C" void resampleFloat4Map(float4* d_colorMapResampledFloat4, unsigned int outputWidth, unsigned int outputHeight, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
resampleFloat4MapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat4, d_colorMapFloat4, inputWidth, inputHeight, outputWidth, outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Unsigned Char Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void downsampleUnsignedCharMapDevice(unsigned char* d_MapResampled, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight, unsigned int layerOffsetInput, unsigned int layerOffsetOutput)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= outputWidth || y >= outputHeight) return;
unsigned char res = 0;
const unsigned int inputX = 2*x;
const unsigned int inputY = 2*y;
if((inputY+0) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+0))];
if((inputY+0) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+1))];
if((inputY+1) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+0))];
if((inputY+1) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+1))];
if(res == 4) d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 1;
else d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 0;
}
extern "C" void downsampleUnsignedCharMap(unsigned char* d_MapResampled, unsigned int outputWidth, unsigned int outputHeight, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, 0, 0);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, inputWidth*inputHeight, outputWidth*outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
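// The wrapper above launches the kernel twice because the edge mask stores two
// layers (the dU and dV tests) back to back; the second launch offsets input
// and output by one full layer. A downsampled pixel stays valid only when all
// four source pixels are valid (res == 4), i.e. a logical AND over each 2x2
// block.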
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Edge Mask to Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertEdgeMaskToFloatDevice(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = min(d_input[y*width+x], d_input[width*height+y*width+x]);
}
extern "C" void convertEdgeMaskToFloat(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertEdgeMaskToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Dilate Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void dilateDepthMapDevice(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
float sum = 0.0f;
float count = 0.0f;
float oldDepth = d_inputOrig[y*width+x];
if(oldDepth != MINF && oldDepth != 0)
{
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
const float d = d_input[(y+i)*width+(x+j)];
if(d != MINF && d != 0.0f && fabs(d-oldDepth) < 0.05f)
{
sum += d;
count += 1.0f;
}
}
}
}
}
if(count > ((2*structureSize+1)*(2*structureSize+1))/36) d_output[y*width+x] = 1.0f;
else d_output[y*width+x] = MINF;
}
}
extern "C" void dilateDepthMapMask(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
dilateDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, d_inputOrig, structureSize, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mean Filter Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void removeDevMeanMapMaskDevice(float* d_output, float* d_input, int structureSize, int width, int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
// Copy moved inside the bounds check to avoid an out-of-bounds write
d_output[y*width+x] = d_input[y*width+x];
float oldDepth = d_input[y*width+x];
float mean = 0.0f;
float meanSquared = 0.0f;
float count = 0.0f;
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
float depth = d_input[(y+i)*width+(x+j)];
if(depth == MINF)
{
depth = 8.0f;
}
if(depth > 0.0f)
{
mean += depth;
meanSquared += depth*depth;
count += 1.0f;
}
}
}
}
mean/=count;
meanSquared/=count;
float stdDev = sqrt(meanSquared-mean*mean);
if(fabs(oldDepth-mean) > 0.5f*stdDev)// || stdDev> 0.005f)
{
d_output[y*width+x] = MINF;
}
}
}
extern "C" void removeDevMeanMapMask(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
removeDevMeanMapMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
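// The rejection test above uses the identity Var(d) = E[d^2] - (E[d])^2 over
// the (2*structureSize+1)^2 window, with invalid (MINF) samples clamped to 8m
// so that holes drag the local mean away from real surfaces; pixels further
// than 0.5 * stdDev from the window mean are invalidated.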
// Nearest neighbour
inline __device__ bool getValueNearestNeighbourNoCheck(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
{
const int u = (int)(p.x + 0.5f);
const int v = (int)(p.y + 0.5f);
if(u < 0 || u >= imageWidth || v < 0 || v >= imageHeight) return false;
*outValue = inputMap[v*imageWidth + u];
return true;
}
inline __device__ bool getValueNearestNeighbour(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
{
bool valid = getValueNearestNeighbourNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
return valid && (outValue->x != MINF && outValue->y != MINF && outValue->z != MINF);
}
// Nearest neighbour
inline __device__ bool getValueNearestNeighbourFloatNoCheck(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
{
const int u = (int)(p.x + 0.5f);
const int v = (int)(p.y + 0.5f);
if(u < 0 || u >= imageWidth || v < 0 || v >= imageHeight) return false;
*outValue = inputMap[v*imageWidth + u];
return true;
}
inline __device__ bool getValueNearestNeighbourFloat(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
{
bool valid = getValueNearestNeighbourFloatNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
return valid && (*outValue != MINF);
}
/////////////////////////////////////////////
// Compute derivatives in camera space
/////////////////////////////////////////////
__global__ void computeDerivativesCameraSpaceDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_positionsDU[index] = make_float4(MINF, MINF, MINF, MINF);
d_positionsDV[index] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1)
{
float4 pos00; bool valid00 = getValueNearestNeighbour(make_float2(x-1, y-1), d_positions, imageWidth, imageHeight, &pos00); if(!valid00) return;
float4 pos01; bool valid01 = getValueNearestNeighbour(make_float2(x-1, y-0), d_positions, imageWidth, imageHeight, &pos01); if(!valid01) return;
float4 pos02; bool valid02 = getValueNearestNeighbour(make_float2(x-1, y+1), d_positions, imageWidth, imageHeight, &pos02); if(!valid02) return;
float4 pos10; bool valid10 = getValueNearestNeighbour(make_float2(x-0, y-1), d_positions, imageWidth, imageHeight, &pos10); if(!valid10) return;
float4 pos11; bool valid11 = getValueNearestNeighbour(make_float2(x-0, y-0), d_positions, imageWidth, imageHeight, &pos11); if(!valid11) return;
float4 pos12; bool valid12 = getValueNearestNeighbour(make_float2(x-0, y+1), d_positions, imageWidth, imageHeight, &pos12); if(!valid12) return;
float4 pos20; bool valid20 = getValueNearestNeighbour(make_float2(x+1, y-1), d_positions, imageWidth, imageHeight, &pos20); if(!valid20) return;
float4 pos21; bool valid21 = getValueNearestNeighbour(make_float2(x+1, y-0), d_positions, imageWidth, imageHeight, &pos21); if(!valid21) return;
float4 pos22; bool valid22 = getValueNearestNeighbour(make_float2(x+1, y+1), d_positions, imageWidth, imageHeight, &pos22); if(!valid22) return;
float4 resU = (-1.0f)*pos00 + (1.0f)*pos20 +
(-2.0f)*pos01 + (2.0f)*pos21 +
(-1.0f)*pos02 + (1.0f)*pos22;
resU /= 8.0f;
float4 resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 +
( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22;
resV /= 8.0f;
//if(mat3x1(make_float3(resU)).norm1D() > 0.02f) return;
//if(mat3x1(make_float3(resV)).norm1D() > 0.02f) return;
d_positionsDU[index] = resU;
d_positionsDV[index] = resV;
}
}
}
extern "C" void computeDerivativesCameraSpace(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeDerivativesCameraSpaceDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_positionsDU, d_positionsDV);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
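// Both derivative kernels in this file apply the 3x3 Sobel stencils
//   d/du: [-1 0 1; -2 0 2; -1 0 1] / 8
//   d/dv: [-1 -2 -1; 0 0 0; 1 2 1] / 8
// where the 1/8 normalisation makes the response match a per-pixel central
// difference.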
/////////////////////////////////////////////
// Compute Intensity and Derivatives
/////////////////////////////////////////////
__global__ void computeIntensityAndDerivativesDevice(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_intensityAndDerivatives[index] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1)
{
float pos00; bool valid00 = getValueNearestNeighbourFloat(make_float2(x-1, y-1), d_intensity, imageWidth, imageHeight, &pos00); if(!valid00) return;
float pos01; bool valid01 = getValueNearestNeighbourFloat(make_float2(x-1, y-0), d_intensity, imageWidth, imageHeight, &pos01); if(!valid01) return;
float pos02; bool valid02 = getValueNearestNeighbourFloat(make_float2(x-1, y+1), d_intensity, imageWidth, imageHeight, &pos02); if(!valid02) return;
float pos10; bool valid10 = getValueNearestNeighbourFloat(make_float2(x-0, y-1), d_intensity, imageWidth, imageHeight, &pos10); if(!valid10) return;
float pos11; bool valid11 = getValueNearestNeighbourFloat(make_float2(x-0, y-0), d_intensity, imageWidth, imageHeight, &pos11); if(!valid11) return;
float pos12; bool valid12 = getValueNearestNeighbourFloat(make_float2(x-0, y+1), d_intensity, imageWidth, imageHeight, &pos12); if(!valid12) return;
float pos20; bool valid20 = getValueNearestNeighbourFloat(make_float2(x+1, y-1), d_intensity, imageWidth, imageHeight, &pos20); if(!valid20) return;
float pos21; bool valid21 = getValueNearestNeighbourFloat(make_float2(x+1, y-0), d_intensity, imageWidth, imageHeight, &pos21); if(!valid21) return;
float pos22; bool valid22 = getValueNearestNeighbourFloat(make_float2(x+1, y+1), d_intensity, imageWidth, imageHeight, &pos22); if(!valid22) return;
float resU = (-1.0f)*pos00 + (1.0f)*pos20 +
(-2.0f)*pos01 + (2.0f)*pos21 +
(-1.0f)*pos02 + (1.0f)*pos22;
resU /= 8.0f;
float resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 +
( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22;
resV /= 8.0f;
d_intensityAndDerivatives[index] = make_float4(pos11, resU, resV, 1.0f);
}
}
}
extern "C" void computeIntensityAndDerivatives(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeIntensityAndDerivativesDevice<<<gridSize, blockSize>>>(d_intensity, imageWidth, imageHeight, d_intensityAndDerivatives);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////
// Compute gradient intensity magnitude
/////////////////////////////////////////////
__global__ void computeGradientIntensityMagnitudeDevice(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= imageWidth || y >= imageHeight) return; // guard against out-of-bounds writes in partial blocks
const int index = y*imageWidth+x;
d_output[index] = make_float4(MINF, MINF, MINF, MINF);
float4 DU = d_inputDU[index];
float4 DV = d_inputDV[index];
if(DU.x != MINF && DV.x != MINF)
{
float m = sqrtf(DU.x*DU.x+DV.x*DV.x);
if(m > 0.005f)
{
d_output[index] = make_float4(m, m, m, 1.0f);
}
}
}
extern "C" void computeGradientIntensityMagnitude(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeGradientIntensityMagnitudeDevice<<<gridSize, blockSize>>>(d_inputDU, d_inputDV, imageWidth, imageHeight, d_output);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////
// Transform
/////////////////////////////////////////////
__global__ void transformCameraSpaceMapDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_output[index] = d_positions[index];
if(d_positions[index].x != MINF && d_positions[index].y != MINF && d_positions[index].z != MINF)
{
d_output[index] = d_positions[index]+make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
}
}
extern "C" void transformCameraSpaceMap(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
transformCameraSpaceMapDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_output);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
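// Note: as written, transformCameraSpaceMapDevice is an identity pass-through
// (it adds a zero offset to every valid position), presumably a stub where a
// real rigid transform could later be applied.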
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Erode Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void erodeDepthMapDevice(float* d_output, float* d_input, int structureSize, int width, int height, float dThresh, float fracReq)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
unsigned int count = 0;
float oldDepth = d_input[y*width+x];
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
float depth = d_input[(y+i)*width+(x+j)];
if(depth == MINF || depth == 0.0f || fabs(depth-oldDepth) > dThresh)
{
count++;
//d_output[y*width+x] = MINF;
//return;
}
}
}
}
unsigned int sum = (2*structureSize+1)*(2*structureSize+1);
if ((float)count/(float)sum >= fracReq) {
d_output[y*width+x] = MINF;
} else {
d_output[y*width+x] = d_input[y*width+x];
}
}
}
extern "C" void erodeDepthMap(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height, float dThresh, float fracReq)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
erodeDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height, dThresh, fracReq);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
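// Example (host-side sketch, parameter values are illustrative only): mark a
// pixel invalid when at least 20% of its 5x5 neighbourhood (structureSize = 2)
// is missing or jumps by more than 3cm:
//   erodeDepthMap(d_depthOut, d_depthIn, 2, width, height, 0.03f, 0.2f);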
////////////////////////////////////
// Depth to HSV map conversion /////
////////////////////////////////////
__device__ float3 convertHSVToRGB(const float3& hsv) {
float H = hsv.x;
float S = hsv.y;
float V = hsv.z;
float hd = H/60.0f;
unsigned int h = (unsigned int)hd;
float f = hd-h;
float p = V*(1.0f-S);
float q = V*(1.0f-S*f);
float t = V*(1.0f-S*(1.0f-f));
if(h == 0 || h == 6)
{
return make_float3(V, t, p);
}
else if(h == 1)
{
return make_float3(q, V, p);
}
else if(h == 2)
{
return make_float3(p, V, t);
}
else if(h == 3)
{
return make_float3(p, q, V);
}
else if(h == 4)
{
return make_float3(t, p, V);
}
else
{
return make_float3(V, p, q);
}
}
__device__ float3 convertDepthToRGB(float depth, float depthMin, float depthMax) {
float depthZeroOne = (depth - depthMin)/(depthMax - depthMin);
float x = 1.0f-depthZeroOne;
if (x < 0.0f) x = 0.0f;
if (x > 1.0f) x = 1.0f;
//return convertHSVToRGB(make_float3(240.0f*x, 1.0f, 0.5f));
x = 360.0f*x - 120.0f;
if (x < 0.0f) x += 359.0f;
return convertHSVToRGB(make_float3(x, 1.0f, 0.5f));
}
__global__ void depthToHSVDevice(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= 0 && x < width && y >= 0 && y < height) {
float depth = d_input[y*width + x];
if (depth != MINF && depth != 0.0f && depth >= minDepth && depth <= maxDepth) {
float3 c = convertDepthToRGB(depth, minDepth, maxDepth);
d_output[y*width + x] = make_float4(c, 1.0f);
} else {
d_output[y*width + x] = make_float4(0.0f);
}
}
}
extern "C" void depthToHSV(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth) {
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
depthToHSVDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height, minDepth, maxDepth);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
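// Worked example: with minDepth = 1m and maxDepth = 3m, a point at 2m gives
// depthZeroOne = 0.5, hence x = 0.5 and a hue of 360*0.5 - 120 = 60 degrees
// (yellow); the nearest depths map to blue (240 degrees) and the hue sweeps
// through green and red, wrapping around towards the far end of the range.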
#endif // _CAMERA_UTIL_
\ No newline at end of file
@@ -7,6 +7,9 @@
#include <glog/logging.h>
#include <ftl/config.h>
#include <ftl/configuration.hpp>
+#include <ftl/depth_camera.hpp>
+#include <ftl/scene_rep_hash_sdf.hpp>
+#include <ftl/ray_cast_sdf.hpp>
#include <ftl/rgbd.hpp>
// #include <zlib.h>
@@ -67,6 +70,7 @@ using cv::Mat;
namespace ftl {
namespace rgbd {
PointCloud<PointXYZRGB>::Ptr createPointCloud(RGBDSource *src);
+PointCloud<PointXYZRGB>::Ptr createPointCloud(RGBDSource *src, const cv::Point3_<uchar> &col);
}
}
@@ -87,14 +91,14 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src) {
for(int i=0;i<rgb.rows;i++) {
const float *sptr = depth.ptr<float>(i);
for(int j=0;j<rgb.cols;j++) {
-float d = sptr[j] * 1000.0f;
+float d = sptr[j]; // * 1000.0f;
pcl::PointXYZRGB point;
point.x = (((double)j + CX) / FX) * d;
point.y = (((double)i + CY) / FY) * d;
point.z = d;
-if (point.x == INFINITY || point.y == INFINITY || point.z > 20000.0f || point.z < 0.04f) {
+if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) {
point.x = 0.0f; point.y = 0.0f; point.z = 0.0f;
}
@@ -109,6 +113,44 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src) {
return point_cloud_ptr;
}
+PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src, const cv::Point3_<uchar> &prgb) {
+const double CX = src->getParameters().cx;
+const double CY = src->getParameters().cy;
+const double FX = src->getParameters().fx;
+const double FY = src->getParameters().fy;
+cv::Mat rgb;
+cv::Mat depth;
+src->getRGBD(rgb, depth);
+pcl::PointCloud<pcl::PointXYZRGB>::Ptr point_cloud_ptr(new pcl::PointCloud<pcl::PointXYZRGB>);
+point_cloud_ptr->width = rgb.cols * rgb.rows;
+point_cloud_ptr->height = 1;
+for(int i=0;i<rgb.rows;i++) {
+const float *sptr = depth.ptr<float>(i);
+for(int j=0;j<rgb.cols;j++) {
+float d = sptr[j]; // * 1000.0f;
+pcl::PointXYZRGB point;
+point.x = (((double)j + CX) / FX) * d;
+point.y = (((double)i + CY) / FY) * d;
+point.z = d;
+if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) {
+point.x = 0.0f; point.y = 0.0f; point.z = 0.0f;
+}
+uint32_t rgb = (static_cast<uint32_t>(prgb.z) << 16 | static_cast<uint32_t>(prgb.y) << 8 | static_cast<uint32_t>(prgb.x));
+point.rgb = *reinterpret_cast<float*>(&rgb);
+point_cloud_ptr -> points.push_back(point);
+}
+}
+return point_cloud_ptr;
+}
#ifdef HAVE_PCL
#include <pcl/filters/uniform_sampling.h>
@@ -181,8 +223,14 @@ void saveRegistration(std::map<string, Eigen::Matrix4f> registration) {
file << save;
}
+struct Cameras {
+RGBDSource *source;
+DepthCameraData gpu;
+DepthCameraParams params;
+};
template <template<class> class Container>
-std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<RGBDSource*> &inputs) {
+std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<Cameras> &inputs) {
std::map<string, Eigen::Matrix4f> registration;
@@ -205,7 +253,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
size_t ref_i;
bool found = false;
for (size_t i = 0; i < inputs.size(); i++) {
-if (inputs[i]->getConfig()["uri"] == ref_uri) {
+if (inputs[i].source->getConfig()["uri"] == ref_uri) {
ref_i = i;
found = true;
break;
@@ -215,7 +263,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
if (!found) { LOG(ERROR) << "Reference input not found!"; return registration; }
for (auto &input : inputs) {
-cv::namedWindow("Registration: " + (string)input->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL);
+cv::namedWindow("Registration: " + (string)input.source->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL);
}
// vector for every input: vector of point clouds of patterns
@@ -227,7 +275,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
vector<PointCloud<PointXYZ>::Ptr> patterns_iter(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
-RGBDSource *input = inputs[i];
+RGBDSource *input = inputs[i].source;
Mat rgb, depth;
input->getRGBD(rgb, depth);
@@ -250,7 +298,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
else { LOG(WARNING) << "Pattern not detected on all inputs";}
}
-for (auto &input : inputs) { cv::destroyWindow("Registration: " + (string)input->getConfig()["uri"]); }
+for (auto &input : inputs) { cv::destroyWindow("Registration: " + (string)input.source->getConfig()["uri"]); }
for (size_t i = 0; i < inputs.size(); i++) {
Eigen::Matrix4f T;
@@ -260,13 +308,48 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
else {
T = ftl::registration::findTransformation(patterns[i], patterns[ref_i]);
}
-registration[(string)inputs[i]->getConfig()["uri"]] = T;
+registration[(string)inputs[i].source->getConfig()["uri"]] = T;
}
saveRegistration(registration);
return registration;
}
#endif
+template<class T>
+Eigen::Matrix<T,4,4> lookAt
+(
+Eigen::Matrix<T,3,1> const & eye,
+Eigen::Matrix<T,3,1> const & center,
+Eigen::Matrix<T,3,1> const & up
+)
+{
+typedef Eigen::Matrix<T,4,4> Matrix4;
+typedef Eigen::Matrix<T,3,1> Vector3;
+Vector3 f = (center - eye).normalized();
+Vector3 u = up.normalized();
+Vector3 s = f.cross(u).normalized();
+u = s.cross(f);
+Matrix4 res;
+res << s.x(),s.y(),s.z(),-s.dot(eye),
+u.x(),u.y(),u.z(),-u.dot(eye),
+-f.x(),-f.y(),-f.z(),f.dot(eye),
+0,0,0,1;
+return res;
+}
+using MouseAction = std::function<void(int, int, int, int)>;
+static void setMouseAction(const std::string& winName, const MouseAction &action)
+{
+cv::setMouseCallback(winName,
+[] (int event, int x, int y, int flags, void* userdata) {
+(*(MouseAction*)userdata)(event, x, y, flags);
+}, (void*)&action);
+}
static void run() {
Universe net(config["net"]);
@@ -278,8 +361,8 @@ static void run() {
return;
}
-std::deque<RGBDSource*> inputs;
-std::vector<Display> displays;
+std::deque<Cameras> inputs;
+//std::vector<Display> displays;
// TODO Allow for non-net source types
for (auto &src : config["sources"]) {
@@ -287,9 +370,20 @@ static void run() {
if (!in) {
LOG(ERROR) << "Unrecognised source: " << src;
} else {
-inputs.push_back(in);
-displays.emplace_back(config["display"], src["uri"]);
-LOG(INFO) << (string)src["uri"] << " loaded";
+auto &cam = inputs.emplace_back();
+cam.source = in;
+cam.params.fx = in->getParameters().fx;
+cam.params.fy = in->getParameters().fy;
+cam.params.mx = -in->getParameters().cx;
+cam.params.my = -in->getParameters().cy;
+cam.params.m_imageWidth = in->getParameters().width;
+cam.params.m_imageHeight = in->getParameters().height;
+cam.params.m_sensorDepthWorldMax = in->getParameters().maxDepth;
+cam.params.m_sensorDepthWorldMin = in->getParameters().minDepth;
+cam.gpu.alloc(cam.params);
+//displays.emplace_back(config["display"], src["uri"]);
+LOG(INFO) << (string)src["uri"] << " loaded " << cam.params.m_imageWidth;
}
}
@@ -297,63 +391,126 @@
// load point cloud transformations
-#ifdef HAVE_PCL
std::map<string, Eigen::Matrix4f> registration;
if (config["registration"]["calibration"]["run"]) {
registration = runRegistration(net, inputs);
}
else {
+LOG(INFO) << "LOAD REG";
registration = loadRegistration();
}
+LOG(INFO) << "Assigning poses";
vector<Eigen::Matrix4f> T;
for (auto &input : inputs) {
-Eigen::Matrix4f RT = (registration.count((string)input->getConfig()["uri"]) > 0) ? registration[(string)input->getConfig()["uri"]] : registration["default"];
+LOG(INFO) << (unsigned long long)input.source;
+Eigen::Matrix4f RT = (registration.count(input.source->getConfig()["uri"].get<string>()) > 0) ? registration[(string)input.source->getConfig()["uri"]] : registration["default"];
T.push_back(RT);
+input.source->setPose(RT);
}
-//
-vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size());
+//vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size());
Display display_merged(config["display"], "Merged"); // todo
+CUDARayCastSDF rays(config["voxelhash"]);
+ftl::voxhash::SceneRep scene(config["voxelhash"]);
+cv::Mat colour_array(cv::Size(rays.getRayCastParams().m_width,rays.getRayCastParams().m_height), CV_8UC3);
+// Force window creation
+display_merged.render(colour_array);
+display_merged.wait(1);
+unsigned char frameCount = 0;
+bool paused = false;
+float cam_x = 0.0f;
+float cam_z = 0.0f;
+Eigen::Vector3f eye(0.0f, 0.0f, 0.0f);
+Eigen::Vector3f centre(0.0f, 0.0f, -4.0f);
+Eigen::Vector3f up(0,1.0f,0);
+Eigen::Vector3f lookPoint(0.0f,1.0f,-4.0f);
+Eigen::Matrix4f viewPose;
+float lerpSpeed = 0.4f;
+// Keyboard camera controls
+display_merged.onKey([&paused,&eye,&centre](int key) {
+LOG(INFO) << "Key = " << key;
+if (key == 32) paused = !paused;
+else if (key == 81) eye[0] += 0.02f;
+else if (key == 83) eye[0] -= 0.02f;
+else if (key == 84) eye[2] += 0.02f;
+else if (key == 82) eye[2] -= 0.02f;
+});
+// TODO(Nick) Calculate "camera" properties of viewport.
+MouseAction mouseact = [&inputs,&lookPoint,&viewPose]( int event, int ux, int uy, int) {
+LOG(INFO) << "Mouse " << ux << "," << uy;
+if (event == 1) { // click
+const float x = ((float)ux-inputs[0].params.mx) / inputs[0].params.fx;
+const float y = ((float)uy-inputs[0].params.my) / inputs[0].params.fy;
+const float depth = -4.0f;
+Eigen::Vector4f camPos(x*depth,y*depth,depth,1.0);
+Eigen::Vector4f worldPos = viewPose * camPos;
+lookPoint = Eigen::Vector3f(worldPos[0],worldPos[1],worldPos[2]);
+}
+};
+::setMouseAction("Image", mouseact);
-int active = displays.size();
+int active = inputs.size();
while (active > 0 && display_merged.active()) {
active = 0;
+if (!paused) {
net.broadcast("grab"); // To sync cameras
+scene.nextFrame();
-PointCloud<PointXYZRGB>::Ptr cloud(new PointCloud<PointXYZRGB>);
for (size_t i = 0; i < inputs.size(); i++) {
+//if (i == 1) continue;
//Display &display = displays[i];
-RGBDSource *input = inputs[i];
+RGBDSource *input = inputs[i].source;
Mat rgb, depth;
+//LOG(INFO) << "GetRGB";
input->getRGBD(rgb,depth);
//if (!display.active()) continue;
active += 1;
-clouds[i] = ftl::rgbd::createPointCloud(input);
+//clouds[i] = ftl::rgbd::createPointCloud(input);
//display.render(rgb, depth,input->getParameters());
//display.render(clouds[i]);
//display.wait(5);
-}
-for (size_t i = 0; i < clouds.size(); i++) {
-pcl::transformPointCloud(*clouds[i], *clouds[i], T[i]);
-*cloud += *clouds[i];
+//LOG(INFO) << "Data size: " << depth.cols << "," << depth.rows;
+if (depth.cols == 0) continue;
+Mat rgba;
+cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA);
+inputs[i].params.flags = frameCount;
+// Send to GPU and merge view into scene
+inputs[i].gpu.updateParams(inputs[i].params);
+inputs[i].gpu.updateData(depth, rgba);
+scene.integrate(inputs[i].source->getPose(), inputs[i].gpu, inputs[i].params, nullptr);
+}
+} else {
+active = 1;
}
-pcl::UniformSampling<PointXYZRGB> uniform_sampling;
-uniform_sampling.setInputCloud(cloud);
-uniform_sampling.setRadiusSearch(0.1f);
-uniform_sampling.filter(*cloud);
+frameCount++;
+// Set virtual camera transformation matrix
+//Eigen::Affine3f transform(Eigen::Translation3f(cam_x,0.0f,cam_z));
+centre += (lookPoint - centre) * (lerpSpeed * 0.1f);
+viewPose = lookAt<float>(eye,centre,up); // transform.matrix();
-display_merged.render(cloud);
-display_merged.wait(50);
+// Call GPU renderer and download result into OpenCV mat
+rays.render(scene.getHashData(), scene.getHashParams(), inputs[0].gpu, viewPose);
+rays.getRayCastData().download(nullptr, (uchar3*)colour_array.data, rays.getRayCastParams());
+display_merged.render(colour_array);
+display_merged.wait(1);
}
-#endif
-// TODO non-PCL (?)
}
int main(int argc, char **argv) {
...
//#include <stdafx.h>
#include <ftl/voxel_hash.hpp>
//#include "Util.h"
#include <ftl/ray_cast_sdf.hpp>
extern "C" void renderCS(
const ftl::voxhash::HashData& hashData,
const RayCastData &rayCastData,
const DepthCameraData &cameraData,
const RayCastParams &rayCastParams);
extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height);
extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData);
extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params);
extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData,
const RayCastData &rayCastData, const RayCastParams &rayCastParams);
extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &params);
void CUDARayCastSDF::create(const RayCastParams& params)
{
m_params = params;
m_data.allocate(m_params);
//m_rayIntervalSplatting.OnD3D11CreateDevice(DXUTGetD3D11Device(), params.m_width, params.m_height);
}
void CUDARayCastSDF::destroy(void)
{
m_data.free();
//m_rayIntervalSplatting.OnD3D11DestroyDevice();
}
void CUDARayCastSDF::render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform)
{
rayIntervalSplatting(hashData, hashParams, cameraData, lastRigidTransform);
//m_data.d_rayIntervalSplatMinArray = m_rayIntervalSplatting.mapMinToCuda();
//m_data.d_rayIntervalSplatMaxArray = m_rayIntervalSplatting.mapMaxToCuda();
if (hash_render_) nickRenderCUDA(hashData, hashParams, m_data, cameraData, m_params);
else renderCS(hashData, m_data, cameraData, m_params);
//convertToCameraSpace(cameraData);
if (!m_params.m_useGradients)
{
computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height);
}
//m_rayIntervalSplatting.unmapCuda();
}
void CUDARayCastSDF::convertToCameraSpace(const DepthCameraData& cameraData)
{
convertDepthFloatToCameraSpaceFloat3(m_data.d_depth3, m_data.d_depth, m_params.m_intrinsicsInverse, m_params.m_width, m_params.m_height, cameraData);
if(!m_params.m_useGradients) {
computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height);
}
}
void CUDARayCastSDF::rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform)
{
if (hashParams.m_numOccupiedBlocks == 0) return;
if (m_params.m_maxNumVertices <= 6*hashParams.m_numOccupiedBlocks) { // 6 verts (2 triangles) per block
throw std::runtime_error("not enough space for vertex buffer for ray interval splatting");
}
m_params.m_numOccupiedSDFBlocks = hashParams.m_numOccupiedBlocks;
m_params.m_viewMatrix = MatrixConversion::toCUDA(lastRigidTransform.inverse());
m_params.m_viewMatrixInverse = MatrixConversion::toCUDA(lastRigidTransform);
m_data.updateParams(m_params); // !!! debugging
//don't use ray interval splatting (cf. CUDARayCastSDF.cu -> line 40)
//m_rayIntervalSplatting.rayIntervalSplatting(DXUTGetD3D11DeviceContext(), hashData, cameraData, m_data, m_params, m_params.m_numOccupiedSDFBlocks*6);
}
\ No newline at end of file
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/ray_cast_util.hpp>
#define T_PER_BLOCK 8
#define NUM_GROUPS_X 1024
//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMinTextureRef;
//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMaxTextureRef;
__global__ void renderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
const RayCastParams& rayCastParams = c_rayCastParams;
if (x < rayCastParams.m_width && y < rayCastParams.m_height) {
rayCastData.d_depth[y*rayCastParams.m_width+x] = MINF;
rayCastData.d_depth3[y*rayCastParams.m_width+x] = make_float3(MINF,MINF,MINF);
rayCastData.d_normals[y*rayCastParams.m_width+x] = make_float4(MINF,MINF,MINF,MINF);
rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0);
float3 camDir = normalize(cameraData.kinectProjToCamera(x, y, 1.0f));
float3 worldCamPos = rayCastParams.m_viewMatrixInverse * make_float3(0.0f, 0.0f, 0.0f);
float4 w = rayCastParams.m_viewMatrixInverse * make_float4(camDir, 0.0f);
float3 worldDir = normalize(make_float3(w.x, w.y, w.z));
////use ray interval splatting
//float minInterval = tex2D(rayMinTextureRef, x, y);
//float maxInterval = tex2D(rayMaxTextureRef, x, y);
//don't use ray interval splatting
float minInterval = rayCastParams.m_minDepth;
float maxInterval = rayCastParams.m_maxDepth;
//if (minInterval == 0 || minInterval == MINF) minInterval = rayCastParams.m_minDepth;
//if (maxInterval == 0 || maxInterval == MINF) maxInterval = rayCastParams.m_maxDepth;
//TODO MATTHIAS: shouldn't this return in the case no interval is found?
if (minInterval == 0 || minInterval == MINF) return;
if (maxInterval == 0 || maxInterval == MINF) return;
// debugging
//if (maxInterval < minInterval) {
// printf("ERROR (%d,%d): [ %f, %f ]\n", x, y, minInterval, maxInterval);
//}
rayCastData.traverseCoarseGridSimpleSampleAll(hashData, cameraData, worldCamPos, worldDir, camDir, make_int3(x,y,1), minInterval, maxInterval);
}
}
extern "C" void renderCS(const ftl::voxhash::HashData& hashData, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &rayCastParams)
{
const dim3 gridSize((rayCastParams.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (rayCastParams.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
//cudaBindTextureToArray(rayMinTextureRef, rayCastData.d_rayIntervalSplatMinArray, channelDesc);
//cudaBindTextureToArray(rayMaxTextureRef, rayCastData.d_rayIntervalSplatMaxArray, channelDesc);
//printf("Ray casting render...\n");
renderKernel<<<gridSize, blockSize>>>(hashData, rayCastData, cameraData);
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
//cutilCheckMsg(__FUNCTION__);
#endif
}
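// Launch-configuration note: (width + T_PER_BLOCK - 1) / T_PER_BLOCK is the
// usual ceiling division, e.g. a 640x480 target with T_PER_BLOCK = 8 gives an
// 80x60 grid of 8x8 blocks; the in-kernel bounds test covers any partial
// blocks when the resolution is not a multiple of 8.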
////////////////////////////////////////////////////////////////////////////////
// Nicks render approach
////////////////////////////////////////////////////////////////////////////////
__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
const RayCastParams& rayCastParams = c_rayCastParams;
if (x < rayCastParams.m_width && y < rayCastParams.m_height) {
rayCastData.d_depth_i[y*rayCastParams.m_width+x] = 0x7FFFFFFF; //PINF;
rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0);
}
}
#define SDF_BLOCK_SIZE_PAD 9
#define SDF_BLOCK_BUFFER 1024 // > 9x9x9
#define SDF_DX 1
#define SDF_DY SDF_BLOCK_SIZE_PAD
#define SDF_DZ (SDF_BLOCK_SIZE_PAD*SDF_BLOCK_SIZE_PAD)
__device__
float frac(float val) {
return (val - floorf(val));
}
__device__
float3 frac(const float3& val) {
return make_float3(frac(val.x), frac(val.y), frac(val.z));
}
__host__ size_t nickSharedMem() {
return sizeof(float)*SDF_BLOCK_BUFFER +
sizeof(uchar)*SDF_BLOCK_BUFFER +
sizeof(float)*SDF_BLOCK_BUFFER +
sizeof(float3)*SDF_BLOCK_BUFFER;
}
/*__device__ void loadVoxel(const ftl::voxhash::HashData &hash, const int3 &vox, float *sdf, uint *weight, float3 *colour) {
ftl::voxhash::Voxel &v = hashData.getVoxel(vox);
*sdf = v.sdf;
*weight = v.weight;
*colour = v.color;
}*/
//! computes the (local) virtual voxel pos of an index within a padded block; idx in [0;728]
__device__
int3 pdelinVoxelIndex(uint idx) {
int x = idx % SDF_BLOCK_SIZE_PAD;
int y = (idx % (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD)) / SDF_BLOCK_SIZE_PAD;
int z = idx / (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD);
return make_int3(x,y,z);
}
//! computes the linearized index of a local virtual voxel pos within the padded block; pos in [0;8]^3
__device__
uint plinVoxelPos(const int3& virtualVoxelPos) {
return
virtualVoxelPos.z * SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.y * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.x;
}
__device__
void deleteVoxel(ftl::voxhash::Voxel& v) {
v.color = make_uchar3(0,0,0);
v.weight = 0;
v.sdf = PINF;
}
__device__ inline int3 blockDelinear(const int3 &base, uint i) {
// Extract one bit per axis so each offset is 0 or 1; the raw masks
// (i & 0x2) and (i & 0x4) would have offset by 2 and 4 blocks respectively.
return make_int3(base.x + (i & 0x1), base.y + ((i & 0x2) >> 1), base.z + ((i & 0x4) >> 2));
}
__device__ inline uint blockLinear(int x, int y, int z) {
return x + (y << 1) + (z << 2);
}
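// blockLinear/blockDelinear map between an index i in [0;7] and one of the
// eight SDF blocks in the +x/+y/+z corner neighbourhood, one bit per axis
// (bit 0 = x, bit 1 = y, bit 2 = z); blockDelinear is the inverse of
// blockLinear.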
__device__ inline void trilinearInterp(const ftl::voxhash::HashData &hashData, const ftl::voxhash::Voxel *voxels, const uint *ix, const float3 &pos, float &depth, uchar3 &colour) {
float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(pos)); // Should be world position of ray, not voxel??
float dist = 0.0f;
dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
depth = dist;
colour = make_uchar3(colorFloat);
}
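// trilinearInterp blends the eight voxels of the interpolation cell with the
// usual trilinear weights w = frac(voxel-space position), i.e. each corner
// (dx,dy,dz) in {0,1}^3 contributes
//   (dx ? w.x : 1-w.x) * (dy ? w.y : 1-w.y) * (dz ? w.z : 1-w.z)
// times its SDF value and colour.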
__global__ void nickRenderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData, RayCastParams params) {
// TODO(Nick) Reduce bank conflicts by aligning these
__shared__ ftl::voxhash::Voxel voxels[SDF_BLOCK_BUFFER];
__shared__ ftl::voxhash::HashEntry blocks[8];
const uint i = threadIdx.x; //inside of an SDF block
//TODO (Nick) Either don't use compactified or re-run compacitification using render cam frustrum
if (i == 0) blocks[0] = hashData.d_hashCompactified[blockIdx.x];
// blocks[0].pos must be visible to every thread before the neighbour lookups
__syncthreads();
if (i >= 1 && i <= 7) blocks[i] = hashData.getHashEntryForSDFBlockPos(blockDelinear(blocks[0].pos, i));
// Make sure all hash entries are cached
__syncthreads();
const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(blocks[0].pos);
const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
const int3 pi = pi_base + vp;
const uint j = plinVoxelPos(vp); // Padded linear index
const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
// Load distances and colours into shared memory + padding
const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[blocks[0].ptr + i];
voxels[j] = v;
// TODO (Nick) Coalesce memory better?
uint imod = i & 0x7;
bool v_do = imod < 3;
if (v_do) {
const uint v_a = (i >> 3) & 0x7;
const uint v_c = (i >> 6);
const uint v_b = (imod >= 1) ? v_c : v_a;
const int3 v_cache = make_int3(((imod == 0) ? 8 : v_a), ((imod == 1) ? 8 : v_b), ((imod == 2) ? 8 : v_c));
const int3 v_ii = make_int3((imod == 0) ? 0 : v_a, (imod == 1) ? 0 : v_b, (imod == 2) ? 0 : v_c);
const int v_block = blockLinear((imod == 0) ? 1 : 0, (imod == 1) ? 1 : 0, (imod == 2) ? 1 : 0);
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(v_cache)];
const uint ii = hashData.linearizeVoxelPos(v_ii);
if (blocks[v_block].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[v_block].ptr + ii];
else deleteVoxel(padVox);
}
/*if (vp.x == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x+1,vp.y,vp.z))];
const uint ii = hashData.linearizeVoxelPos(make_int3(0,vp.y,vp.z));
//padVox = hashData.getVoxel(make_int3(pi.x+1,pi.y,pi.z));
if (blocks[blockLinear(1,0,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(1,0,0)].ptr + ii];
else deleteVoxel(padVox);
}
if (vp.y == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y+1,vp.z))];
const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,0,vp.z));
//padVox = hashData.getVoxel(make_int3(pi.x,pi.y+1,pi.z));
if (blocks[blockLinear(0,1,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,1,0)].ptr + ii];
else deleteVoxel(padVox);
}
if (vp.z == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y,vp.z+1))];
const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,vp.y,0));
//padVox = hashData.getVoxel(make_int3(pi.x,pi.y,pi.z+1));
if (blocks[blockLinear(0,0,1)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,0,1)].ptr + ii];
else deleteVoxel(padVox);
}*/
// Indexes of the 8 neighbor voxels in one direction
const uint ix[8] = {
j, j+SDF_DX, j+SDF_DY, j+SDF_DZ, j+SDF_DX+SDF_DY, j+SDF_DY+SDF_DZ,
j+SDF_DX+SDF_DZ, j+SDF_DX+SDF_DY+SDF_DZ
};
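// ix[] lists the 8 corners of the interpolation cell in the order
// (0,0,0),(1,0,0),(0,1,0),(0,0,1),(1,1,0),(0,1,1),(1,0,1),(1,1,1),
// matching the weight terms in trilinearInterp() above.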
__syncthreads();
// If any weight is 0, skip this voxel
const bool missweight = voxels[ix[0]].weight == 0 || voxels[ix[1]].weight == 0 || voxels[ix[2]].weight == 0 ||
voxels[ix[3]].weight == 0 || voxels[ix[4]].weight == 0 || voxels[ix[5]].weight == 0 ||
voxels[ix[6]].weight == 0 || voxels[ix[7]].weight == 0;
if (missweight) return;
// Trilinear Interpolation (simple and fast)
/*float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(worldPos)); // Should be world position of ray, not voxel??
float dist = 0.0f;
dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
// Must finish using colours before updating colours
__syncthreads();
//voxels[j].color = make_uchar3(colorFloat);
//voxels[j].sdf = dist;
// What happens if filtered voxel is put back?
//hashData.d_SDFBlocks[blocks[0].ptr + i] = voxels[j];
//return;*/
bool is_surface = false;
// Identify surface voxels by a sign change of the SDF. Since we only check
// in the positive direction, any sign change among the cell corners marks a crossing.
#pragma unroll
for (int u=0; u<=1; u++) {
for (int v=0; v<=1; v++) {
for (int w=0; w<=1; w++) {
const int3 uvi = make_int3(vp.x+u,vp.y+v,vp.z+w);
// Skip the edge/corner cases: diagonal padding voxels were never loaded above
if ((uvi.x == 8 && uvi.y == 8) || (uvi.x == 8 && uvi.z == 8) || (uvi.y == 8 && uvi.z == 8)) continue;
if (signbit(voxels[j].sdf) != signbit(voxels[plinVoxelPos(uvi)].sdf)) {
is_surface = true;
break;
}
}
}
}
// Only for surface voxels, work out screen coordinates
// TODO Could adjust weights, strengthen on surface, weaken otherwise??
if (!is_surface) return;
const float3 camPos = params.m_viewMatrix * worldPos;
const float2 screenPosf = cameraData.cameraToKinectScreenFloat(camPos);
const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f)
/*if (screenPos.x < params.m_width && screenPos.y < params.m_height &&
rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] > camPos.z) {
rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] = camPos.z;
rayCastData.d_colors[(screenPos.y)*params.m_width+screenPos.x] = voxels[j].color;
}*/
//return;
// For this voxel in hash, get its screen position and check it is on screen
// Convert depth map to int by x1000 and use atomicMin
//const int pixsize = static_cast<int>(c_hashParams.m_virtualVoxelSize*c_depthCameraParams.fx/camPos.z)+1;
int pixsizeX = 10; // Max voxel pixels
int pixsizeY = 10;
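// Splat this surface voxel over at most a 10x10 pixel window; pixsizeX shrinks
// as soon as a pixel's back-projection falls outside the current voxel.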
for (int y=0; y<pixsizeY; y++) {
for (int x=0; x<pixsizeX; x++) {
// TODO(Nick) Within a window, check pixels that have same voxel id
// Then trilinear interpolate between current voxel and neighbors.
const float3 pixelWorldPos = params.m_viewMatrixInverse * cameraData.kinectDepthToSkeleton(screenPos.x+x,screenPos.y+y, camPos.z);
const float3 posInVoxel = (pixelWorldPos - worldPos) / make_float3(c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize);
if (posInVoxel.x >= 1.0f || posInVoxel.y >= 1.0f || posInVoxel.z >= 1.0f) {
pixsizeX = x;
continue;
}
/*float depth;
uchar3 col;
trilinearInterp(hashData, voxels, ix, pixelWorldPos, depth, col);*/
int idepth = static_cast<int>(camPos.z * 100.0f);
// Note (Nick): made atomic below via atomicMin on the integer depth buffer.
/*if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height &&
rayCastData.d_depth_i[(screenPos.y+y)*params.m_width+screenPos.x+x] > idepth) {
rayCastData.d_depth[(screenPos.y+y)*params.m_width+screenPos.x+x] = idepth;
rayCastData.d_colors[(screenPos.y+y)*params.m_width+screenPos.x+x] = voxels[j].color;
}*/
if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height) {
int index = (screenPos.y+y)*params.m_width+screenPos.x+x;
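// atomicMin returns the previous value, so comparing it with idepth tells us
// whether this thread actually lowered the stored depth.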
if (rayCastData.d_depth_i[index] > idepth && atomicMin(&rayCastData.d_depth_i[index], idepth) > idepth) {
//rayCastData.d_depth[index] = idepth;
rayCastData.d_colors[index] = voxels[j].color;
}
}
}
if (pixsizeX == 0) break;
}
}
extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &params)
{
const dim3 clear_gridSize((params.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (params.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
clearDepthKernel<<<clear_gridSize, clear_blockSize>>>(hashData, rayCastData, cameraData);
const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1);
const dim3 blockSize(threadsPerBlock, 1);
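// One CUDA block per occupied SDF block, one thread per voxel (8*8*8 = 512).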
if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated)
nickRenderKernel << <gridSize, blockSize >> >(hashData, rayCastData, cameraData, params);
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////////////////////////////////
// ray interval splatting
/////////////////////////////////////////////////////////////////////////
__global__ void resetRayIntervalSplatKernel(RayCastData data)
{
uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X;
data.point_cloud_[idx] = make_float3(MINF);
}
extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params)
{
const dim3 gridSize(NUM_GROUPS_X, (params.m_maxNumVertices + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1); // ! todo check if need third dimension?
const dim3 blockSize(1, 1, 1);
resetRayIntervalSplatKernel<<<gridSize, blockSize>>>(data);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
__global__ void rayIntervalSplatKernel(ftl::voxhash::HashData hashData, DepthCameraData depthCameraData, RayCastData rayCastData, DepthCameraData cameraData)
{
uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X;
const ftl::voxhash::HashEntry& entry = hashData.d_hashCompactified[idx];
if (entry.ptr != ftl::voxhash::FREE_ENTRY) {
//if (!hashData.isSDFBlockInCameraFrustumApprox(entry.pos)) return;
const RayCastParams &params = c_rayCastParams;
const float4x4& viewMatrix = params.m_viewMatrix;
float3 worldCurrentVoxel = hashData.SDFBlockToWorld(entry.pos);
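// Project the 8 corners of the block's AABB into screen space; the min/max
// tree reductions below give a conservative screen-space bound and depth interval.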
float3 minv = worldCurrentVoxel - c_hashParams.m_virtualVoxelSize / 2.0f;
float3 maxv = minv + SDF_BLOCK_SIZE*c_hashParams.m_virtualVoxelSize;
float3 proj000 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, minv.y, minv.z));
float3 proj100 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, minv.y, minv.z));
float3 proj010 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, maxv.y, minv.z));
float3 proj001 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, minv.y, maxv.z));
float3 proj110 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, minv.z));
float3 proj011 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, maxv.y, maxv.z));
float3 proj101 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, minv.y, maxv.z));
float3 proj111 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, maxv.z));
// Tree Reduction Min
float3 min00 = fminf(proj000, proj100);
float3 min01 = fminf(proj010, proj001);
float3 min10 = fminf(proj110, proj011);
float3 min11 = fminf(proj101, proj111);
float3 min0 = fminf(min00, min01);
float3 min1 = fminf(min10, min11);
float3 minFinal = fminf(min0, min1);
// Tree Reduction Max
float3 max00 = fmaxf(proj000, proj100);
float3 max01 = fmaxf(proj010, proj001);
float3 max10 = fmaxf(proj110, proj011);
float3 max11 = fmaxf(proj101, proj111);
float3 max0 = fmaxf(max00, max01);
float3 max1 = fmaxf(max10, max11);
float3 maxFinal = fmaxf(max0, max1);
float depth = maxFinal.z;
if(params.m_splatMinimum == 1) {
depth = minFinal.z;
}
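// m_splatMinimum selects whether the splat stores the near (min) or far (max)
// depth of the block's projected interval.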
float depthWorld = cameraData.kinectProjToCameraZ(depth);
//uint addr = idx*4;
//rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld);
// Note (Nick) : Changed to create point cloud instead of vertex.
uint addr = idx;
rayCastData.point_cloud_[addr] = make_float3(maxFinal.x, maxFinal.y, depth);
//printf("Ray: %f\n", depth);
/*uint addr = idx*6;
rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+4] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+5] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld);*/
}
}
extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData, const RayCastData &rayCastData, const RayCastParams &rayCastParams)
{
//printf("Ray casting...\n");
const dim3 gridSize(NUM_GROUPS_X, (rayCastParams.m_numOccupiedSDFBlocks + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1);
const dim3 blockSize(1, 1, 1);
rayIntervalSplatKernel<<<gridSize, blockSize>>>(hashData, cameraData, rayCastData, cameraData);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
...@@ -101,7 +101,7 @@ PointCloud<PointXYZ>::Ptr cornersToPointCloud(const vector<cv::Point2f> &corners
for (int i = 0; i < corners_len; i++) {
double x = corners[i].x;
double y = corners[i].y;
double d = disp.at<float>((int) y, (int) x) * 1000.0f; // todo: better estimation
double d = disp.at<float>((int) y, (int) x); // * 1000.0f; // todo: better estimation
//cv::Vec4d homg_pt = Q_ * cv::Vec4d(x, y, d, 1.0);
//cv::Vec3d p = cv::Vec3d(homg_pt.val) / homg_pt[3];
...