Compare revisions: nicolaspope/ftl
Commits on Source (11)
Showing with 7147 additions and 60 deletions
@@ -14,15 +14,14 @@ stages:
 # paths:
 # - build/

-docker:
+linux:
   stage: all
   tags:
-    - docker
-  image: ubuntu:18.04
-  before_script:
-    - export DEBIAN_FRONTEND=noninteractive
-    - apt-get update -qq && apt-get install -y -qq g++ cmake git
-    - apt-get install -y -qq libopencv-dev libgoogle-glog-dev liburiparser-dev libreadline-dev libmsgpack-dev uuid-dev libpcl-dev
+# before_script:
+# - export DEBIAN_FRONTEND=noninteractive
+# - apt-get update -qq && apt-get install -y -qq g++ cmake git
+# - apt-get install -y -qq libopencv-dev libgoogle-glog-dev liburiparser-dev libreadline-dev libmsgpack-dev uuid-dev libpcl-dev
   script:
     - mkdir build
     - cd build
...
 # Need to include staged files and libs
-include_directories(${PROJECT_SOURCE_DIR}/applications/reconstruct/include)
+#include_directories(${PROJECT_SOURCE_DIR}/reconstruct/include)
 #include_directories(${PROJECT_BINARY_DIR})

 set(REPSRC
   src/main.cpp
+  src/scene_rep_hash_sdf.cu
+  src/ray_cast_sdf.cu
+  src/camera_util.cu
+  src/ray_cast_sdf.cpp
   src/registration.cpp
 )

@@ -11,6 +15,15 @@ add_executable(ftl-reconstruct ${REPSRC})
 add_dependencies(ftl-reconstruct ftlnet)
 add_dependencies(ftl-reconstruct ftlcommon)

+target_include_directories(ftl-reconstruct PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+  PRIVATE src)
+
+if (CUDA_FOUND)
+  set_property(TARGET ftl-reconstruct PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+endif()
+
 #target_include_directories(cv-node PUBLIC ${PROJECT_SOURCE_DIR}/include)
 target_link_libraries(ftl-reconstruct ftlcommon ftlrgbd Threads::Threads ${OpenCV_LIBS} glog::glog ftlnet ftlrender)
...
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/cuda_SimpleMatrixUtil.h
#pragma once
#ifndef _CUDA_SIMPLE_MATRIX_UTIL_
#define _CUDA_SIMPLE_MATRIX_UTIL_
#ifndef __CUDACC__
inline float __int_as_float(int a) {
union {int a; float b;} u;
u.a = a;
return u.b;
}
#endif
#define MINF __int_as_float(0xff800000)
#define INF __int_as_float(0x7f800000)
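// 0xff800000 and 0x7f800000 are the IEEE-754 single-precision bit patterns for
// -infinity and +infinity; reinterpreting them via __int_as_float gives
// infinity constants usable in both host and device code.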
#include <iostream>
#include <ftl/cuda_util.hpp>
//////////////////////////////
// float2x2
//////////////////////////////
class float2x2
{
public:
inline __device__ __host__ float2x2()
{
}
inline __device__ __host__ float2x2(const float values[4])
{
m11 = values[0]; m12 = values[1];
m21 = values[2]; m22 = values[3];
}
inline __device__ __host__ float2x2(const float2x2& other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
}
inline __device__ __host__ void setZero()
{
m11 = 0.0f; m12 = 0.0f;
m21 = 0.0f; m22 = 0.0f;
}
static inline __device__ __host__ float2x2 getIdentity()
{
float2x2 res;
res.setZero();
res.m11 = res.m22 = 1.0f;
return res;
}
inline __device__ __host__ float2x2& operator=(const float2x2 &other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
return *this;
}
inline __device__ __host__ float2x2 getInverse()
{
float2x2 res;
res.m11 = m22; res.m12 = -m12;
res.m21 = -m21; res.m22 = m11;
return res*(1.0f/det());
}
inline __device__ __host__ float det()
{
return m11*m22-m21*m12;
}
inline __device__ __host__ float2 operator*(const float2& v) const
{
return make_float2(m11*v.x + m12*v.y, m21*v.x + m22*v.y);
}
//! matrix scalar multiplication
inline __device__ __host__ float2x2 operator*(const float t) const
{
float2x2 res;
res.m11 = m11 * t; res.m12 = m12 * t;
res.m21 = m21 * t; res.m22 = m22 * t;
return res;
}
//! matrix matrix multiplication
inline __device__ __host__ float2x2 operator*(const float2x2& other) const
{
float2x2 res;
res.m11 = m11 * other.m11 + m12 * other.m21;
res.m12 = m11 * other.m12 + m12 * other.m22;
res.m21 = m21 * other.m11 + m22 * other.m21;
res.m22 = m21 * other.m12 + m22 * other.m22;
return res;
}
//! matrix matrix addition
inline __device__ __host__ float2x2 operator+(const float2x2& other) const
{
float2x2 res;
res.m11 = m11 + other.m11;
res.m12 = m12 + other.m12;
res.m21 = m21 + other.m21;
res.m22 = m22 + other.m22;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12;
float m21; float m22;
};
float entries[4];
float entries2[2][2];
};
};
//////////////////////////////
// float2x3
//////////////////////////////
class float2x3
{
public:
inline __device__ __host__ float2x3()
{
}
inline __device__ __host__ float2x3(const float values[6])
{
m11 = values[0]; m12 = values[1]; m13 = values[2];
m21 = values[3]; m22 = values[4]; m23 = values[5];
}
inline __device__ __host__ float2x3(const float2x3& other)
{
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
}
inline __device__ __host__ float2x3& operator=(const float2x3 &other)
{
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
return *this;
}
inline __device__ __host__ float2 operator*(const float3 &v) const
{
return make_float2(m11*v.x + m12*v.y + m13*v.z, m21*v.x + m22*v.y + m23*v.z);
}
//! matrix scalar multiplication
inline __device__ __host__ float2x3 operator*(const float t) const
{
float2x3 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t;
return res;
}
//! matrix scalar division
inline __device__ __host__ float2x3 operator/(const float t) const
{
float2x3 res;
res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t;
res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12; float m13;
float m21; float m22; float m23;
};
float entries[6];
float entries2[2][3];
};
};
//////////////////////////////
// float3x2
//////////////////////////////
class float3x2
{
public:
inline __device__ __host__ float3x2()
{
}
inline __device__ __host__ float3x2(const float values[6])
{
m11 = values[0]; m12 = values[1];
m21 = values[2]; m22 = values[3];
m31 = values[4]; m32 = values[5];
}
inline __device__ __host__ float3x2& operator=(const float3x2& other)
{
m11 = other.m11; m12 = other.m12;
m21 = other.m21; m22 = other.m22;
m31 = other.m31; m32 = other.m32;
return *this;
}
inline __device__ __host__ float3 operator*(const float2& v) const
{
return make_float3(m11*v.x + m12*v.y, m21*v.x + m22*v.y, m31*v.x + m32*v.y);
}
inline __device__ __host__ float3x2 operator*(const float t) const
{
float3x2 res;
res.m11 = m11 * t; res.m12 = m12 * t;
res.m21 = m21 * t; res.m22 = m22 * t;
res.m31 = m31 * t; res.m32 = m32 * t;
return res;
}
inline __device__ __host__ float& operator()(int i, int j)
{
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const
{
return entries2[i][j];
}
inline __device__ __host__ float2x3 getTranspose()
{
float2x3 res;
res.m11 = m11; res.m12 = m21; res.m13 = m31;
res.m21 = m12; res.m22 = m22; res.m23 = m32;
return res;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union
{
struct
{
float m11; float m12;
float m21; float m22;
float m31; float m32;
};
float entries[6];
float entries2[3][2];
};
};
inline __device__ __host__ float2x2 matMul(const float2x3& m0, const float3x2& m1)
{
float2x2 res;
res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31;
res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32;
res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31;
res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32;
return res;
}
class float3x3 {
public:
inline __device__ __host__ float3x3() {
}
inline __device__ __host__ float3x3(const float values[9]) {
m11 = values[0]; m12 = values[1]; m13 = values[2];
m21 = values[3]; m22 = values[4]; m23 = values[5];
m31 = values[6]; m32 = values[7]; m33 = values[8];
}
inline __device__ __host__ float3x3(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
explicit inline __device__ __host__ float3x3(const float2x2& other) {
m11 = other.m11; m12 = other.m12; m13 = 0.0;
m21 = other.m21; m22 = other.m22; m23 = 0.0;
m31 = 0.0; m32 = 0.0; m33 = 0.0;
}
inline __device__ __host__ float3x3& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
return *this;
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
static inline __device__ __host__ void swap(float& v0, float& v1) {
float tmp = v0;
v0 = v1;
v1 = tmp;
}
inline __device__ __host__ void transpose() {
swap(m12, m21);
swap(m13, m31);
swap(m23, m32);
}
inline __device__ __host__ float3x3 getTranspose() const {
float3x3 ret = *this;
ret.transpose();
return ret;
}
//! inverts the matrix
inline __device__ __host__ void invert() {
*this = getInverse();
}
//! computes the inverse of the matrix; the result is returned
inline __device__ __host__ float3x3 getInverse() const {
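// Adjugate (transposed cofactor) matrix, scaled by 1/det() below.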
float3x3 res;
res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7];
res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7];
res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4];
res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6];
res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6];
res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3];
res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6];
res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6];
res.entries[8] = entries[0]*entries[4] - entries[1]*entries[3];
float invDet = 1.0f/det();
return res * invDet;
}
inline __device__ __host__ void setZero(float value = 0.0f) {
m11 = m12 = m13 = value;
m21 = m22 = m23 = value;
m31 = m32 = m33 = value;
}
inline __device__ __host__ float det() const {
return
+ m11*m22*m33
+ m12*m23*m31
+ m13*m21*m32
- m31*m22*m13
- m32*m23*m11
- m33*m21*m12;
}
inline __device__ __host__ float3 getRow(unsigned int i) {
return make_float3(entries[3*i+0], entries[3*i+1], entries[3*i+2]);
}
inline __device__ __host__ void setRow(unsigned int i, float3& r) {
entries[3*i+0] = r.x;
entries[3*i+1] = r.y;
entries[3*i+2] = r.z;
}
inline __device__ __host__ void normalizeRows()
{
//#pragma unroll 3
for(unsigned int i = 0; i<3; i++)
{
float3 r = getRow(i);
float l = length(r);
r.x /= l;
r.y /= l;
r.z /= l;
setRow(i, r);
}
}
//! computes the product of two matrices (result stored in this)
inline __device__ __host__ void mult(const float3x3 &other) {
float3x3 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33;
*this = res;
}
//! computes the sum of two matrices (result stored in this)
inline __device__ __host__ void add(const float3x3 &other) {
m11 += other.m11; m12 += other.m12; m13 += other.m13;
m21 += other.m21; m22 += other.m22; m23 += other.m23;
m31 += other.m31; m32 += other.m32; m33 += other.m33;
}
//! standard matrix matrix multiplication
inline __device__ __host__ float3x3 operator*(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m13 = m11 * other.m13 + m12 * other.m23 + m13 * other.m33;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m23 = m21 * other.m13 + m22 * other.m23 + m23 * other.m33;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
res.m33 = m31 * other.m13 + m32 * other.m23 + m33 * other.m33;
return res;
}
//! standard matrix matrix multiplication
inline __device__ __host__ float3x2 operator*(const float3x2 &other) const {
float3x2 res;
res.m11 = m11 * other.m11 + m12 * other.m21 + m13 * other.m31;
res.m12 = m11 * other.m12 + m12 * other.m22 + m13 * other.m32;
res.m21 = m21 * other.m11 + m22 * other.m21 + m23 * other.m31;
res.m22 = m21 * other.m12 + m22 * other.m22 + m23 * other.m32;
res.m31 = m31 * other.m11 + m32 * other.m21 + m33 * other.m31;
res.m32 = m31 * other.m12 + m32 * other.m22 + m33 * other.m32;
return res;
}
inline __device__ __host__ float3 operator*(const float3 &v) const {
return make_float3(
m11*v.x + m12*v.y + m13*v.z,
m21*v.x + m22*v.y + m23*v.z,
m31*v.x + m32*v.y + m33*v.z
);
}
inline __device__ __host__ float3x3 operator*(const float t) const {
float3x3 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t;
res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t;
return res;
}
inline __device__ __host__ float3x3 operator+(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 + other.m11; res.m12 = m12 + other.m12; res.m13 = m13 + other.m13;
res.m21 = m21 + other.m21; res.m22 = m22 + other.m22; res.m23 = m23 + other.m23;
res.m31 = m31 + other.m31; res.m32 = m32 + other.m32; res.m33 = m33 + other.m33;
return res;
}
inline __device__ __host__ float3x3 operator-(const float3x3 &other) const {
float3x3 res;
res.m11 = m11 - other.m11; res.m12 = m12 - other.m12; res.m13 = m13 - other.m13;
res.m21 = m21 - other.m21; res.m22 = m22 - other.m22; res.m23 = m23 - other.m23;
res.m31 = m31 - other.m31; res.m32 = m32 - other.m32; res.m33 = m33 - other.m33;
return res;
}
static inline __device__ __host__ float3x3 getIdentity() {
float3x3 res;
res.setZero();
res.m11 = res.m22 = res.m33 = 1.0f;
return res;
}
static inline __device__ __host__ float3x3 getZeroMatrix() {
float3x3 res;
res.setZero();
return res;
}
static inline __device__ __host__ float3x3 getDiagonalMatrix(float diag = 1.0f) {
float3x3 res;
res.m11 = diag; res.m12 = 0.0f; res.m13 = 0.0f;
res.m21 = 0.0f; res.m22 = diag; res.m23 = 0.0f;
res.m31 = 0.0f; res.m32 = 0.0f; res.m33 = diag;
return res;
}
static inline __device__ __host__ float3x3 tensorProduct(const float3 &v, const float3 &vt) {
float3x3 res;
res.m11 = v.x * vt.x; res.m12 = v.x * vt.y; res.m13 = v.x * vt.z;
res.m21 = v.y * vt.x; res.m22 = v.y * vt.y; res.m23 = v.y * vt.z;
res.m31 = v.z * vt.x; res.m32 = v.z * vt.y; res.m33 = v.z * vt.z;
return res;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13;
float m21; float m22; float m23;
float m31; float m32; float m33;
};
float entries[9];
float entries2[3][3];
};
};
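// Example (a sketch, not from the original header): composing and applying a
// float3x3 using the operators defined above.
//   float3x3 R = float3x3::getIdentity();
//   float3 p = make_float3(1.0f, 2.0f, 3.0f);
//   float3 q = R * p;                 // identity transform: q == p
//   float3x3 Rinv = R.getInverse();   // valid only when det() != 0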
inline __device__ __host__ float2x3 matMul(const float2x3& m0, const float3x3& m1)
{
float2x3 res;
res.m11 = m0.m11*m1.m11+m0.m12*m1.m21+m0.m13*m1.m31;
res.m12 = m0.m11*m1.m12+m0.m12*m1.m22+m0.m13*m1.m32;
res.m13 = m0.m11*m1.m13+m0.m12*m1.m23+m0.m13*m1.m33;
res.m21 = m0.m21*m1.m11+m0.m22*m1.m21+m0.m23*m1.m31;
res.m22 = m0.m21*m1.m12+m0.m22*m1.m22+m0.m23*m1.m32;
res.m23 = m0.m21*m1.m13+m0.m22*m1.m23+m0.m23*m1.m33;
return res;
}
// (1x2) row matrix as float2
inline __device__ __host__ float3 matMul(const float2& m0, const float2x3& m1)
{
float3 res;
res.x = m0.x*m1.m11+m0.y*m1.m21;
res.y = m0.x*m1.m12+m0.y*m1.m22;
res.z = m0.x*m1.m13+m0.y*m1.m23;
return res;
}
class float3x4 {
public:
inline __device__ __host__ float3x4() {
}
inline __device__ __host__ float3x4(const float values[12]) {
m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3];
m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7];
m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11];
}
inline __device__ __host__ float3x4(const float3x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
}
inline __device__ __host__ float3x4(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
}
inline __device__ __host__ float3x4& operator=(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
return *this;
}
inline __device__ __host__ float3x4& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
return *this;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1)
inline __device__ __host__ float4 operator*(const float4 &v) const {
return make_float4(
m11*v.x + m12*v.y + m13*v.z + m14*v.w,
m21*v.x + m22*v.y + m23*v.z + m24*v.w,
m31*v.x + m32*v.y + m33*v.z + m34*v.w,
v.w
);
}
//! assumes an implicit 1 in w component of the input vector
inline __device__ __host__ float3 operator*(const float3 &v) const {
return make_float3(
m11*v.x + m12*v.y + m13*v.z + m14,
m21*v.x + m22*v.y + m23*v.z + m24,
m31*v.x + m32*v.y + m33*v.z + m34
);
}
//! matrix scalar multiplication
inline __device__ __host__ float3x4 operator*(const float t) const {
float3x4 res;
res.m11 = m11 * t; res.m12 = m12 * t; res.m13 = m13 * t; res.m14 = m14 * t;
res.m21 = m21 * t; res.m22 = m22 * t; res.m23 = m23 * t; res.m24 = m24 * t;
res.m31 = m31 * t; res.m32 = m32 * t; res.m33 = m33 * t; res.m34 = m34 * t;
return res;
}
inline __device__ __host__ float3x4& operator*=(const float t) {
*this = *this * t;
return *this;
}
//! matrix scalar division
inline __device__ __host__ float3x4 operator/(const float t) const {
float3x4 res;
res.m11 = m11 / t; res.m12 = m12 / t; res.m13 = m13 / t; res.m14 = m14 / t;
res.m21 = m21 / t; res.m22 = m22 / t; res.m23 = m23 / t; res.m24 = m24 / t;
res.m31 = m31 / t; res.m32 = m32 / t; res.m33 = m33 / t; res.m34 = m34 / t;
return res;
}
inline __device__ __host__ float3x4& operator/=(const float t) {
*this = *this / t;
return *this;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1)
inline __device__ __host__ float3x4 operator*(const float3x4 &other) const {
float3x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33;
res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33;
res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33;
res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34;
//res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41;
//res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42;
//res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43;
//res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44;
return res;
}
//! assumes the last line of the matrix implicitly to be (0,0,0,1); and a (0,0,0) translation of other
inline __device__ __host__ float3x4 operator*(const float3x3 &other) const {
float3x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33;
res.m14 = m14;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33;
res.m24 = m24;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33;
res.m34 = m34;
return res;
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
//! returns the translation part of the matrix
inline __device__ __host__ float3 getTranslation() {
return make_float3(m14, m24, m34);
}
//! sets only the translation part of the matrix (other values remain unchanged)
inline __device__ __host__ void setTranslation(const float3 &t) {
m14 = t.x;
m24 = t.y;
m34 = t.z;
}
//! returns the 3x3 part of the matrix
inline __device__ __host__ float3x3 getFloat3x3() {
float3x3 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33;
return ret;
}
//! sets the 3x3 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x3(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
//! inverts the matrix
inline __device__ __host__ void inverse() {
*this = getInverse();
}
//! computes the inverse of the matrix
inline __device__ __host__ float3x4 getInverse() {
float3x3 A = getFloat3x3();
A.invert();
float3 t = getTranslation();
t = A*t;
float3x4 ret;
ret.setFloat3x3(A);
ret.setTranslation(make_float3(-t.x, -t.y, -t.z)); //float3 doesn't have unary '-'... thank you cuda
return ret;
}
//! prints the matrix; only host
__host__ void print() {
std::cout <<
m11 << " " << m12 << " " << m13 << " " << m14 << std::endl <<
m21 << " " << m22 << " " << m23 << " " << m24 << std::endl <<
m31 << " " << m32 << " " << m33 << " " << m34 << std::endl <<
std::endl;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13; float m14;
float m21; float m22; float m23; float m24;
float m31; float m32; float m33; float m34;
};
float entries[12];
float entries2[3][4];
};
};
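// Note: float3x4 models an affine transform whose last row is implicitly
// (0,0,0,1). getInverse() inverts the 3x3 part and remaps the translation
// accordingly, and assumes that 3x3 part is invertible.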
class float4x4 {
public:
inline __device__ __host__ float4x4() {
}
inline __device__ __host__ float4x4(const float values[16]) {
m11 = values[0]; m12 = values[1]; m13 = values[2]; m14 = values[3];
m21 = values[4]; m22 = values[5]; m23 = values[6]; m24 = values[7];
m31 = values[8]; m32 = values[9]; m33 = values[10]; m34 = values[11];
m41 = values[12]; m42 = values[13]; m43 = values[14]; m44 = values[15];
}
inline __device__ __host__ float4x4(const float4x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
}
// implicitly assumes the last row to be (0,0,0,1)
inline __device__ __host__ float4x4(const float3x4& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
inline __device__ __host__ float4x4(const float3x3& other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
inline __device__ __host__ float4x4& operator=(const float4x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = other.m41; m42 = other.m42; m43 = other.m43; m44 = other.m44;
return *this;
}
inline __device__ __host__ float4x4& operator=(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
return *this;
}
inline __device__ __host__ float4x4& operator=(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = 0.0f;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = 0.0f;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
return *this;
}
//! not tested
inline __device__ __host__ float4x4 operator*(const float4x4 &other) const {
float4x4 res;
res.m11 = m11*other.m11 + m12*other.m21 + m13*other.m31 + m14*other.m41;
res.m12 = m11*other.m12 + m12*other.m22 + m13*other.m32 + m14*other.m42;
res.m13 = m11*other.m13 + m12*other.m23 + m13*other.m33 + m14*other.m43;
res.m14 = m11*other.m14 + m12*other.m24 + m13*other.m34 + m14*other.m44;
res.m21 = m21*other.m11 + m22*other.m21 + m23*other.m31 + m24*other.m41;
res.m22 = m21*other.m12 + m22*other.m22 + m23*other.m32 + m24*other.m42;
res.m23 = m21*other.m13 + m22*other.m23 + m23*other.m33 + m24*other.m43;
res.m24 = m21*other.m14 + m22*other.m24 + m23*other.m34 + m24*other.m44;
res.m31 = m31*other.m11 + m32*other.m21 + m33*other.m31 + m34*other.m41;
res.m32 = m31*other.m12 + m32*other.m22 + m33*other.m32 + m34*other.m42;
res.m33 = m31*other.m13 + m32*other.m23 + m33*other.m33 + m34*other.m43;
res.m34 = m31*other.m14 + m32*other.m24 + m33*other.m34 + m34*other.m44;
res.m41 = m41*other.m11 + m42*other.m21 + m43*other.m31 + m44*other.m41;
res.m42 = m41*other.m12 + m42*other.m22 + m43*other.m32 + m44*other.m42;
res.m43 = m41*other.m13 + m42*other.m23 + m43*other.m33 + m44*other.m43;
res.m44 = m41*other.m14 + m42*other.m24 + m43*other.m34 + m44*other.m44;
return res;
}
// untested
inline __device__ __host__ float4 operator*(const float4& v) const
{
return make_float4(
m11*v.x + m12*v.y + m13*v.z + m14*v.w,
m21*v.x + m22*v.y + m23*v.z + m24*v.w,
m31*v.x + m32*v.y + m33*v.z + m34*v.w,
m41*v.x + m42*v.y + m43*v.z + m44*v.w
);
}
// untested
//implicitly assumes w to be 1
inline __device__ __host__ float3 operator*(const float3& v) const
{
return make_float3(
m11*v.x + m12*v.y + m13*v.z + m14*1.0f,
m21*v.x + m22*v.y + m23*v.z + m24*1.0f,
m31*v.x + m32*v.y + m33*v.z + m34*1.0f
);
}
inline __device__ __host__ float& operator()(int i, int j) {
return entries2[i][j];
}
inline __device__ __host__ float operator()(int i, int j) const {
return entries2[i][j];
}
static inline __device__ __host__ void swap(float& v0, float& v1) {
float tmp = v0;
v0 = v1;
v1 = tmp;
}
inline __device__ __host__ void transpose() {
swap(m12, m21);
swap(m13, m31);
swap(m23, m32);
swap(m41, m14);
swap(m42, m24);
swap(m43, m34);
}
inline __device__ __host__ float4x4 getTranspose() const {
float4x4 ret = *this;
ret.transpose();
return ret;
}
inline __device__ __host__ void invert() {
*this = getInverse();
}
//! return the inverse matrix; but does not change the current matrix
inline __device__ __host__ float4x4 getInverse() const {
float inv[16];
inv[0] = entries[5] * entries[10] * entries[15] -
entries[5] * entries[11] * entries[14] -
entries[9] * entries[6] * entries[15] +
entries[9] * entries[7] * entries[14] +
entries[13] * entries[6] * entries[11] -
entries[13] * entries[7] * entries[10];
inv[4] = -entries[4] * entries[10] * entries[15] +
entries[4] * entries[11] * entries[14] +
entries[8] * entries[6] * entries[15] -
entries[8] * entries[7] * entries[14] -
entries[12] * entries[6] * entries[11] +
entries[12] * entries[7] * entries[10];
inv[8] = entries[4] * entries[9] * entries[15] -
entries[4] * entries[11] * entries[13] -
entries[8] * entries[5] * entries[15] +
entries[8] * entries[7] * entries[13] +
entries[12] * entries[5] * entries[11] -
entries[12] * entries[7] * entries[9];
inv[12] = -entries[4] * entries[9] * entries[14] +
entries[4] * entries[10] * entries[13] +
entries[8] * entries[5] * entries[14] -
entries[8] * entries[6] * entries[13] -
entries[12] * entries[5] * entries[10] +
entries[12] * entries[6] * entries[9];
inv[1] = -entries[1] * entries[10] * entries[15] +
entries[1] * entries[11] * entries[14] +
entries[9] * entries[2] * entries[15] -
entries[9] * entries[3] * entries[14] -
entries[13] * entries[2] * entries[11] +
entries[13] * entries[3] * entries[10];
inv[5] = entries[0] * entries[10] * entries[15] -
entries[0] * entries[11] * entries[14] -
entries[8] * entries[2] * entries[15] +
entries[8] * entries[3] * entries[14] +
entries[12] * entries[2] * entries[11] -
entries[12] * entries[3] * entries[10];
inv[9] = -entries[0] * entries[9] * entries[15] +
entries[0] * entries[11] * entries[13] +
entries[8] * entries[1] * entries[15] -
entries[8] * entries[3] * entries[13] -
entries[12] * entries[1] * entries[11] +
entries[12] * entries[3] * entries[9];
inv[13] = entries[0] * entries[9] * entries[14] -
entries[0] * entries[10] * entries[13] -
entries[8] * entries[1] * entries[14] +
entries[8] * entries[2] * entries[13] +
entries[12] * entries[1] * entries[10] -
entries[12] * entries[2] * entries[9];
inv[2] = entries[1] * entries[6] * entries[15] -
entries[1] * entries[7] * entries[14] -
entries[5] * entries[2] * entries[15] +
entries[5] * entries[3] * entries[14] +
entries[13] * entries[2] * entries[7] -
entries[13] * entries[3] * entries[6];
inv[6] = -entries[0] * entries[6] * entries[15] +
entries[0] * entries[7] * entries[14] +
entries[4] * entries[2] * entries[15] -
entries[4] * entries[3] * entries[14] -
entries[12] * entries[2] * entries[7] +
entries[12] * entries[3] * entries[6];
inv[10] = entries[0] * entries[5] * entries[15] -
entries[0] * entries[7] * entries[13] -
entries[4] * entries[1] * entries[15] +
entries[4] * entries[3] * entries[13] +
entries[12] * entries[1] * entries[7] -
entries[12] * entries[3] * entries[5];
inv[14] = -entries[0] * entries[5] * entries[14] +
entries[0] * entries[6] * entries[13] +
entries[4] * entries[1] * entries[14] -
entries[4] * entries[2] * entries[13] -
entries[12] * entries[1] * entries[6] +
entries[12] * entries[2] * entries[5];
inv[3] = -entries[1] * entries[6] * entries[11] +
entries[1] * entries[7] * entries[10] +
entries[5] * entries[2] * entries[11] -
entries[5] * entries[3] * entries[10] -
entries[9] * entries[2] * entries[7] +
entries[9] * entries[3] * entries[6];
inv[7] = entries[0] * entries[6] * entries[11] -
entries[0] * entries[7] * entries[10] -
entries[4] * entries[2] * entries[11] +
entries[4] * entries[3] * entries[10] +
entries[8] * entries[2] * entries[7] -
entries[8] * entries[3] * entries[6];
inv[11] = -entries[0] * entries[5] * entries[11] +
entries[0] * entries[7] * entries[9] +
entries[4] * entries[1] * entries[11] -
entries[4] * entries[3] * entries[9] -
entries[8] * entries[1] * entries[7] +
entries[8] * entries[3] * entries[5];
inv[15] = entries[0] * entries[5] * entries[10] -
entries[0] * entries[6] * entries[9] -
entries[4] * entries[1] * entries[10] +
entries[4] * entries[2] * entries[9] +
entries[8] * entries[1] * entries[6] -
entries[8] * entries[2] * entries[5];
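// Laplace expansion along the first row, reusing the cofactors already
// computed in inv[0], inv[4], inv[8] and inv[12].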
float matrixDet = entries[0] * inv[0] + entries[1] * inv[4] + entries[2] * inv[8] + entries[3] * inv[12];
float matrixDetr = 1.0f / matrixDet;
float4x4 res;
for (unsigned int i = 0; i < 16; i++) {
res.entries[i] = inv[i] * matrixDetr;
}
return res;
}
//! returns the 3x3 part of the matrix
inline __device__ __host__ float3x3 getFloat3x3() {
float3x3 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33;
return ret;
}
//! sets the 3x3 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x3(const float3x3 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13;
m21 = other.m21; m22 = other.m22; m23 = other.m23;
m31 = other.m31; m32 = other.m32; m33 = other.m33;
}
//! sets the matrix to identity
inline __device__ __host__ void setIdentity()
{
m11 = 1.0f; m12 = 0.0f; m13 = 0.0f; m14 = 0.0f;
m21 = 0.0f; m22 = 1.0f; m23 = 0.0f; m24 = 0.0f;
m31 = 0.0f; m32 = 0.0f; m33 = 1.0f; m34 = 0.0f;
m41 = 0.0f; m42 = 0.0f; m43 = 0.0f; m44 = 1.0f;
}
//! sets all entries of the matrix to the given value
inline __device__ __host__ void setValue(float v)
{
m11 = v; m12 = v; m13 = v; m14 = v;
m21 = v; m22 = v; m23 = v; m24 = v;
m31 = v; m32 = v; m33 = v; m34 = v;
m41 = v; m42 = v; m43 = v; m44 = v;
}
//! returns the 3x4 part of the matrix
inline __device__ __host__ float3x4 getFloat3x4() {
float3x4 ret;
ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; ret.m14 = m14;
ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; ret.m24 = m24;
ret.m31 = m31; ret.m32 = m32; ret.m33 = m33; ret.m34 = m34;
return ret;
}
//! sets the 3x4 part of the matrix (other values remain unchanged)
inline __device__ __host__ void setFloat3x4(const float3x4 &other) {
m11 = other.m11; m12 = other.m12; m13 = other.m13; m14 = other.m14;
m21 = other.m21; m22 = other.m22; m23 = other.m23; m24 = other.m24;
m31 = other.m31; m32 = other.m32; m33 = other.m33; m34 = other.m34;
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
union {
struct {
float m11; float m12; float m13; float m14;
float m21; float m22; float m23; float m24;
float m31; float m32; float m33; float m34;
float m41; float m42; float m43; float m44;
};
float entries[16];
float entries2[4][4];
};
};
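// Example (a sketch, not from the original header): a float4x4 built from a
// 3x3 rotation part, applied with an implicit w = 1.
//   float4x4 T(float3x3::getIdentity());   // last row becomes (0,0,0,1)
//   float3 p = make_float3(1.0f, 2.0f, 3.0f);
//   float3 q = T * p;                      // w implicitly 1
//   float4x4 Tinv = T.getInverse();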
//////////////////////////////
// matNxM
//////////////////////////////
template<unsigned int N, unsigned int M>
class matNxM
{
public:
//////////////////////////////
// Initialization
//////////////////////////////
inline __device__ __host__ matNxM()
{
}
inline __device__ __host__ matNxM(float* values)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i];
}
inline __device__ __host__ matNxM(const float* values)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = values[i];
}
inline __device__ __host__ matNxM(const matNxM& other)
{
(*this) = other;
}
inline __device__ __host__ matNxM<N,M>& operator=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = other.entries[i];
return *this;
}
inline __device__ __host__ void setZero()
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] = 0.0f;
}
inline __device__ __host__ void setIdentity()
{
setZero();
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<min(N, M); i++) entries2D[i][i] = 1.0f;
}
static inline __device__ __host__ matNxM<N, M> getIdentity()
{
matNxM<N, M> R; R.setIdentity();
return R;
}
//////////////////////////////
// Conversion
//////////////////////////////
// declare generic constructors for compile time checking of matrix size
template<class B>
explicit inline __device__ __host__ matNxM(const B& other);
template<class B>
explicit inline __device__ __host__ matNxM(const B& other0, const B& other1);
// declare generic casts for compile time checking of matrix size
inline __device__ __host__ operator float();
inline __device__ __host__ operator float2();
inline __device__ __host__ operator float3();
inline __device__ __host__ operator float4();
inline __device__ __host__ operator float2x2();
inline __device__ __host__ operator float3x3();
inline __device__ __host__ operator float4x4();
//////////////////////////////
// Matrix - Matrix Multiplication
//////////////////////////////
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ matNxM<N,MOther> operator*(const matNxM<NOther,MOther>& other) const
{
cudaAssert(M == NOther);
matNxM<N,MOther> res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
float sum = 0.0f;
__CONDITIONAL_UNROLL__
for(unsigned int k = 0; k<M; k++)
{
sum += (*this)(i, k)*other(k, j);
}
res(i, j) = sum;
}
}
return res;
}
//////////////////////////////
// Matrix - Inversion
//////////////////////////////
inline __device__ __host__ float det() const;
inline __device__ __host__ matNxM<N, M> getInverse() const;
//////////////////////////////
// Matrix - Transpose
//////////////////////////////
inline __device__ __host__ matNxM<M,N> getTranspose() const
{
matNxM<M,N> res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<M; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<N; j++)
{
res(i, j) = (*this)(j, i);
}
}
return res;
}
inline __device__ void printCUDA() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
printf("%f ", (*this)(i, j));
}
printf("\n");
}
}
inline __device__ bool checkMINF() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
if((*this)(i, j) == MINF) return true;
}
}
return false;
}
inline __device__ bool checkINF() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
if((*this)(i, j) == INF) return true;
}
}
return false;
}
inline __device__ bool checkQNAN() const
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<M; j++)
{
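// NaN is the only IEEE-754 value that compares unequal to itself.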
if((*this)(i, j) != (*this)(i, j)) return true;
}
}
return false;
}
//////////////////////////////
// Matrix - Matrix Addition
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator+(const matNxM<N,M>& other) const
{
matNxM<N,M> res = (*this);
res+=other;
return res;
}
inline __device__ __host__ matNxM<N,M>& operator+=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] += other.entries[i];
return (*this);
}
//////////////////////////////
// Matrix - Negation
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator-() const
{
matNxM<N,M> res = (*this)*(-1.0f);
return res;
}
//////////////////////////////
// Matrix - Matrix Subtraction
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator-(const matNxM<N,M>& other) const
{
matNxM<N,M> res = (*this);
res-=other;
return res;
}
inline __device__ __host__ matNxM<N,M>& operator-=(const matNxM<N,M>& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] -= other.entries[i];
return (*this);
}
//////////////////////////////
// Matrix - Scalar Multiplication
//////////////////////////////
inline __device__ __host__ matNxM<N,M> operator*(const float t) const
{
matNxM<N,M> res = (*this);
res*=t;
return res;
}
inline __device__ __host__ matNxM<N, M>& operator*=(const float t)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] *= t;
return (*this);
}
//////////////////////////////
// Matrix - Scalar Division
//////////////////////////////
inline __device__ __host__ matNxM<N, M> operator/(const float t) const
{
matNxM<N, M> res = (*this);
res/=t;
return res;
}
inline __device__ __host__ matNxM<N, M>& operator/=(const float t)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<N*M; i++) entries[i] /= t;
return (*this);
}
//////////////////////////
// Element Access
//////////////////////////
inline __device__ __host__ unsigned int nRows()
{
return N;
}
inline __device__ __host__ unsigned int nCols()
{
return M;
}
inline __device__ __host__ float& operator()(unsigned int i, unsigned int j)
{
cudaAssert(i<N && j<M);
return entries2D[i][j];
}
inline __device__ __host__ float operator()(unsigned int i, unsigned int j) const
{
cudaAssert(i<N && j<M);
return entries2D[i][j];
}
inline __device__ __host__ float& operator()(unsigned int i)
{
cudaAssert(i<N*M);
return entries[i];
}
inline __device__ __host__ float operator()(unsigned int i) const
{
cudaAssert(i<N*M);
return entries[i];
}
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ void getBlock(unsigned int xStart, unsigned int yStart, matNxM<NOther, MOther>& res) const
{
cudaAssert(xStart+NOther <= N && yStart+MOther <= M);
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<NOther; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
res(i, j) = (*this)(xStart+i, yStart+j);
}
}
}
template<unsigned int NOther, unsigned int MOther>
inline __device__ __host__ void setBlock(matNxM<NOther, MOther>& input, unsigned int xStart, unsigned int yStart)
{
cudaAssert(xStart+NOther <= N && yStart+MOther <= M);
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<NOther; i++)
{
__CONDITIONAL_UNROLL__
for(unsigned int j = 0; j<MOther; j++)
{
(*this)(xStart+i, yStart+j) = input(i, j);
}
}
}
inline __device__ __host__ const float* ptr() const {
return entries;
}
inline __device__ __host__ float* ptr() {
return entries;
}
// Operators
inline __device__ __host__ float norm1DSquared() const
{
cudaAssert(M==1 || N==1);
float sum = 0.0f;
for(unsigned int i = 0; i<max(N, M); i++) sum += entries[i]*entries[i];
return sum;
}
inline __device__ __host__ float norm1D() const
{
return sqrt(norm1DSquared());
}
private:
union
{
float entries[N*M];
float entries2D[N][M];
};
};
//////////////////////////////
// Scalar - Matrix Multiplication
//////////////////////////////
template<unsigned int N, unsigned int M>
inline __device__ __host__ matNxM<N,M> operator*(const float t, const matNxM<N, M>& mat)
{
matNxM<N,M> res = mat;
res*=t;
return res;
}
//////////////////////////////
// Matrix Inversion
//////////////////////////////
template<>
inline __device__ __host__ float matNxM<3, 3>::det() const
{
const float& m11 = entries2D[0][0];
const float& m12 = entries2D[0][1];
const float& m13 = entries2D[0][2];
const float& m21 = entries2D[1][0];
const float& m22 = entries2D[1][1];
const float& m23 = entries2D[1][2];
const float& m31 = entries2D[2][0];
const float& m32 = entries2D[2][1];
const float& m33 = entries2D[2][2];
return m11*m22*m33 + m12*m23*m31 + m13*m21*m32 - m31*m22*m13 - m32*m23*m11 - m33*m21*m12;
}
template<>
inline __device__ __host__ matNxM<3, 3> matNxM<3, 3>::getInverse() const
{
matNxM<3, 3> res;
res.entries[0] = entries[4]*entries[8] - entries[5]*entries[7];
res.entries[1] = -entries[1]*entries[8] + entries[2]*entries[7];
res.entries[2] = entries[1]*entries[5] - entries[2]*entries[4];
res.entries[3] = -entries[3]*entries[8] + entries[5]*entries[6];
res.entries[4] = entries[0]*entries[8] - entries[2]*entries[6];
res.entries[5] = -entries[0]*entries[5] + entries[2]*entries[3];
res.entries[6] = entries[3]*entries[7] - entries[4]*entries[6];
res.entries[7] = -entries[0]*entries[7] + entries[1]*entries[6];
res.entries[8] = entries[0]*entries[4] - entries[1]*entries[3];
return res*(1.0f/det());
}
template<>
inline __device__ __host__ float matNxM<2, 2>::det() const
{
return (*this)(0, 0)*(*this)(1, 1)-(*this)(1, 0)*(*this)(0, 1);
}
template<>
inline __device__ __host__ matNxM<2, 2> matNxM<2, 2>::getInverse() const
{
matNxM<2, 2> res;
res(0, 0) = (*this)(1, 1); res(0, 1) = -(*this)(0, 1);
res(1, 0) = -(*this)(1, 0); res(1, 1) = (*this)(0, 0);
return res*(1.0f/det());
}
//////////////////////////////
// Conversion
//////////////////////////////
// To Matrix from floatNxN
template<>
template<>
inline __device__ __host__ matNxM<1, 1>::matNxM(const float& other)
{
entries[0] = other;
}
// To Matrix from floatNxN
template<>
template<>
inline __device__ __host__ matNxM<2, 2>::matNxM(const float2x2& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<4; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<3, 3>::matNxM(const float3x3& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<9; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<4, 4>::matNxM(const float4x4& other)
{
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<16; i++) entries[i] = other.entries[i];
}
template<>
template<>
inline __device__ __host__ matNxM<3, 2>::matNxM(const float3& col0, const float3& col1)
{
entries2D[0][0] = col0.x; entries2D[0][1] = col1.x;
entries2D[1][0] = col0.y; entries2D[1][1] = col1.y;
entries2D[2][0] = col0.z; entries2D[2][1] = col1.z;
}
// To floatNxM from Matrix
template<>
inline __device__ __host__ matNxM<4, 4>::operator float4x4()
{
float4x4 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<16; i++) res.entries[i] = entries[i];
return res;
}
template<>
inline __device__ __host__ matNxM<3, 3>::operator float3x3()
{
float3x3 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<9; i++) res.entries[i] = entries[i];
return res;
}
template<>
inline __device__ __host__ matNxM<2, 2>::operator float2x2()
{
float2x2 res;
__CONDITIONAL_UNROLL__
for(unsigned int i = 0; i<4; i++) res.entries[i] = entries[i];
return res;
}
// To Matrix from floatN
template<>
template<>
inline __device__ __host__ matNxM<2, 1>::matNxM(const float2& other)
{
entries[0] = other.x;
entries[1] = other.y;
}
template<>
template<>
inline __device__ __host__ matNxM<3, 1>::matNxM(const float3& other)
{
entries[0] = other.x;
entries[1] = other.y;
entries[2] = other.z;
}
template<>
template<>
inline __device__ __host__ matNxM<4, 1>::matNxM(const float4& other)
{
entries[0] = other.x;
entries[1] = other.y;
entries[2] = other.z;
entries[3] = other.w;
}
// To floatN from Matrix
template<>
inline __device__ __host__ matNxM<1, 1>::operator float()
{
return entries[0];
}
template<>
inline __device__ __host__ matNxM<2, 1>::operator float2()
{
return make_float2(entries[0], entries[1]);
}
template<>
inline __device__ __host__ matNxM<3, 1>::operator float3()
{
return make_float3(entries[0], entries[1], entries[2]);
}
template<>
inline __device__ __host__ matNxM<4, 1>::operator float4()
{
return make_float4(entries[0], entries[1], entries[2], entries[3]);
}
//////////////////////////////
// Typedefs
//////////////////////////////
typedef matNxM<9, 3> mat9x3;
typedef matNxM<3, 9> mat3x9;
typedef matNxM<9, 1> mat9x1;
typedef matNxM<1, 9> mat1x9;
typedef matNxM<6, 6> mat6x6;
typedef matNxM<6, 1> mat6x1;
typedef matNxM<1, 6> mat1x6;
typedef matNxM<3, 6> mat3x6;
typedef matNxM<6, 3> mat6x3;
typedef matNxM<4, 4> mat4x4;
typedef matNxM<4, 1> mat4x1;
typedef matNxM<1, 4> mat1x4;
typedef matNxM<3, 3> mat3x3;
typedef matNxM<2, 3> mat2x3;
typedef matNxM<3, 2> mat3x2;
typedef matNxM<2, 2> mat2x2;
typedef matNxM<1, 2> mat1x2;
typedef matNxM<2, 1> mat2x1;
typedef matNxM<1, 3> mat1x3;
typedef matNxM<3, 1> mat3x1;
typedef matNxM<1, 1> mat1x1;
typedef matNxM<4, 4> mat4f;
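// Example (a sketch, not from the original header): the typedefs above combined
// with the generic matNxM operators.
//   mat3x3 A = mat3x3::getIdentity();
//   mat3x1 v(make_float3(1.0f, 2.0f, 3.0f));   // float3 conversion constructor
//   mat3x1 w = A * v;                          // (3x3)*(3x1) -> (3x1)
//   mat1x3 vt = v.getTranspose();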
#endif
\ No newline at end of file
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation and
* any modifications thereto. Any use, reproduction, disclosure, or distribution
* of this software and related documentation without an express license
* agreement from NVIDIA Corporation is strictly prohibited.
*
*/
/*
This file implements common mathematical operations on vector types
(float3, float4 etc.) since these are not provided as standard by CUDA.
The syntax is modelled on the Cg standard library.
*/
#ifndef _FTL_CUDA_OPERATORS_HPP_
#define _FTL_CUDA_OPERATORS_HPP_
#include <cuda_runtime.h>
////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;
#ifndef __CUDACC__
#include <math.h>
inline float fminf(float a, float b)
{
return a < b ? a : b;
}
inline float fmaxf(float a, float b)
{
return a > b ? a : b;
}
inline int max(int a, int b)
{
return a > b ? a : b;
}
inline int min(int a, int b)
{
return a < b ? a : b;
}
inline float rsqrtf(float x)
{
return 1.0f / sqrtf(x);
}
#endif
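// The fallbacks above mirror CUDA built-ins so this header also compiles in
// plain C++ (host-only) translation units.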
// float functions
////////////////////////////////////////////////////////////////////////////////
// lerp
inline __device__ __host__ float lerp(float a, float b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float clamp(float f, float a, float b)
{
return fmaxf(a, fminf(f, b));
}
inline __device__ __host__ int sign(float x) {
int t = x<0 ? -1 : 0;
return x > 0 ? 1 : t;
}
// int2 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ int2 make_int2(float2 f)
{
int2 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); return t;
}
inline __host__ __device__ uint2 make_uint2(int2 i)
{
uint2 t; t.x = static_cast<uint>(i.x); t.y = static_cast<uint>(i.y); return t;
}
// negate
inline __host__ __device__ int2 operator-(int2 &a)
{
return make_int2(-a.x, -a.y);
}
// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
a.x *= s; a.y *= s;
}
// float2 functions
////////////////////////////////////////////////////////////////////////////////
// negate
inline __host__ __device__ float2 operator-(float2 &a)
{
return make_float2(-a.x, -a.y);
}
// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
a.x -= b.x; a.y -= b.y;
}
// multiply
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
a.x *= s; a.y *= s;
}
// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
return make_float2(a.x / b.x, a.y / b.y);
}
inline __host__ __device__ float2 operator/(float2 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float2 operator/(float s, float2 a)
{
return make_float2(s / a.x, s / a.y);
}
inline __host__ __device__ void operator/=(float2 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
return a.x * b.x + a.y * b.y;
}
// length
inline __host__ __device__ float length(float2 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float2 normalize(float2 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float2 floor(const float2 v)
{
return make_float2(floor(v.x), floor(v.y));
}
// reflect
inline __host__ __device__ float2 reflect(float2 i, float2 n)
{
return i - 2.0f * n * dot(n,i);
}
// absolute value
inline __host__ __device__ float2 fabs(float2 v)
{
return make_float2(fabs(v.x), fabs(v.y));
}
inline __device__ __host__ int2 sign(float2 f) {
return make_int2(sign(f.x), sign(f.y));
}
// float3 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ float3 make_float3(int3 i)
{
float3 t; t.x = static_cast<float>(i.x); t.y = static_cast<float>(i.y); t.z = static_cast<float>(i.z); return t;
}
inline __host__ __device__ float3 make_float3(float4 f)
{
return make_float3(f.x,f.y,f.z);
}
inline __host__ __device__ float3 make_float3(uchar3 c)
{
return make_float3(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z));
}
inline __host__ __device__ uchar3 make_uchar3(float3 f)
{
return make_uchar3(static_cast<unsigned char>(f.x), static_cast<unsigned char>(f.y), static_cast<unsigned char>(f.z));
}
inline __host__ __device__ float3 make_float3(float f)
{
return make_float3(f,f,f);
}
// negate
inline __host__ __device__ float3 operator-(const float3 &a)
{
return make_float3(-a.x, -a.y, -a.z);
}
// min
static __inline__ __host__ __device__ float3 fminf(float3 a, float3 b)
{
return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
}
// max
static __inline__ __host__ __device__ float3 fmaxf(float3 a, float3 b)
{
return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
}
// addition
inline __host__ __device__ float3 operator+(float3 a, float3 b)
{
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ float3 operator+(float3 a, float b)
{
return make_float3(a.x + b, a.y + b, a.z + b);
}
inline __host__ __device__ void operator+=(float3 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ float3 operator-(float3 a, float3 b)
{
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ float3 operator-(float3 a, float b)
{
return make_float3(a.x - b, a.y - b, a.z - b);
}
inline __host__ __device__ void operator-=(float3 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ float3 operator*(float3 a, float3 b)
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ float3 operator*(float3 a, float s)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ float3 operator*(float s, float3 a)
{
return make_float3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(float3 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ float3 operator/(float3 a, float3 b)
{
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ float3 operator/(float3 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float3 operator/(float s, float3 a)
{
return make_float3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(float3 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float3 clamp(float3 v, float a, float b)
{
return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
{
return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// dot product
inline __host__ __device__ float dot(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
// cross product
inline __host__ __device__ float3 cross(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
// length
inline __host__ __device__ float length(float3 v)
{
return sqrtf(dot(v, v));
}
// normalize
inline __host__ __device__ float3 normalize(float3 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float3 floor(const float3 v)
{
return make_float3(floor(v.x), floor(v.y), floor(v.z));
}
// reflect
inline __host__ __device__ float3 reflect(float3 i, float3 n)
{
return i - 2.0f * n * dot(n,i);
}
// absolute value
inline __host__ __device__ float3 fabs(float3 v)
{
return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
}
inline __device__ __host__ int3 sign(float3 f) {
return make_int3(sign(f.x), sign(f.y), sign(f.z));
}
// float4 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ float4 make_float4(float a)
{
return make_float4(a,a,a,a);
}
inline __host__ __device__ float4 make_float4(float3 f, float a)
{
return make_float4(f.x,f.y,f.z,a);
}
// negate
inline __host__ __device__ float4 operator-(const float4 &a)
{
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
// min
static __inline__ __host__ __device__ float4 fminf(float4 a, float4 b)
{
return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
}
// max
static __inline__ __host__ __device__ float4 fmaxf(float4 a, float4 b)
{
return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
}
// addition
inline __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
inline __host__ __device__ void operator+=(float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
// subtract
inline __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
inline __host__ __device__ void operator-=(float4 &a, float4 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
// multiply
inline __host__ __device__ float4 operator*(float4 a, float s)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ float4 operator*(float s, float4 a)
{
return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
}
inline __host__ __device__ void operator*=(float4 &a, float s)
{
a.x *= s; a.y *= s; a.z *= s; a.w *= s;
}
// divide
inline __host__ __device__ float4 operator/(float4 a, float4 b)
{
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
inline __host__ __device__ float4 operator/(float4 a, float s)
{
float inv = 1.0f / s;
return a * inv;
}
inline __host__ __device__ float4 operator/(float s, float4 a)
{
return make_float4(s / a.x, s / a.y, s / a.z, s / a.w);
}
inline __host__ __device__ void operator/=(float4 &a, float s)
{
float inv = 1.0f / s;
a *= inv;
}
// lerp
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
return a + t*(b-a);
}
// clamp
inline __device__ __host__ float4 clamp(float4 v, float a, float b)
{
return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
}
inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
{
return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
}
// dot product
inline __host__ __device__ float dot(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
// length
inline __host__ __device__ float length(float4 r)
{
return sqrtf(dot(r, r));
}
// normalize
inline __host__ __device__ float4 normalize(float4 v)
{
float invLen = rsqrtf(dot(v, v));
return v * invLen;
}
// floor
inline __host__ __device__ float4 floor(const float4 v)
{
return make_float4(floor(v.x), floor(v.y), floor(v.z), floor(v.w));
}
// absolute value
inline __host__ __device__ float4 fabs(float4 v)
{
return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
}
// int3 functions
////////////////////////////////////////////////////////////////////////////////
inline __host__ __device__ int3 make_int3(float3 f)
{
int3 t; t.x = static_cast<int>(f.x); t.y = static_cast<int>(f.y); t.z = static_cast<int>(f.z); return t;
}
inline __host__ __device__ int3 make_int3(uint3 i)
{
int3 t; t.x = static_cast<int>(i.x); t.y = static_cast<int>(i.y); t.z = static_cast<int>(i.z); return t;
}
inline __host__ __device__ int3 make_int3(int i)
{
int3 t; t.x = i; t.y = i; t.z = i; return t;
}
// negate
inline __host__ __device__ int3 operator-(const int3 &a)
{
return make_int3(-a.x, -a.y, -a.z);
}
// min
inline __host__ __device__ int3 min(int3 a, int3 b)
{
return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ int3 max(int3 a, int3 b)
{
return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ int3 operator+(int3 a, int3 b)
{
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(int3 &a, int3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ int3 operator-(int3 a, int3 b)
{
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(int3 &a, int3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ int3 operator*(int3 a, int3 b)
{
return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ int3 operator*(int3 a, int s)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 a)
{
return make_int3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(int3 &a, int s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ int3 operator/(int3 a, int3 b)
{
return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ int3 operator/(int3 a, int s)
{
return make_int3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ int3 operator/(int s, int3 a)
{
return make_int3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(int3 &a, int s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ int clamp(int f, int a, int b)
{
return max(a, min(f, b));
}
inline __device__ __host__ int3 clamp(int3 v, int a, int b)
{
return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
{
return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
// uint3 functions
////////////////////////////////////////////////////////////////////////////////
// min
inline __host__ __device__ uint3 min(uint3 a, uint3 b)
{
return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
}
// max
inline __host__ __device__ uint3 max(uint3 a, uint3 b)
{
return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
}
// addition
inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
{
return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
}
inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
// subtract
inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
{
return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
}
inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
// multiply
inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
{
return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
}
inline __host__ __device__ uint3 operator*(uint3 a, uint s)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 a)
{
return make_uint3(a.x * s, a.y * s, a.z * s);
}
inline __host__ __device__ void operator*=(uint3 &a, uint s)
{
a.x *= s; a.y *= s; a.z *= s;
}
// divide
inline __host__ __device__ uint3 operator/(uint3 a, uint3 b)
{
return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
}
inline __host__ __device__ uint3 operator/(uint3 a, uint s)
{
return make_uint3(a.x / s, a.y / s, a.z / s);
}
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
return make_uint3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(uint3 &a, uint s)
{
a.x /= s; a.y /= s; a.z /= s;
}
// clamp
inline __device__ __host__ uint clamp(uint f, uint a, uint b)
{
return max(a, min(f, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
{
return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
{
return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
}
#endif // _FTL_CUDA_OPERATORS_HPP_
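// Usage sketch (illustrative only, not part of the original header): with the
// overloads above, CUDA vector types compose like small value types.
//   float3 a = make_float3(1.0f, 2.0f, 3.0f);
//   float3 n = normalize(a);                              // unit-length copy of a
//   float3 r = reflect(n, make_float3(0.0f, 1.0f, 0.0f)); // mirror about +Y
//   float  d = dot(a, clamp(r, -1.0f, 1.0f));             // componentwise clamp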
#pragma once
#ifndef _CUDA_UTIL_
#define _CUDA_UTIL_
#undef max
#undef min
#include <cuda_runtime.h>
#include <ftl/cuda_operators.hpp>
// Enable run time assertion checking in kernel code
#define cudaAssert(condition) do { if (!(condition)) { printf("ASSERT: %s %s\n", #condition, __FILE__); } } while (0)
//#define cudaAssert(condition)
#if defined(__CUDA_ARCH__)
// _Pragma is required here; a bare #pragma cannot appear inside a macro definition.
#define __CONDITIONAL_UNROLL__ _Pragma("unroll")
#else
#define __CONDITIONAL_UNROLL__
#endif
#endif
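// Illustrative sketch (an assumption, not the project's actual definition):
// the cudaSafeCall used throughout these headers is provided elsewhere
// (ftl/cuda_common.hpp); a minimal equivalent checks the CUDA runtime return
// code and aborts with a readable message.
#include <cstdio>
#include <cstdlib>
inline void cudaSafeCallSketch(cudaError_t err, const char *file, int line) {
	// Fail loudly on any runtime error, naming the offending call site.
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error '%s' at %s:%d\n", cudaGetErrorString(err), file, line);
		abort();
	}
}
#define CUDA_SAFE_CALL_SKETCH(call) cudaSafeCallSketch((call), __FILE__, __LINE__)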
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/DepthCameraUtil.h
#pragma once
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/cuda_common.hpp>
#include <ftl/depth_camera_params.hpp>
extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params);
extern __constant__ DepthCameraParams c_depthCameraParams;
struct DepthCameraData {
///////////////
// Host part //
///////////////
__device__ __host__
DepthCameraData() {
/*d_depthData = NULL;
d_colorData = NULL;
d_depthArray = NULL;
d_colorArray = NULL;*/
depth_mat_ = nullptr;
colour_mat_ = nullptr;
depth_tex_ = nullptr;
colour_tex_ = nullptr;
}
__host__
void alloc(const DepthCameraParams& params) { //! todo resizing???
/*cudaSafeCall(cudaMalloc(&d_depthData, sizeof(float) * params.m_imageWidth * params.m_imageHeight));
cudaSafeCall(cudaMalloc(&d_colorData, sizeof(float4) * params.m_imageWidth * params.m_imageHeight));
h_depthChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaSafeCall(cudaMallocArray(&d_depthArray, &h_depthChannelDesc, params.m_imageWidth, params.m_imageHeight));
h_colorChannelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
cudaSafeCall(cudaMallocArray(&d_colorArray, &h_colorChannelDesc, params.m_imageWidth, params.m_imageHeight));*/
std::cout << "Create texture objects: " << params.m_imageWidth << "," << params.m_imageHeight << std::endl;
depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1);
colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4);
depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_);
colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_);
depth_obj_ = depth_tex_->cudaTexture();
colour_obj_ = colour_tex_->cudaTexture();
}
__host__
void updateParams(const DepthCameraParams& params) {
updateConstantDepthCameraParams(params);
}
__host__
void updateData(const cv::Mat &depth, const cv::Mat &rgb) {
depth_mat_->upload(depth);
colour_mat_->upload(rgb);
}
__host__
void free() {
/*if (d_depthData) cudaSafeCall(cudaFree(d_depthData));
if (d_colorData) cudaSafeCall(cudaFree(d_colorData));
if (d_depthArray) cudaSafeCall(cudaFreeArray(d_depthArray));
if (d_colorArray) cudaSafeCall(cudaFreeArray(d_colorArray));*/
/*d_depthData = NULL;
d_colorData = NULL;
d_depthArray = NULL;
d_colorArray = NULL;*/
if (depth_mat_) delete depth_mat_;
if (colour_mat_) delete colour_mat_;
delete depth_tex_;
delete colour_tex_;
}
/////////////////
// Device part //
/////////////////
static inline const DepthCameraParams& params() {
return c_depthCameraParams;
}
///////////////////////////////////////////////////////////////
// Camera to Screen
///////////////////////////////////////////////////////////////
__device__
static inline float2 cameraToKinectScreenFloat(const float3& pos) {
//return make_float2(pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx, c_depthCameraParams.my - pos.y*c_depthCameraParams.fy/pos.z);
return make_float2(
pos.x*c_depthCameraParams.fx/pos.z + c_depthCameraParams.mx,
pos.y*c_depthCameraParams.fy/pos.z + c_depthCameraParams.my);
}
__device__
static inline int2 cameraToKinectScreenInt(const float3& pos) {
float2 pImage = cameraToKinectScreenFloat(pos);
return make_int2(pImage + make_float2(0.5f, 0.5f));
}
__device__
static inline uint2 cameraToKinectScreen(const float3& pos) {
int2 p = cameraToKinectScreenInt(pos);
return make_uint2(p.x, p.y);
}
__device__
static inline float cameraToKinectProjZ(float z) {
return (z - c_depthCameraParams.m_sensorDepthWorldMin)/(c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin);
}
__device__
static inline float3 cameraToKinectProj(const float3& pos) {
float2 proj = cameraToKinectScreenFloat(pos);
float3 pImage = make_float3(proj.x, proj.y, pos.z);
pImage.x = (2.0f*pImage.x - (c_depthCameraParams.m_imageWidth- 1.0f))/(c_depthCameraParams.m_imageWidth- 1.0f);
//pImage.y = (2.0f*pImage.y - (c_depthCameraParams.m_imageHeight-1.0f))/(c_depthCameraParams.m_imageHeight-1.0f);
pImage.y = ((c_depthCameraParams.m_imageHeight-1.0f) - 2.0f*pImage.y)/(c_depthCameraParams.m_imageHeight-1.0f);
pImage.z = cameraToKinectProjZ(pImage.z);
return pImage;
}
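// Worked example (illustrative intrinsics, not from any config): with
// fx = fy = 500, mx = 320, my = 240, the camera-space point (0.5, 0.25, 2)
// projects to pixel (0.5*500/2 + 320, 0.25*500/2 + 240) = (445.0, 302.5).
// cameraToKinectProj then rescales x into [-1,1], flips and rescales y into
// [-1,1], and maps depth linearly onto [0,1] between the sensor limits.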
///////////////////////////////////////////////////////////////
// Screen to Camera (depth in meters)
///////////////////////////////////////////////////////////////
__device__
static inline float3 kinectDepthToSkeleton(uint ux, uint uy, float depth) {
const float x = ((float)ux-c_depthCameraParams.mx) / c_depthCameraParams.fx;
const float y = ((float)uy-c_depthCameraParams.my) / c_depthCameraParams.fy;
//const float y = (c_depthCameraParams.my-(float)uy) / c_depthCameraParams.fy;
return make_float3(depth*x, depth*y, depth);
}
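// Continuing the example above: pixel (445, 302.5) at depth 2 back-projects
// to ((445-320)/500*2, (302.5-240)/500*2, 2) = (0.5, 0.25, 2.0), i.e. this
// function inverts cameraToKinectScreenFloat exactly.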
///////////////////////////////////////////////////////////////
// RenderScreen to Camera -- ATTENTION: assumes a [0,1] Z range!
///////////////////////////////////////////////////////////////
__device__
static inline float kinectProjToCameraZ(float z) {
return z * (c_depthCameraParams.m_sensorDepthWorldMax - c_depthCameraParams.m_sensorDepthWorldMin) + c_depthCameraParams.m_sensorDepthWorldMin;
}
// z has to be in [0, 1]
__device__
static inline float3 kinectProjToCamera(uint ux, uint uy, float z) {
float fSkeletonZ = kinectProjToCameraZ(z);
return kinectDepthToSkeleton(ux, uy, fSkeletonZ);
}
__device__
static inline bool isInCameraFrustumApprox(const float4x4& viewMatrixInverse, const float3& pos) {
float3 pCamera = viewMatrixInverse * pos;
float3 pProj = cameraToKinectProj(pCamera);
//pProj *= 1.5f; //TODO THIS IS A HACK FIX IT :)
pProj *= 0.95f;
return !(pProj.x < -1.0f || pProj.x > 1.0f || pProj.y < -1.0f || pProj.y > 1.0f || pProj.z < 0.0f || pProj.z > 1.0f);
}
//float* d_depthData; //depth data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor
//float4* d_colorData;
//uchar4* d_colorData; //color data of the current frame (in screen space):: TODO data allocation lives in RGBD Sensor
cv::cuda::GpuMat *depth_mat_;
cv::cuda::GpuMat *colour_mat_;
ftl::cuda::TextureObject<float> *depth_tex_;
ftl::cuda::TextureObject<uchar4> *colour_tex_;
cudaTextureObject_t depth_obj_;
cudaTextureObject_t colour_obj_;
// cuda arrays for texture access
/*cudaArray* d_depthArray;
cudaArray* d_colorArray;
cudaChannelFormatDesc h_depthChannelDesc;
cudaChannelFormatDesc h_colorChannelDesc;*/
};
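// Host-side usage sketch (hypothetical flow; variable names are placeholders
// and error handling is omitted):
//   DepthCameraParams params = ...;   // intrinsics and image size
//   DepthCameraData cam;
//   cam.alloc(params);                // create GpuMats and texture objects
//   cam.updateParams(params);         // push params into constant memory
//   cam.updateData(depth, rgb);       // upload one frame (CV_32FC1 / CV_8UC4)
//   ...                               // integrate / ray cast using cam
//   cam.free();                       // release GPU resources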
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDADepthCameraParams.h
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/camera_params.hpp>
struct __align__(16) DepthCameraParams {
float fx;
float fy;
float mx;
float my;
unsigned short m_imageWidth;
unsigned short m_imageHeight;
unsigned int flags;
float m_sensorDepthWorldMin;
float m_sensorDepthWorldMax;
};
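// Example values (illustrative only) for a typical 640x480 depth sensor:
//   DepthCameraParams p;
//   p.fx = 525.0f; p.fy = 525.0f;           // focal lengths in pixels
//   p.mx = 319.5f; p.my = 239.5f;           // principal point
//   p.m_imageWidth = 640; p.m_imageHeight = 480;
//   p.flags = 0;
//   p.m_sensorDepthWorldMin = 0.1f;         // metres
//   p.m_sensorDepthWorldMax = 8.0f;
//   updateConstantDepthCameraParams(p);     // declared in ftl/depth_camera.hpp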
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/MatrixConversion.h
#pragma once
#include <eigen3/Eigen/Eigen>
// #include "d3dx9math.h"
#include <ftl/cuda_matrix_util.hpp>
namespace MatrixConversion
{
//static D3DXMATRIX EigToMat(const Eigen::Matrix4f& mat)
//{
// D3DXMATRIX m(mat.data());
// D3DXMATRIX res; D3DXMatrixTranspose(&res, &m);
// return res;
//}
//static Eigen::Matrix4f MatToEig(const D3DXMATRIX& mat)
//{
// return Eigen::Matrix4f((float*)mat.m).transpose();
//}
/*static mat4f EigToMat(const Eigen::Matrix4f& mat)
{
return mat4f(mat.data()).getTranspose();
}
static Eigen::Matrix4f MatToEig(const mat4f& mat)
{
return Eigen::Matrix4f(mat.ptr()).transpose();
}*/
static Eigen::Vector4f VecH(const Eigen::Vector3f& v)
{
return Eigen::Vector4f(v[0], v[1], v[2], 1.0);
}
static Eigen::Vector3f VecDH(const Eigen::Vector4f& v)
{
return Eigen::Vector3f(v[0]/v[3], v[1]/v[3], v[2]/v[3]);
}
/*static Eigen::Vector3f VecToEig(const vec3f& v)
{
return Eigen::Vector3f(v[0], v[1], v[2]);
}
static vec3f EigToVec(const Eigen::Vector3f& v)
{
return vec3f(v[0], v[1], v[2]);
}
static vec3f EigToVec(const D3DXMATRIX v)
{
return vec3f(v[0], v[1], v[2]);
}*/
// dx/cuda conversion
/*static vec3f toMlib(const D3DXVECTOR3& v) {
return vec3f(v.x, v.y, v.z);
}
static vec4f toMlib(const D3DXVECTOR4& v) {
return vec4f(v.x, v.y, v.z, v.w);
}
static mat4f toMlib(const D3DXMATRIX& m) {
mat4f c((const float*)&m);
return c.getTranspose();
}
static D3DXVECTOR3 toDX(const vec3f& v) {
return D3DXVECTOR3(v.x, v.y, v.z);
}
static D3DXVECTOR4 toDX(const vec4f& v) {
return D3DXVECTOR4(v.x, v.y, v.z, v.w);
}
static D3DXMATRIX toDX(const mat4f& m) {
D3DXMATRIX c((const float*)m.ptr());
D3DXMatrixTranspose(&c, &c);
return c;
}*/
/*static mat4f toMlib(const float4x4& m) {
return mat4f(m.ptr());
}
static vec4f toMlib(const float4& v) {
return vec4f(v.x, v.y, v.z, v.w);
}
static vec3f toMlib(const float3& v) {
return vec3f(v.x, v.y, v.z);
}
static vec4i toMlib(const int4& v) {
return vec4i(v.x, v.y, v.z, v.w);
}
static vec3i toMlib(const int3& v) {
return vec3i(v.x, v.y, v.z);
}
static float4x4 toCUDA(const mat4f& m) {
return float4x4(m.ptr());
}*/
static float4x4 toCUDA(const Eigen::Matrix4f& mat) {
return float4x4(mat.data()).getTranspose();
}
static Eigen::Matrix4f toEigen(const float4x4& m) {
return Eigen::Matrix4f(m.ptr()).transpose();
}
/*static float4 toCUDA(const vec4f& v) {
return make_float4(v.x, v.y, v.z, v.w);
}
static float3 toCUDA(const vec3f& v) {
return make_float3(v.x, v.y, v.z);
}
static int4 toCUDA(const vec4i& v) {
return make_int4(v.x, v.y, v.z, v.w);
}
static int3 toCUDA(const vec3i& v) {
return make_int3(v.x, v.y, v.z);
}*/
// doesn't really work
//template<class FloatType>
//point3d<FloatType> eulerAngles(const Matrix3x3<FloatType>& r) {
// point3d<FloatType> res;
// //check for gimbal lock
// if (r(0,2) == (FloatType)-1.0) {
// FloatType x = 0; //gimbal lock, value of x doesn't matter
// FloatType y = (FloatType)M_PI / 2;
// FloatType z = x + atan2(r(1,0), r(2,0));
// res = point3d<FloatType>(x, y, z);
// } else if (r(0,2) == (FloatType)1.0) {
// FloatType x = 0;
// FloatType y = -(FloatType)M_PI / 2;
// FloatType z = -x + atan2(-r(1,0), -r(2,0));
// res = point3d<FloatType>(x, y, z);
// } else { //two solutions exist
// FloatType x1 = -asin(r(0,2));
// FloatType x2 = (FloatType)M_PI - x1;
// FloatType y1 = atan2(r(1,2) / cos(x1), r(2,2) / cos(x1));
// FloatType y2 = atan2(r(1,2) / cos(x2), r(2,2) / cos(x2));
// FloatType z2 = atan2(r(0,1) / cos(x2), r(0,0) / cos(x2));
// FloatType z1 = atan2(r(0,1) / cos(x1), r(0,0) / cos(x1));
// //choose one solution to return
// //for example the "shortest" rotation
// if ((std::abs(x1) + std::abs(y1) + std::abs(z1)) <= (std::abs(x2) + std::abs(y2) + std::abs(z2))) {
// res = point3d<FloatType>(x1, y1, z1);
// } else {
// res = point3d<FloatType>(x2, y2, z2);
// }
// }
// res.x = math::radiansToDegrees(-res.x);
// res.y = math::radiansToDegrees(-res.y);
// res.z = math::radiansToDegrees(-res.z);
// return res;
//}
}
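// Usage sketch: Eigen matrices are column-major while float4x4 here is
// row-major, hence the transpose in both conversions; a round trip should
// reproduce the input pose.
//   Eigen::Matrix4f pose = Eigen::Matrix4f::Identity();
//   float4x4 cuPose = MatrixConversion::toCUDA(pose);
//   Eigen::Matrix4f back = MatrixConversion::toEigen(cuPose); // back == pose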
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
struct __align__(16) RayCastParams {
float4x4 m_viewMatrix;
float4x4 m_viewMatrixInverse;
float4x4 m_intrinsics;
float4x4 m_intrinsicsInverse;
unsigned int m_width;
unsigned int m_height;
unsigned int m_numOccupiedSDFBlocks;
unsigned int m_maxNumVertices;
int m_splatMinimum;
float m_minDepth;
float m_maxDepth;
float m_rayIncrement;
float m_thresSampleDist;
float m_thresDist;
bool m_useGradients;
uint dummy0;
};
#pragma once
#include <ftl/configurable.hpp>
#include <ftl/matrix_conversion.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/ray_cast_util.hpp>
#include <nlohmann/json.hpp>
// #include "DX11RayIntervalSplatting.h"
class CUDARayCastSDF : public ftl::Configurable
{
public:
CUDARayCastSDF(nlohmann::json& config) : ftl::Configurable(config) {
create(parametersFromConfig(config));
hash_render_ = config.value("hash_renderer", false);
}
~CUDARayCastSDF(void) {
destroy();
}
static RayCastParams parametersFromConfig(const nlohmann::json& gas) {
RayCastParams params;
params.m_width = gas["adapterWidth"].get<unsigned int>();
params.m_height = gas["adapterHeight"].get<unsigned int>();
params.m_intrinsics = MatrixConversion::toCUDA(Eigen::Matrix4f());
params.m_intrinsicsInverse = MatrixConversion::toCUDA(Eigen::Matrix4f());
params.m_minDepth = gas["sensorDepthMin"].get<float>();
params.m_maxDepth = gas["sensorDepthMax"].get<float>();
params.m_rayIncrement = gas["SDFRayIncrementFactor"].get<float>() * gas["SDFTruncation"].get<float>();
params.m_thresSampleDist = gas["SDFRayThresSampleDistFactor"].get<float>() * params.m_rayIncrement;
params.m_thresDist = gas["SDFRayThresDistFactor"].get<float>() * params.m_rayIncrement;
params.m_useGradients = gas["SDFUseGradients"].get<bool>();
params.m_maxNumVertices = gas["hashNumSDFBlocks"].get<unsigned int>() * 6;
return params;
}
void render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform);
const RayCastData& getRayCastData(void) {
return m_data;
}
const RayCastParams& getRayCastParams() const {
return m_params;
}
// debugging
void convertToCameraSpace(const DepthCameraData& cameraData);
private:
void create(const RayCastParams& params);
void destroy(void);
void rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform); // rasterize
RayCastParams m_params;
RayCastData m_data;
bool hash_render_;
// DX11RayIntervalSplatting m_rayIntervalSplatting;
};
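// Usage sketch (hypothetical; `config` must supply the JSON keys read by
// parametersFromConfig above, and `scene`, `cam` and `pose` are placeholders):
//   CUDARayCastSDF raycaster(config);
//   raycaster.render(scene.getHashData(), scene.getHashParams(), cam, pose);
//   const RayCastData &out = raycaster.getRayCastData(); // depth/normals/colours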
#pragma once
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/ray_cast_params.hpp>
struct RayCastSample
{
float sdf;
float alpha;
uint weight;
//uint3 color;
};
#ifndef MINF
#define MINF __int_as_float(0xff800000)
#endif
extern __constant__ RayCastParams c_rayCastParams;
extern "C" void updateConstantRayCastParams(const RayCastParams& params);
struct RayCastData {
///////////////
// Host part //
///////////////
__device__ __host__
RayCastData() {
d_depth = NULL;
d_depth3 = NULL;
d_normals = NULL;
d_colors = NULL;
//d_vertexBuffer = NULL;
point_cloud_ = nullptr;
d_rayIntervalSplatMinArray = NULL;
d_rayIntervalSplatMaxArray = NULL;
}
#ifndef __CUDACC__
__host__
void allocate(const RayCastParams& params) {
//point_cloud_ = new cv::cuda::GpuMat(params.m_width*params.m_height, 1, CV_32FC3);
cudaSafeCall(cudaMalloc(&d_depth, sizeof(float) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_depth3, sizeof(float3) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_normals, sizeof(float4) * params.m_width * params.m_height));
cudaSafeCall(cudaMalloc(&d_colors, sizeof(uchar3) * params.m_width * params.m_height));
//cudaSafeCall(cudaMalloc(&point_cloud_, sizeof(float3) * params.m_width * params.m_height));
//printf("Allocate ray cast data: %lld \n", (unsigned long long)point_cloud_);
}
__host__
void updateParams(const RayCastParams& params) {
updateConstantRayCastParams(params);
}
__host__ void download(float3 *points, uchar3 *colours, const RayCastParams& params) const {
//printf("Download: %d,%d\n", params.m_width, params.m_height);
//cudaSafeCall(cudaMemcpy(points, d_depth3, sizeof(float3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(colours, d_colors, sizeof(uchar3) * params.m_width * params.m_height, cudaMemcpyDeviceToHost));
}
__host__
void free() {
//cudaFree(point_cloud_);
cudaFree(d_depth);
cudaFree(d_depth3);
cudaFree(d_normals);
cudaFree(d_colors);
}
#endif
/////////////////
// Device part //
/////////////////
#ifdef __CUDACC__
__device__
const RayCastParams& params() const {
return c_rayCastParams;
}
__device__
float frac(float val) const {
return (val - floorf(val));
}
__device__
float3 frac(const float3& val) const {
return make_float3(frac(val.x), frac(val.y), frac(val.z));
}
__device__
bool trilinearInterpolationSimpleFastFast(const ftl::voxhash::HashData& hash, const float3& pos, float& dist, uchar3& color) const {
const float oSet = c_hashParams.m_virtualVoxelSize;
const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f);
float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos));
dist = 0.0f;
float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
ftl::voxhash::Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; float3 vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*vColor;
v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *vColor;
v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; vColor = make_float3(v.color.x, v.color.y, v.color.z); dist+= weight.x * weight.y * weight.z *v.sdf; colorFloat+= weight.x * weight.y * weight.z *vColor;
color = make_uchar3(colorFloat.x, colorFloat.y, colorFloat.z);//v.color;
return true;
}
//__device__
//bool trilinearInterpolationSimpleFastFast(const HashData& hash, const float3& pos, float& dist, uchar3& color) const {
// const float oSet = c_hashParams.m_virtualVoxelSize;
// const float3 posDual = pos-make_float3(oSet/2.0f, oSet/2.0f, oSet/2.0f);
// float3 weight = frac(hash.worldToVirtualVoxelPosFloat(pos));
// dist = 0.0f;
// Voxel v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, 0.0f)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, oSet, 0.0f)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, 0.0f, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, oSet, 0.0f)); if(v.weight == 0) return false; dist+= weight.x * weight.y *(1.0f-weight.z)*v.sdf;
// v = hash.getVoxel(posDual+make_float3(0.0f, oSet, oSet)); if(v.weight == 0) return false; dist+= (1.0f-weight.x)* weight.y * weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, 0.0f, oSet)); if(v.weight == 0) return false; dist+= weight.x *(1.0f-weight.y)* weight.z *v.sdf;
// v = hash.getVoxel(posDual+make_float3(oSet, oSet, oSet)); if(v.weight == 0) return false; dist+= weight.x * weight.y * weight.z *v.sdf;
// color = v.color;
// return true;
//}
__device__
float findIntersectionLinear(float tNear, float tFar, float dNear, float dFar) const
{
return tNear+(dNear/(dNear-dFar))*(tFar-tNear);
}
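// Derivation note: the SDF is assumed linear between the two samples, so the
// zero crossing along the ray satisfies
//   d(t) = dNear + (t - tNear) / (tFar - tNear) * (dFar - dNear) = 0,
// which rearranges to the expression above. For example tNear=1.0, tFar=1.1,
// dNear=0.02, dFar=-0.02 gives t = 1.0 + 0.5 * 0.1 = 1.05.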
static const unsigned int nIterationsBisection = 3;
// d0 near, d1 far
__device__
bool findIntersectionBisection(const ftl::voxhash::HashData& hash, const float3& worldCamPos, const float3& worldDir, float d0, float r0, float d1, float r1, float& alpha, uchar3& color) const
{
float a = r0; float aDist = d0;
float b = r1; float bDist = d1;
float c = 0.0f;
#pragma unroll 1
for(uint i = 0; i<nIterationsBisection; i++)
{
c = findIntersectionLinear(a, b, aDist, bDist);
float cDist;
if(!trilinearInterpolationSimpleFastFast(hash, worldCamPos+c*worldDir, cDist, color)) return false;
if(aDist*cDist > 0.0) { a = c; aDist = cDist; }
else { b = c; bDist = cDist; }
}
alpha = c;
return true;
}
__device__
float3 gradientForPoint(const ftl::voxhash::HashData& hash, const float3& pos) const
{
const float voxelSize = c_hashParams.m_virtualVoxelSize;
float3 offset = make_float3(voxelSize, voxelSize, voxelSize);
float distp00; uchar3 colorp00; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.5f*offset.x, 0.0f, 0.0f), distp00, colorp00);
float dist0p0; uchar3 color0p0; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.5f*offset.y, 0.0f), dist0p0, color0p0);
float dist00p; uchar3 color00p; trilinearInterpolationSimpleFastFast(hash, pos-make_float3(0.0f, 0.0f, 0.5f*offset.z), dist00p, color00p);
float dist100; uchar3 color100; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.5f*offset.x, 0.0f, 0.0f), dist100, color100);
float dist010; uchar3 color010; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.5f*offset.y, 0.0f), dist010, color010);
float dist001; uchar3 color001; trilinearInterpolationSimpleFastFast(hash, pos+make_float3(0.0f, 0.0f, 0.5f*offset.z), dist001, color001);
float3 grad = make_float3((distp00-dist100)/offset.x, (dist0p0-dist010)/offset.y, (dist00p-dist001)/offset.z);
float l = length(grad);
if(l == 0.0f) {
return make_float3(0.0f, 0.0f, 0.0f);
}
return -grad/l;
}
__device__
void traverseCoarseGridSimpleSampleAll(const ftl::voxhash::HashData& hash, const DepthCameraData& cameraData, const float3& worldCamPos, const float3& worldDir, const float3& camDir, const int3& dTid, float minInterval, float maxInterval) const
{
const RayCastParams& rayCastParams = c_rayCastParams;
// Last Sample
RayCastSample lastSample; lastSample.sdf = 0.0f; lastSample.alpha = 0.0f; lastSample.weight = 0; // lastSample.color = int3(0, 0, 0);
const float depthToRayLength = 1.0f/camDir.z; // scale factor to convert from depth to ray length
float rayCurrent = depthToRayLength * max(rayCastParams.m_minDepth, minInterval); // Convert depth to raylength
float rayEnd = depthToRayLength * min(rayCastParams.m_maxDepth, maxInterval); // Convert depth to raylength
//float rayCurrent = depthToRayLength * rayCastParams.m_minDepth; // Convert depth to raylength
//float rayEnd = depthToRayLength * rayCastParams.m_maxDepth; // Convert depth to raylength
//printf("MINMAX: %f,%f\n", minInterval, maxInterval);
#pragma unroll 1
while(rayCurrent < rayEnd)
{
float3 currentPosWorld = worldCamPos+rayCurrent*worldDir;
float dist; uchar3 color;
//printf("RAY LOOP: %f,%f,%f\n", currentPosWorld.x, currentPosWorld.y, currentPosWorld.z);
if(trilinearInterpolationSimpleFastFast(hash, currentPosWorld, dist, color))
{
if(lastSample.weight > 0 && lastSample.sdf > 0.0f && dist < 0.0f) // current sample is always valid here
{
float alpha; // = findIntersectionLinear(lastSample.alpha, rayCurrent, lastSample.sdf, dist);
uchar3 color2;
bool b = findIntersectionBisection(hash, worldCamPos, worldDir, lastSample.sdf, lastSample.alpha, dist, rayCurrent, alpha, color2);
float3 currentIso = worldCamPos+alpha*worldDir;
if(b && abs(lastSample.sdf - dist) < rayCastParams.m_thresSampleDist)
{
if(abs(dist) < rayCastParams.m_thresDist)
{
float depth = alpha / depthToRayLength; // Convert ray length back to depth
d_depth[dTid.y*rayCastParams.m_width+dTid.x] = depth;
d_depth3[dTid.y*rayCastParams.m_width+dTid.x] = cameraData.kinectDepthToSkeleton(dTid.x, dTid.y, depth);
d_colors[dTid.y*rayCastParams.m_width+dTid.x] = make_uchar3(color2.x, color2.y, color2.z);
if(rayCastParams.m_useGradients)
{
float3 normal = -gradientForPoint(hash, currentIso);
float4 n = rayCastParams.m_viewMatrix * make_float4(normal, 0.0f);
d_normals[dTid.y*rayCastParams.m_width+dTid.x] = make_float4(n.x, n.y, n.z, 1.0f);
}
return;
}
}
}
lastSample.sdf = dist;
lastSample.alpha = rayCurrent;
// lastSample.color = color;
lastSample.weight = 1;
rayCurrent += rayCastParams.m_rayIncrement;
} else {
lastSample.weight = 0;
rayCurrent += rayCastParams.m_rayIncrement;
}
}
}
#endif // __CUDACC__
union {
float* d_depth;
int* d_depth_i;
};
float3* d_depth3;
float4* d_normals;
uchar3* d_colors;
//float4* d_vertexBuffer; // ray interval splatting triangles, mapped from directx (memory lives there)
float3 *point_cloud_;
cudaArray* d_rayIntervalSplatMinArray;
cudaArray* d_rayIntervalSplatMaxArray;
};
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDASceneRepHashSDF.h
#pragma once
#include <cuda_runtime.h>
#include <ftl/configurable.hpp>
#include <ftl/matrix_conversion.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/depth_camera.hpp>
#include <unordered_set>
//#include "CUDAScan.h"
// #include "CUDATimer.h"
// #include "GlobalAppState.h"
// #include "TimingLog.h"
#define SAFE_DELETE_ARRAY(a) do { delete [] (a); (a) = NULL; } while (0)
extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask);
extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData);
extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams);
extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData);
extern "C" void starveVoxelsKernelCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void garbageCollectIdentifyCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void garbageCollectFreeCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
namespace ftl {
namespace voxhash {
class SceneRep : public ftl::Configurable {
public:
SceneRep(nlohmann::json &config) : Configurable(config) {
REQUIRED({
{"hashNumBuckets", "Desired hash entries divide bucket size", "number"},
{"hashMaxCollisionLinkedListSize", "", "number"},
{"hashNumSDFBlocks", "", "number"},
{"SDFVoxelSize", "Size in meters of one voxel", "number"},
{"SDFMaxIntegrationDistance", "", "number"},
{"SDFTruncation", "Base error size", "number"},
{"SDFTruncationScale", "Error size scale with depth", "number"},
{"SDFIntegrationWeightSample", "", "number"},
{"SDFIntegrationWeightMax", "", "number"}
});
create(parametersFromConfig(config));
}
~SceneRep() {
destroy();
}
static HashParams parametersFromConfig(nlohmann::json &config) {
HashParams params;
// First camera view is set to identity pose to be at the centre of
// the virtual coordinate space.
params.m_rigidTransform.setIdentity();
params.m_rigidTransformInverse.setIdentity();
params.m_hashNumBuckets = config["hashNumBuckets"];
params.m_hashBucketSize = HASH_BUCKET_SIZE;
params.m_hashMaxCollisionLinkedListSize = config["hashMaxCollisionLinkedListSize"];
params.m_SDFBlockSize = SDF_BLOCK_SIZE;
params.m_numSDFBlocks = config["hashNumSDFBlocks"];
params.m_virtualVoxelSize = config["SDFVoxelSize"];
params.m_maxIntegrationDistance = config["SDFMaxIntegrationDistance"];
params.m_truncation = config["SDFTruncation"];
params.m_truncScale = config["SDFTruncationScale"];
params.m_integrationWeightSample = config["SDFIntegrationWeightSample"];
params.m_integrationWeightMax = config["SDFIntegrationWeightMax"];
// Note (Nick): We are not streaming voxels in/out of GPU
//params.m_streamingVoxelExtents = MatrixConversion::toCUDA(gas.s_streamingVoxelExtents);
//params.m_streamingGridDimensions = MatrixConversion::toCUDA(gas.s_streamingGridDimensions);
//params.m_streamingMinGridPos = MatrixConversion::toCUDA(gas.s_streamingMinGridPos);
//params.m_streamingInitialChunkListSize = gas.s_streamingInitialChunkListSize;
return params;
}
void bindDepthCameraTextures(const DepthCameraData& depthCameraData) {
//bindInputDepthColorTextures(depthCameraData);
}
/**
* Note: lastRigidTransform appears to be the estimated camera pose.
* Note: bitMask can be nullptr if not streaming out voxels from GPU
*/
void integrate(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, unsigned int* d_bitMask) {
setLastRigidTransform(lastRigidTransform);
//make the rigid transform available on the GPU
m_hashData.updateParams(m_hashParams);
//allocate all hash blocks which are corresponding to depth map entries
alloc(depthCameraData, depthCameraParams, d_bitMask);
//generate a linear hash array with only occupied entries
compactifyHashEntries(depthCameraData);
//volumetrically integrate the depth data into the depth SDFBlocks
integrateDepthMap(depthCameraData, depthCameraParams);
garbageCollect(depthCameraData);
m_numIntegratedFrames++;
}
void setLastRigidTransform(const Eigen::Matrix4f& lastRigidTransform) {
m_hashParams.m_rigidTransform = MatrixConversion::toCUDA(lastRigidTransform);
m_hashParams.m_rigidTransformInverse = m_hashParams.m_rigidTransform.getInverse();
}
void setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) {
setLastRigidTransform(lastRigidTransform);
compactifyHashEntries(depthCameraData);
}
const Eigen::Matrix4f getLastRigidTransform() const {
return MatrixConversion::toEigen(m_hashParams.m_rigidTransform);
}
/* Nick: To reduce weights between frames */
void nextFrame() {
starveVoxelsKernelCUDA(m_hashData, m_hashParams);
m_numIntegratedFrames = 0;
}
//! resets the hash to the initial state (i.e., clears all data)
void reset() {
m_numIntegratedFrames = 0;
m_hashParams.m_rigidTransform.setIdentity();
m_hashParams.m_rigidTransformInverse.setIdentity();
m_hashParams.m_numOccupiedBlocks = 0;
m_hashData.updateParams(m_hashParams);
resetCUDA(m_hashData, m_hashParams);
}
ftl::voxhash::HashData& getHashData() {
return m_hashData;
}
const HashParams& getHashParams() const {
return m_hashParams;
}
//! debug only!
unsigned int getHeapFreeCount() {
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count+1; //the heap counter indexes the last free entry, so there is one more free block than the value suggests (0 would also be a valid address)
}
//! debug only!
void debugHash() {
HashEntry* hashCPU = new HashEntry[m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets];
unsigned int* heapCPU = new unsigned int[m_hashParams.m_numSDFBlocks];
unsigned int heapCounterCPU;
cudaSafeCall(cudaMemcpy(&heapCounterCPU, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
heapCounterCPU++; //points to the first free entry: number of blocks is one more
cudaSafeCall(cudaMemcpy(heapCPU, m_hashData.d_heap, sizeof(unsigned int)*m_hashParams.m_numSDFBlocks, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashCPU, m_hashData.d_hash, sizeof(HashEntry)*m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets, cudaMemcpyDeviceToHost));
//Check for duplicates
class myint3Voxel {
public:
myint3Voxel() {}
~myint3Voxel() {}
bool operator<(const myint3Voxel& other) const {
if (x == other.x) {
if (y == other.y) {
return z < other.z;
}
return y < other.y;
}
return x < other.x;
}
bool operator==(const myint3Voxel& other) const {
return x == other.x && y == other.y && z == other.z;
}
int x,y,z, i;
int offset;
int ptr;
};
std::unordered_set<unsigned int> pointersFreeHash;
std::vector<unsigned int> pointersFreeVec(m_hashParams.m_numSDFBlocks, 0);
for (unsigned int i = 0; i < heapCounterCPU; i++) {
pointersFreeHash.insert(heapCPU[i]);
pointersFreeVec[heapCPU[i]] = FREE_ENTRY;
}
if (pointersFreeHash.size() != heapCounterCPU) {
throw std::runtime_error("ERROR: duplicate free pointers in heap array");
}
unsigned int numOccupied = 0;
unsigned int numMinusOne = 0;
unsigned int listOverallFound = 0;
std::list<myint3Voxel> l;
//std::vector<myint3Voxel> v;
for (unsigned int i = 0; i < m_hashParams.m_hashBucketSize*m_hashParams.m_hashNumBuckets; i++) {
if (hashCPU[i].ptr == -1) {
numMinusOne++;
}
if (hashCPU[i].ptr != -2) {
numOccupied++; // != FREE_ENTRY
myint3Voxel a;
a.x = hashCPU[i].pos.x;
a.y = hashCPU[i].pos.y;
a.z = hashCPU[i].pos.z;
l.push_back(a);
//v.push_back(a);
unsigned int linearBlockSize = m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize;
if (pointersFreeHash.find(hashCPU[i].ptr / linearBlockSize) != pointersFreeHash.end()) {
throw std::runtime_error("ERROR: ptr is on free heap, but also marked as an allocated entry");
}
pointersFreeVec[hashCPU[i].ptr / linearBlockSize] = LOCK_ENTRY;
}
}
unsigned int numHeapFree = 0;
unsigned int numHeapOccupied = 0;
for (unsigned int i = 0; i < m_hashParams.m_numSDFBlocks; i++) {
if (pointersFreeVec[i] == FREE_ENTRY) numHeapFree++;
else if (pointersFreeVec[i] == LOCK_ENTRY) numHeapOccupied++;
else {
throw std::runtime_error("memory leak detected: neither free nor allocated");
}
}
if (numHeapFree + numHeapOccupied == m_hashParams.m_numSDFBlocks) std::cout << "HEAP OK!" << std::endl;
else throw std::runtime_error("HEAP CORRUPTED");
l.sort();
size_t sizeBefore = l.size();
l.unique();
size_t sizeAfter = l.size();
std::cout << "diff: " << sizeBefore - sizeAfter << std::endl;
std::cout << "minOne: " << numMinusOne << std::endl;
std::cout << "numOccupied: " << numOccupied << "\t numFree: " << getHeapFreeCount() << std::endl;
std::cout << "numOccupied + free: " << numOccupied + getHeapFreeCount() << std::endl;
std::cout << "numInFrustum: " << m_hashParams.m_numOccupiedBlocks << std::endl;
SAFE_DELETE_ARRAY(heapCPU);
SAFE_DELETE_ARRAY(hashCPU);
//getchar();
}
private:
void create(const HashParams& params) {
m_hashParams = params;
m_hashData.allocate(m_hashParams);
reset();
}
void destroy() {
m_hashData.free();
}
void alloc(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, const unsigned int* d_bitMask) {
//Start Timing
//if (GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
// NOTE (nick): We might want this later...
if (true) {
//allocate until all blocks are allocated
unsigned int prevFree = getHeapFreeCount();
while (1) {
resetHashBucketMutexCUDA(m_hashData, m_hashParams);
allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask);
unsigned int currFree = getHeapFreeCount();
if (prevFree != currFree) {
prevFree = currFree;
}
else {
break;
}
}
}
else {
//this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame)
resetHashBucketMutexCUDA(m_hashData, m_hashParams);
allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, d_bitMask);
}
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeAlloc += m_timer.getElapsedTimeMS(); TimingLog::countTimeAlloc++; }
}
void compactifyHashEntries(const DepthCameraData& depthCameraData) {
//Start Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
//CUDATimer t;
//t.startEvent("fillDecisionArray");
//fillDecisionArrayCUDA(m_hashData, m_hashParams, depthCameraData);
//t.endEvent();
//t.startEvent("prefixSum");
//m_hashParams.m_numOccupiedBlocks =
// m_cudaScan.prefixSum(
// m_hashParams.m_hashNumBuckets*m_hashParams.m_hashBucketSize,
// m_hashData.d_hashDecision,
// m_hashData.d_hashDecisionPrefix);
//t.endEvent();
//t.startEvent("compactifyHash");
//m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
//compactifyHashCUDA(m_hashData, m_hashParams);
//t.endEvent();
//t.startEvent("compactifyAllInOne");
m_hashParams.m_numOccupiedBlocks = compactifyHashAllInOneCUDA(m_hashData, m_hashParams); //this version uses atomics instead of prefix sums, which performs much better
std::cout << "Occ blocks = " << m_hashParams.m_numOccupiedBlocks << std::endl;
m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
//t.endEvent();
//t.evaluate();
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeCompactifyHash += m_timer.getElapsedTimeMS(); TimingLog::countTimeCompactifyHash++; }
//std::cout << "numOccupiedBlocks: " << m_hashParams.m_numOccupiedBlocks << std::endl;
}
void integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) {
//Start Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.start(); }
integrateDepthMapCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams);
// Stop Timing
//if(GlobalAppState::get().s_timingsDetailledEnabled) { cutilSafeCall(cudaDeviceSynchronize()); m_timer.stop(); TimingLog::totalTimeIntegrate += m_timer.getElapsedTimeMS(); TimingLog::countTimeIntegrate++; }
}
void garbageCollect(const DepthCameraData& depthCameraData) {
//only perform if enabled by global app state
//if (GlobalAppState::get().s_garbageCollectionEnabled) {
garbageCollectIdentifyCUDA(m_hashData, m_hashParams);
resetHashBucketMutexCUDA(m_hashData, m_hashParams); //needed if linked lists are enabled -> for memory deletion
garbageCollectFreeCUDA(m_hashData, m_hashParams);
//}
}
HashParams m_hashParams;
ftl::voxhash::HashData m_hashData;
//CUDAScan m_cudaScan;
unsigned int m_numIntegratedFrames; //used for garbage collect
// static Timer m_timer;
};
}; // namespace voxhash
}; // namespace ftl
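// Per-frame usage sketch (hypothetical flow; `config`, `cam`, `camParams`,
// `depth`, `rgb`, `pose` and nextFrameAvailable() are placeholders):
//   ftl::voxhash::SceneRep scene(config);               // config supplies the REQUIRED keys
//   while (nextFrameAvailable()) {
//       cam.updateData(depth, rgb);                     // upload new depth + colour
//       scene.integrate(pose, cam, camParams, nullptr); // fuse into the voxel hash
//       scene.nextFrame();                              // decay voxel weights
//   }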
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/VoxelUtilHashSDF.h
#pragma once
#ifndef sint
typedef signed int sint;
#endif
#ifndef uint
typedef unsigned int uint;
#endif
#ifndef slong
typedef signed long slong;
#endif
#ifndef ulong
typedef unsigned long ulong;
#endif
#ifndef uchar
typedef unsigned char uchar;
#endif
#ifndef schar
typedef signed char schar;
#endif
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/voxel_hash_params.hpp>
#include <ftl/depth_camera.hpp>
#define HANDLE_COLLISIONS
#define SDF_BLOCK_SIZE 8
#define HASH_BUCKET_SIZE 10
#ifndef MINF
#define MINF __int_as_float(0xff800000)
#endif
#ifndef PINF
#define PINF __int_as_float(0x7f800000)
#endif
extern __constant__ ftl::voxhash::HashParams c_hashParams;
extern "C" void updateConstantHashParams(const ftl::voxhash::HashParams& hashParams);
namespace ftl {
namespace voxhash {
//status flags for hash entries
static const int LOCK_ENTRY = -1;
static const int FREE_ENTRY = -2;
static const int NO_OFFSET = 0;
struct __align__(16) HashEntry
{
int3 pos; //hash position (lower left corner of SDFBlock)
int ptr; //pointer into heap to SDFBlock
uint offset; //offset for collisions
uint flags; //for interframe checks (Nick)
__device__ void operator=(const struct HashEntry& e) {
((long long*)this)[0] = ((const long long*)&e)[0];
((long long*)this)[1] = ((const long long*)&e)[1];
//((int*)this)[4] = ((const int*)&e)[4];
((long long*)this)[2] = ((const long long*)&e)[2];
}
};
struct __align__(8) Voxel {
float sdf; //signed distance function
uchar3 color; //color
uchar weight; //accumulated sdf weight
__device__ void operator=(const struct Voxel& v) {
((long long*)this)[0] = ((const long long*)&v)[0];
}
};
/**
* Voxel Hash Table structure and operations. Works on both CPU and GPU with
* host <-> device transfer included.
*/
struct HashData {
///////////////
// Host part //
///////////////
__device__ __host__
HashData() {
d_heap = NULL;
d_heapCounter = NULL;
d_hash = NULL;
d_hashDecision = NULL;
d_hashDecisionPrefix = NULL;
d_hashCompactified = NULL;
d_hashCompactifiedCounter = NULL;
d_SDFBlocks = NULL;
d_hashBucketMutex = NULL;
m_bIsOnGPU = false;
}
__host__
void allocate(const HashParams& params, bool dataOnGPU = true) {
m_bIsOnGPU = dataOnGPU;
if (m_bIsOnGPU) {
cudaSafeCall(cudaMalloc(&d_heap, sizeof(unsigned int) * params.m_numSDFBlocks));
cudaSafeCall(cudaMalloc(&d_heapCounter, sizeof(unsigned int)));
cudaSafeCall(cudaMalloc(&d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashDecision, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashDecisionPrefix, sizeof(int)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize));
cudaSafeCall(cudaMalloc(&d_hashCompactifiedCounter, sizeof(int)));
cudaSafeCall(cudaMalloc(&d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize));
cudaSafeCall(cudaMalloc(&d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets));
} else {
d_heap = new unsigned int[params.m_numSDFBlocks];
d_heapCounter = new unsigned int[1];
d_hash = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashDecision = new int[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashDecisionPrefix = new int[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashCompactified = new HashEntry[params.m_hashNumBuckets * params.m_hashBucketSize];
d_hashCompactifiedCounter = new int[1];
d_SDFBlocks = new Voxel[params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize];
d_hashBucketMutex = new int[params.m_hashNumBuckets];
}
updateParams(params);
}
__host__
void updateParams(const HashParams& params) {
if (m_bIsOnGPU) {
updateConstantHashParams(params);
}
}
__host__
void free() {
if (m_bIsOnGPU) {
cudaSafeCall(cudaFree(d_heap));
cudaSafeCall(cudaFree(d_heapCounter));
cudaSafeCall(cudaFree(d_hash));
cudaSafeCall(cudaFree(d_hashDecision));
cudaSafeCall(cudaFree(d_hashDecisionPrefix));
cudaSafeCall(cudaFree(d_hashCompactified));
cudaSafeCall(cudaFree(d_hashCompactifiedCounter));
cudaSafeCall(cudaFree(d_SDFBlocks));
cudaSafeCall(cudaFree(d_hashBucketMutex));
} else {
if (d_heap) delete[] d_heap;
if (d_heapCounter) delete[] d_heapCounter;
if (d_hash) delete[] d_hash;
if (d_hashDecision) delete[] d_hashDecision;
if (d_hashDecisionPrefix) delete[] d_hashDecisionPrefix;
if (d_hashCompactified) delete[] d_hashCompactified;
if (d_hashCompactifiedCounter) delete[] d_hashCompactifiedCounter;
if (d_SDFBlocks) delete[] d_SDFBlocks;
if (d_hashBucketMutex) delete[] d_hashBucketMutex;
}
d_hash = NULL;
d_heap = NULL;
d_heapCounter = NULL;
d_hashDecision = NULL;
d_hashDecisionPrefix = NULL;
d_hashCompactified = NULL;
d_hashCompactifiedCounter = NULL;
d_SDFBlocks = NULL;
d_hashBucketMutex = NULL;
}
__host__
HashData copyToCPU() const {
HashParams params;
HashData hashData;
hashData.allocate(params, false); //allocate the data on the CPU
cudaSafeCall(cudaMemcpy(hashData.d_heap, d_heap, sizeof(unsigned int) * params.m_numSDFBlocks, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_heapCounter, d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hash, d_hash, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashDecision, d_hashDecision, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashDecisionPrefix, d_hashDecisionPrefix, sizeof(int)*params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets * params.m_hashBucketSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashCompactifiedCounter, d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_SDFBlocks, d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize, cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(hashData.d_hashBucketMutex, d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost));
return hashData; //TODO MATTHIAS look at this (i.e., when does memory get destroyed; if it's in the destructor it would kill everything here)
}
/////////////////
// Device part //
/////////////////
//#define __CUDACC__
#ifdef __CUDACC__
__device__
const HashParams& params() const {
return c_hashParams;
}
//! see Teschner et al. (but with correct prime values)
__device__
uint computeHashPos(const int3& virtualVoxelPos) const {
const int p0 = 73856093;
const int p1 = 19349669;
const int p2 = 83492791;
int res = ((virtualVoxelPos.x * p0) ^ (virtualVoxelPos.y * p1) ^ (virtualVoxelPos.z * p2)) % c_hashParams.m_hashNumBuckets;
if (res < 0) res += c_hashParams.m_hashNumBuckets;
return (uint)res;
}
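// Example: an SDF block at position (1, 2, 3) hashes to
// ((1*73856093) ^ (2*19349669) ^ (3*83492791)) mod m_hashNumBuckets; the
// final branch folds the negative results that C++ % can produce back into
// [0, m_hashNumBuckets).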
//merges two voxels (v0 is the currently stored voxel, v1 is the input voxel)
__device__
void combineVoxel(const Voxel &v0, const Voxel& v1, Voxel &out) const {
//v.color = (10*v0.weight * v0.color + v1.weight * v1.color)/(10*v0.weight + v1.weight); //give the currently observed color more weight
//v.color = (v0.weight * v0.color + v1.weight * v1.color)/(v0.weight + v1.weight);
//out.color = 0.5f * (v0.color + v1.color); //exponential running average
float3 c0 = make_float3(v0.color.x, v0.color.y, v0.color.z);
float3 c1 = make_float3(v1.color.x, v1.color.y, v1.color.z);
float3 res = (c0.x+c0.y+c0.z == 0) ? c1 : 0.5f*c0 + 0.5f*c1;
//float3 res = (c0+c1)/2;
//float3 res = (c0 * (float)v0.weight + c1 * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
//float3 res = c1;
out.color.x = (uchar)(res.x+0.5f); out.color.y = (uchar)(res.y+0.5f); out.color.z = (uchar)(res.z+0.5f);
out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight);
out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight);
}
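// Worked example: v0 = {sdf 0.10, weight 2} merged with v1 = {sdf -0.02,
// weight 1} gives out.sdf = (0.10*2 - 0.02*1) / 3 = 0.06 and out.weight =
// min(m_integrationWeightMax, 3); colour is a 50/50 blend unless the stored
// colour is still black, in which case the incoming colour wins outright.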
//! returns the truncation of the SDF for a given distance value
__device__
float getTruncation(float z) const {
return c_hashParams.m_truncation + c_hashParams.m_truncScale * z;
}
__device__
float3 worldToVirtualVoxelPosFloat(const float3& pos) const {
return pos / c_hashParams.m_virtualVoxelSize;
}
__device__
int3 worldToVirtualVoxelPos(const float3& pos) const {
//const float3 p = pos*g_VirtualVoxelResolutionScalar;
const float3 p = pos / c_hashParams.m_virtualVoxelSize;
return make_int3(p+make_float3(sign(p))*0.5f);
}
__device__
int3 virtualVoxelPosToSDFBlock(int3 virtualVoxelPos) const {
if (virtualVoxelPos.x < 0) virtualVoxelPos.x -= SDF_BLOCK_SIZE-1;
if (virtualVoxelPos.y < 0) virtualVoxelPos.y -= SDF_BLOCK_SIZE-1;
if (virtualVoxelPos.z < 0) virtualVoxelPos.z -= SDF_BLOCK_SIZE-1;
return make_int3(
virtualVoxelPos.x/SDF_BLOCK_SIZE,
virtualVoxelPos.y/SDF_BLOCK_SIZE,
virtualVoxelPos.z/SDF_BLOCK_SIZE);
}
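//The offsets above make the integer division round towards negative infinity
//(floor division): e.g. with SDF_BLOCK_SIZE == 8, voxel x == -1 becomes
//(-1 - 7)/8 == -1, i.e. block -1, whereas plain truncation would wrongly give block 0.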
// Computes virtual voxel position of corner sample position
__device__
int3 SDFBlockToVirtualVoxelPos(const int3& sdfBlock) const {
return sdfBlock*SDF_BLOCK_SIZE;
}
__device__
float3 virtualVoxelPosToWorld(const int3& pos) const {
return make_float3(pos)*c_hashParams.m_virtualVoxelSize;
}
__device__
float3 SDFBlockToWorld(const int3& sdfBlock) const {
return virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock));
}
__device__
int3 worldToSDFBlock(const float3& worldPos) const {
return virtualVoxelPosToSDFBlock(worldToVirtualVoxelPos(worldPos));
}
__device__
bool isSDFBlockInCameraFrustumApprox(const int3& sdfBlock) {
// NOTE (Nick): Changed, just assume all voxels are potentially in frustum
//float3 posWorld = virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock)) + c_hashParams.m_virtualVoxelSize * 0.5f * (SDF_BLOCK_SIZE - 1.0f);
//return DepthCameraData::isInCameraFrustumApprox(c_hashParams.m_rigidTransformInverse, posWorld);
return true;
}
//! computes the (local) virtual voxel pos of an index; idx in [0;511]
__device__
uint3 delinearizeVoxelIndex(uint idx) const {
uint x = idx % SDF_BLOCK_SIZE;
uint y = (idx % (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE)) / SDF_BLOCK_SIZE;
uint z = idx / (SDF_BLOCK_SIZE * SDF_BLOCK_SIZE);
return make_uint3(x,y,z);
}
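//Example: with SDF_BLOCK_SIZE == 8, idx 511 decodes to (7,7,7) and idx 8
//decodes to (0,1,0); linearizeVoxelPos below is the exact inverse mapping.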
//! computes the linearized index of a local virtual voxel pos; pos in [0;7]^3
__device__
uint linearizeVoxelPos(const int3& virtualVoxelPos) const {
return
virtualVoxelPos.z * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE +
virtualVoxelPos.y * SDF_BLOCK_SIZE +
virtualVoxelPos.x;
}
__device__
int virtualVoxelPosToLocalSDFBlockIndex(const int3& virtualVoxelPos) const {
int3 localVoxelPos = make_int3(
virtualVoxelPos.x % SDF_BLOCK_SIZE,
virtualVoxelPos.y % SDF_BLOCK_SIZE,
virtualVoxelPos.z % SDF_BLOCK_SIZE);
if (localVoxelPos.x < 0) localVoxelPos.x += SDF_BLOCK_SIZE;
if (localVoxelPos.y < 0) localVoxelPos.y += SDF_BLOCK_SIZE;
if (localVoxelPos.z < 0) localVoxelPos.z += SDF_BLOCK_SIZE;
return linearizeVoxelPos(localVoxelPos);
}
__device__
int worldToLocalSDFBlockIndex(const float3& world) const {
int3 virtualVoxelPos = worldToVirtualVoxelPos(world);
return virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos);
}
//! returns the hash entry for a given worldPos; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set
__device__
HashEntry getHashEntry(const float3& worldPos) const {
//int3 blockID = worldToSDFVirtualVoxelPos(worldPos)/SDF_BLOCK_SIZE; //position of sdf block
int3 blockID = worldToSDFBlock(worldPos);
return getHashEntryForSDFBlockPos(blockID);
}
__device__
void deleteHashEntry(uint id) {
deleteHashEntry(d_hash[id]);
}
__device__
void deleteHashEntry(HashEntry& hashEntry) {
hashEntry.pos = make_int3(0);
hashEntry.offset = 0;
hashEntry.ptr = FREE_ENTRY;
}
__device__
bool voxelExists(const float3& worldPos) const {
HashEntry hashEntry = getHashEntry(worldPos);
return (hashEntry.ptr != FREE_ENTRY);
}
__device__
void deleteVoxel(Voxel& v) const {
v.color = make_uchar3(0,0,0);
v.weight = 0;
v.sdf = 0.0f;
}
__device__
void deleteVoxel(uint id) {
deleteVoxel(d_SDFBlocks[id]);
}
__device__
Voxel getVoxel(const float3& worldPos) const {
HashEntry hashEntry = getHashEntry(worldPos);
Voxel v;
if (hashEntry.ptr == FREE_ENTRY) {
deleteVoxel(v);
} else {
int3 virtualVoxelPos = worldToVirtualVoxelPos(worldPos);
v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)];
}
return v;
}
__device__
Voxel getVoxel(const int3& virtualVoxelPos) const {
HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos));
Voxel v;
if (hashEntry.ptr == FREE_ENTRY) {
deleteVoxel(v);
} else {
v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)];
}
return v;
}
__device__
void setVoxel(const int3& virtualVoxelPos, Voxel& voxelInput) const {
HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos));
if (hashEntry.ptr != FREE_ENTRY) {
d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)] = voxelInput;
}
}
//! returns the hash entry for a given sdf block id; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set
__device__
HashEntry getHashEntryForSDFBlockPos(const int3& sdfBlock) const
{
uint h = computeHashPos(sdfBlock); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
HashEntry entry;
entry.pos = sdfBlock;
entry.offset = 0;
entry.ptr = FREE_ENTRY;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
HashEntry curr = d_hash[i];
if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) {
return curr;
}
}
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;
int i = idxLastEntryInBucket; //start with the last entry of the current bucket
HashEntry curr;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
curr = d_hash[i];
if (curr.pos.x == entry.pos.x && curr.pos.y == entry.pos.y && curr.pos.z == entry.pos.z && curr.ptr != FREE_ENTRY) {
return curr;
}
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif
return entry;
}
//for histogram (no collision traversal)
__device__
unsigned int getNumHashEntriesPerBucket(unsigned int bucketID) {
unsigned int h = 0;
for (uint i = 0; i < HASH_BUCKET_SIZE; i++) {
if (d_hash[bucketID*HASH_BUCKET_SIZE+i].ptr != FREE_ENTRY) {
h++;
}
}
return h;
}
//for histogram (collisions traversal only)
__device__
unsigned int getNumHashLinkedList(unsigned int bucketID) {
unsigned int listLen = 0;
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (bucketID+1)*HASH_BUCKET_SIZE - 1;
unsigned int i = idxLastEntryInBucket; //start with the last entry of the current bucket
//int offset = 0;
HashEntry curr; curr.offset = 0;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
//offset = curr.offset;
//curr = getHashEntry(g_Hash, i);
curr = d_hash[i];
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
listLen++;
maxIter++;
}
#endif
return listLen;
}
__device__
uint consumeHeap() {
uint addr = atomicSub(&d_heapCounter[0], 1);
//TODO MATTHIAS check some error handling?
return d_heap[addr];
}
__device__
void appendHeap(uint ptr) {
uint addr = atomicAdd(&d_heapCounter[0], 1);
//TODO MATTHIAS check some error handling?
d_heap[addr+1] = ptr;
}
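//The heap is a free list used as a stack: d_heapCounter holds the index of the
//topmost valid entry. consumeHeap reads at the pre-decrement counter value and
//appendHeap writes one past the pre-increment value, so the two operations stay
//symmetric under concurrent access (no underflow/overflow checks, as the TODOs note).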
//pos in SDF block coordinates
__device__
void allocBlock(const int3& pos, const uchar frame) {
uint h = computeHashPos(pos); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
int firstEmpty = -1;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
HashEntry& curr = d_hash[i];
//in that case the SDF-block is already allocated and corresponds to the current position -> exit thread
if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
//curr.flags = frame; // Flag block as valid in this frame (Nick)
return;
}
//store the first FREE_ENTRY hash entry
if (firstEmpty == -1 && curr.ptr == FREE_ENTRY) {
firstEmpty = i;
}
}
#ifdef HANDLE_COLLISIONS
//updated variables as after the loop
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; //get last index of bucket
uint i = idxLastEntryInBucket; //start with the last entry of the current bucket
//int offset = 0;
HashEntry curr; curr.offset = 0;
//traverse list until end: memorize idx at list end and memorize offset from last element of bucket to list end
//int k = 0;
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
//offset = curr.offset;
curr = d_hash[i]; //TODO MATTHIAS do by reference
if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
//curr.flags = frame; // Flag block as valid in this frame (Nick)
return;
}
if (curr.offset == 0) { //we have found the end of the list
break;
}
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif
if (firstEmpty != -1) { //if there is an empty entry and we haven't allocated the current entry before
//int prevValue = 0;
//InterlockedExchange(d_hashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked
HashEntry& entry = d_hash[firstEmpty];
entry.pos = pos;
entry.offset = NO_OFFSET;
entry.flags = 0; // Flag block as valid in this frame (Nick)
entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc
}
return;
}
#ifdef HANDLE_COLLISIONS
//if (i != idxLastEntryInBucket) return;
int offset = 0;
//linear search for free entry
maxIter = 0;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
offset++;
i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //go to next hash element
if ((offset % HASH_BUCKET_SIZE) == 0) continue; //cannot insert into a last bucket element (would conflict with other linked lists)
curr = d_hash[i];
//if (curr.pos.x == pos.x && curr.pos.y == pos.y && curr.pos.z == pos.z && curr.ptr != FREE_ENTRY) {
// return;
//}
if (curr.ptr == FREE_ENTRY) { //this is the first free entry
//int prevValue = 0;
//InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the original hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) {
HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket];
h = i / HASH_BUCKET_SIZE;
//InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket where we have found a free entry
prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked
HashEntry& entry = d_hash[i];
entry.pos = pos;
entry.offset = lastEntryInBucket.offset;
entry.flags = 0; // Flag block as valid in this frame (Nick)
entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc
lastEntryInBucket.offset = offset;
d_hash[idxLastEntryInBucket] = lastEntryInBucket;
//setHashEntry(g_Hash, idxLastEntryInBucket, lastEntryInBucket);
}
}
return; //bucket was already locked
}
maxIter++;
}
#endif
}
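//Allocation strategy above, in summary: (1) scan the home bucket for an existing
//entry or the first free slot; (2) with HANDLE_COLLISIONS, also walk the bucket's
//collision linked list; (3) claim a slot under the per-bucket mutex via atomicExch.
//If the mutex is already held, the allocation silently fails for this call and the
//caller is expected to retry (e.g. when the block is seen again next frame).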
//!inserts a hash entry without allocating any memory: used by streaming: TODO MATTHIAS check the atomics in this function
__device__
bool insertHashEntry(HashEntry entry)
{
uint h = computeHashPos(entry.pos);
uint hp = h * HASH_BUCKET_SIZE;
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
//const HashEntry& curr = d_hash[i];
int prevWeight = 0;
//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight);
prevWeight = atomicCAS(&d_hash[i].ptr, FREE_ENTRY, LOCK_ENTRY);
if (prevWeight == FREE_ENTRY) {
d_hash[i] = entry;
//setHashEntry(hash, i, entry);
return true;
}
}
#ifdef HANDLE_COLLISIONS
//updated variables as after the loop
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1; //get last index of bucket
uint i = idxLastEntryInBucket; //start with the last entry of the current bucket
HashEntry curr;
unsigned int maxIter = 0;
//[allow_uav_condition]
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) { //traverse list until end //why walk to the end when the new entry could be inserted at the start?
//curr = getHashEntry(hash, i);
curr = d_hash[i]; //TODO MATTHIAS do by reference
if (curr.offset == 0) break; //we have found the end of the list
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
maxIter = 0;
int offset = 0;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) { //linear search for free entry
offset++;
uint i = (idxLastEntryInBucket + offset) % (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //go to next hash element
if ((offset % HASH_BUCKET_SIZE) == 0) continue; //cannot insert into a last bucket element (would conflict with other linked lists)
int prevWeight = 0;
//InterlockedCompareExchange(hash[3*i+2], FREE_ENTRY, LOCK_ENTRY, prevWeight); //check for a free entry
uint* d_hashUI = (uint*)d_hash;
prevWeight = atomicCAS(&d_hashUI[3*idxLastEntryInBucket+1], (uint)FREE_ENTRY, (uint)LOCK_ENTRY);
if (prevWeight == FREE_ENTRY) { //if free entry found set prev->next = curr & curr->next = prev->next
//[allow_uav_condition]
//while(hash[3*idxLastEntryInBucket+2] == LOCK_ENTRY); // expects setHashEntry to set the ptr last, required because pos.z is packed into the same value -> prev->next = curr -> might corrupt pos.z
HashEntry lastEntryInBucket = d_hash[idxLastEntryInBucket]; //get prev (= lastEntry in Bucket)
int newOffsetPrev = (offset << 16) | (lastEntryInBucket.pos.z & 0x0000ffff); //prev->next = curr (maintain old z-pos)
int oldOffsetPrev = 0;
//InterlockedExchange(hash[3*idxLastEntryInBucket+1], newOffsetPrev, oldOffsetPrev); //set prev offset atomically
oldOffsetPrev = atomicExch(&d_hashUI[3*idxLastEntryInBucket+1], newOffsetPrev); //set prev offset atomically
entry.offset = oldOffsetPrev >> 16; //remove prev z-pos from old offset
//setHashEntry(hash, i, entry); //sets the current hashEntry with: curr->next = prev->next
d_hash[i] = entry;
return true;
}
maxIter++;
}
#endif
return false;
}
//! deletes a hash entry position for a given sdfBlock index (returns true upon successful deletion; otherwise returns false)
__device__
bool deleteHashEntryElement(const int3& sdfBlock) {
uint h = computeHashPos(sdfBlock); //hash bucket
uint hp = h * HASH_BUCKET_SIZE; //hash position
for (uint j = 0; j < HASH_BUCKET_SIZE; j++) {
uint i = j + hp;
const HashEntry& curr = d_hash[i];
if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
#ifndef HANDLE_COLLISIONS
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
return true;
#endif
#ifdef HANDLE_COLLISIONS
if (curr.offset != 0) { //if there was a pointer set it to the next list element
//int prevValue = 0;
//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue == LOCK_ENTRY) return false;
if (prevValue != LOCK_ENTRY) {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
int nextIdx = (i + curr.offset) % (HASH_BUCKET_SIZE*c_hashParams.m_hashNumBuckets);
//setHashEntry(hash, i, getHashEntry(hash, nextIdx));
d_hash[i] = d_hash[nextIdx];
deleteHashEntry(nextIdx);
return true;
}
} else {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
return true;
}
#endif //HANDLE_COLLISIONS
}
}
#ifdef HANDLE_COLLISIONS
const uint idxLastEntryInBucket = (h+1)*HASH_BUCKET_SIZE - 1;
int i = idxLastEntryInBucket;
HashEntry curr;
curr = d_hash[i];
int prevIdx = i;
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
unsigned int maxIter = 0;
uint g_MaxLoopIterCount = c_hashParams.m_hashMaxCollisionLinkedListSize;
#pragma unroll 1
while (maxIter < g_MaxLoopIterCount) {
curr = d_hash[i];
//found the entry we want to delete
if (curr.pos.x == sdfBlock.x && curr.pos.y == sdfBlock.y && curr.pos.z == sdfBlock.z && curr.ptr != FREE_ENTRY) {
//int prevValue = 0;
//InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket
int prevValue = atomicExch(&d_hashBucketMutex[h], LOCK_ENTRY);
if (prevValue == LOCK_ENTRY) return false;
if (prevValue != LOCK_ENTRY) {
const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE;
appendHeap(curr.ptr / linBlockSize);
//heapAppend.Append(curr.ptr / linBlockSize);
deleteHashEntry(i);
HashEntry prev = d_hash[prevIdx];
prev.offset = curr.offset;
//setHashEntry(hash, prevIdx, prev);
d_hash[prevIdx] = prev;
return true;
}
}
if (curr.offset == 0) { //we have found the end of the list
return false; //should never happen: the entry should have been found above
}
prevIdx = i;
i = idxLastEntryInBucket + curr.offset; //go to next element in the list
i %= (HASH_BUCKET_SIZE * c_hashParams.m_hashNumBuckets); //check for overflow
maxIter++;
}
#endif // HANDLE_COLLISIONS
return false;
}
#endif //__CUDACC__
uint* d_heap; //heap that manages free memory
uint* d_heapCounter; //single element; used as an atomic counter (points to the next free block)
int* d_hashDecision; //
int* d_hashDecisionPrefix; //
HashEntry* d_hash; //hash that stores pointers to sdf blocks
HashEntry* d_hashCompactified; //same as before except that only valid pointers are there
int* d_hashCompactifiedCounter; //atomic counter to add compactified entries atomically
Voxel* d_SDFBlocks; //sub-blocks that contain 8x8x8 voxels (linearized); are allocated by heap
int* d_hashBucketMutex; //binary flag per hash bucket; used for allocation to atomically lock a bucket
bool m_bIsOnGPU; //the class can be used on both CPU and GPU
};
} // namespace voxhash
} // namespace ftl
// From: https://github.com/niessner/VoxelHashing/blob/master/DepthSensingCUDA/Source/CUDAHashParams.h
#pragma once
//#include <cutil_inline.h>
//#include <cutil_math.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
namespace ftl {
namespace voxhash {
//TODO might have to be split into static and dynamic parts
struct __align__(16) HashParams {
HashParams() {
}
float4x4 m_rigidTransform;
float4x4 m_rigidTransformInverse;
unsigned int m_hashNumBuckets;
unsigned int m_hashBucketSize;
unsigned int m_hashMaxCollisionLinkedListSize;
unsigned int m_numSDFBlocks;
int m_SDFBlockSize;
float m_virtualVoxelSize;
unsigned int m_numOccupiedBlocks; //occupied blocks in the viewing frustum
float m_maxIntegrationDistance;
float m_truncScale;
float m_truncation;
unsigned int m_integrationWeightSample;
unsigned int m_integrationWeightMax;
float3 m_streamingVoxelExtents;
int3 m_streamingGridDimensions;
int3 m_streamingMinGridPos;
unsigned int m_streamingInitialChunkListSize;
uint2 m_dummy;
};
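//A minimal sketch of filling these parameters, assuming values in the spirit
//of the original VoxelHashing defaults (illustrative only; not taken from this
//repository's configuration):
//
//  HashParams params;
//  params.m_hashNumBuckets = 500000;                //hash table buckets
//  params.m_hashBucketSize = 10;                    //entries per bucket (HASH_BUCKET_SIZE)
//  params.m_hashMaxCollisionLinkedListSize = 7;     //bound on collision-list traversal
//  params.m_SDFBlockSize = 8;                       //8x8x8 voxels per block (SDF_BLOCK_SIZE)
//  params.m_virtualVoxelSize = 0.004f;              //4 mm voxels
//  params.m_truncation = 0.02f;                     //base SDF truncation band in metres
//  params.m_truncScale = 0.01f;                     //truncation grows linearly with depth
//  params.m_integrationWeightMax = 255;             //cap for combineVoxel's weight sum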
} // namespace voxhash
} // namespace ftl
#ifndef _CAMERA_UTIL_
#define _CAMERA_UTIL_
#include <ftl/cuda_util.hpp>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#ifndef BYTE
#define BYTE unsigned char
#endif
#define T_PER_BLOCK 16
#define MINF __int_as_float(0xff800000)
#ifndef CUDART_PI_F
#define CUDART_PI_F 3.141592654f
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Copy Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void copyFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = d_input[y*width+x];
}
extern "C" void copyFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
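//The launch configuration above is the pattern used by every wrapper in this
//file: one thread per pixel, with the grid ceil-divided by the block size.
//For example, a 640x480 image with T_PER_BLOCK == 16 launches a 40x30 grid of
//16x16 blocks; the in-kernel bounds check discards any overhang threads when
//the image size is not a multiple of 16.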
__global__ void copyDepthFloatMapDevice(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float depth = d_input[y*width+x];
if (depth >= minDepth && depth <= maxDepth)
d_output[y*width+x] = depth;
else
d_output[y*width+x] = MINF;
}
extern "C" void copyDepthFloatMap(float* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyDepthFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height,minDepth, maxDepth);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize Optimizer Maps
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void initializeOptimizerMapsDevice(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float depth = d_input[y*width+x];
if(d_mask[y*width+x] != MINF) { d_output[y*width+x] = depth; }
else { d_output[y*width+x] = MINF; d_input[y*width+x] = MINF; d_input2[y*width+x] = MINF; }
}
extern "C" void initializeOptimizerMaps(float* d_output, float* d_input, float* d_input2, float* d_mask, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
initializeOptimizerMapsDevice<<<gridSize, blockSize>>>(d_output, d_input, d_input2, d_mask, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copy Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void copyFloat4MapDevice(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = d_input[y*width+x];
}
extern "C" void copyFloat4Map(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
copyFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Raw Color to float
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorRawToFloatDevice(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
//uchar4 c = make_uchar4(d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+3]); //note the flip from BGRW to RGBW
uchar4 c = make_uchar4(d_input[4*(y*width+x)+0], d_input[4*(y*width+x)+1], d_input[4*(y*width+x)+2], d_input[4*(y*width+x)+3]);
if (c.x == 0 && c.y == 0 && c.z == 0) {
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
} else {
d_output[y*width+x] = make_float4(c.x/255.0f, c.y/255.0f, c.z/255.0f, c.w/255.0f);
}
}
extern "C" void convertColorRawToFloat4(float4* d_output, BYTE* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorRawToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Float4 Color to UCHAR4
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorFloat4ToUCHAR4Device(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
float4 color = d_input[y*width+x];
d_output[y*width+x] = make_uchar4(color.x*255.0f, color.y*255.0f, color.z*255.0f, color.w*255.0f);
}
extern "C" void convertColorFloat4ToUCHAR4(uchar4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorFloat4ToUCHAR4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mask Color Map using Depth
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void maskColorMapFloat4MapDevice(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
float4 color = d_inputColor[y*width+x];
if(d_inputDepth[y*width+x].x != MINF) d_inputColor[y*width+x] = color;
else d_inputColor[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
}
extern "C" void maskColorMapFloat4Map(float4* d_inputColor, float4* d_inputDepth, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
maskColorMapFloat4MapDevice<<<gridSize, blockSize>>>(d_inputColor, d_inputDepth, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Color to Intensity Float4
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorToIntensityFloat4Device(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float4 color = d_input[y*width+x];
const float I = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
d_output[y*width+x] = make_float4(I, I, I, 1.0f);
}
extern "C" void convertColorToIntensityFloat4(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorToIntensityFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Color to Intensity
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertColorToIntensityFloatDevice(float* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float4 color = d_input[y*width+x];
d_output[y*width+x] = 0.299f*color.x + 0.587f*color.y + 0.114f*color.z;
}
extern "C" void convertColorToIntensityFloat(float* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertColorToIntensityFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert depth map to color map view
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertDepthToColorSpaceDevice(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < depthWidth && y < depthHeight)
{
const float depth = d_input[y*depthWidth+x];
if(depth != MINF && depth < 1.0f)
{
// Cam space depth
float4 depthCamSpace = depthIntrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth);
depthCamSpace = make_float4(depthCamSpace.x, depthCamSpace.y, depthCamSpace.w, 1.0f);
// World Space
const float4 worldSpace = depthExtrinsicsInv*depthCamSpace;
// Cam space color
float4 colorCamSpace = colorExtrinsics*worldSpace;
//colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, 0.0f, colorCamSpace.z);
colorCamSpace = make_float4(colorCamSpace.x, colorCamSpace.y, colorCamSpace.z, 1.0f);
// Get coordinates in color image and set pixel to new depth
const float4 screenSpaceColor = colorIntrinsics*colorCamSpace;
//const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.w + 0.5f);
//const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.w + 0.5f);
const unsigned int cx = (unsigned int)(screenSpaceColor.x/screenSpaceColor.z + 0.5f);
const unsigned int cy = (unsigned int)(screenSpaceColor.y/screenSpaceColor.z + 0.5f);
//if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.w; // Check for minimum !!!
if(cx < colorWidth && cy < colorHeight) d_output[cy*colorWidth+cx] = screenSpaceColor.z; // Check for minimum !!!
}
}
}
extern "C" void convertDepthToColorSpace(float* d_output, float* d_input, float4x4 depthIntrinsicsInv, float4x4 colorIntrinsics, float4x4 depthExtrinsicsInv, float4x4 colorExtrinsics, unsigned int depthWidth, unsigned int depthHeight, unsigned int colorWidth, unsigned int colorHeight)
{
const dim3 gridSize((depthWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertDepthToColorSpaceDevice<<<gridSize, blockSize>>>(d_output, d_input, depthIntrinsicsInv, colorIntrinsics, depthExtrinsicsInv, colorExtrinsics, depthWidth, depthHeight, colorWidth, colorHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Set invalid float map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void setInvalidFloatMapDevice(float* d_output, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = MINF;
}
extern "C" void setInvalidFloatMap(float* d_output, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
setInvalidFloatMapDevice<<<gridSize, blockSize>>>(d_output, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Set invalid float4 map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void setInvalidFloat4MapDevice(float4* d_output, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF ,MINF);
}
extern "C" void setInvalidFloat4Map(float4* d_output, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
setInvalidFloat4MapDevice<<<gridSize, blockSize>>>(d_output, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Depth to Camera Space Positions
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertDepthFloatToCameraSpaceFloat3Device(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, DepthCameraData depthCameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < width && y < height) {
d_output[y*width+x] = make_float3(MINF, MINF, MINF);
float depth = d_input[y*width+x];
if(depth != MINF)
{
//float4 cameraSpace(intrinsicsInv*make_float4((float)x*depth, (float)y*depth, depth, depth));
//d_output[y*width+x] = make_float4(cameraSpace.x, cameraSpace.y, cameraSpace.w, 1.0f);
d_output[y*width+x] = depthCameraData.kinectDepthToSkeleton(x, y, depth);
}
}
}
extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertDepthFloatToCameraSpaceFloat3Device<<<gridSize, blockSize>>>(d_output, d_input, intrinsicsInv, width, height, depthCameraData);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral Filter Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float gaussR(float sigma, float dist)
{
return exp(-(dist*dist)/(2.0*sigma*sigma));
}
inline __device__ float linearR(float sigma, float dist)
{
return min(1.0f, max(0.0f, 1.0f-(dist*dist)/(2.0f*sigma*sigma))); //clamped linear falloff in [0,1]
}
inline __device__ float gaussD(float sigma, int x, int y)
{
return exp(-((x*x+y*y)/(2.0f*sigma*sigma)));
}
inline __device__ float gaussD(float sigma, int x)
{
return exp(-((x*x)/(2.0f*sigma*sigma)));
}
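//These weights implement a classic bilateral filter: gaussD is the spatial
//(domain) Gaussian over the pixel offset and gaussR the range Gaussian over
//the depth difference, so the combined weight gaussD(sigmaD,dx,dy)*gaussR(sigmaR,dz)
//smooths depth while preserving discontinuities much larger than sigmaR.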
__global__ void bilateralFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
d_output[y*width+x] = MINF;
float sum = 0.0f;
float sumWeight = 0.0f;
const float depthCenter = d_input[y*width+x];
if(depthCenter != MINF)
{
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float currentDepth = d_input[n*width+m];
if (currentDepth != MINF) {
const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, currentDepth-depthCenter);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
}
extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
bilateralFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral Filter Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void bilateralFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
//d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float sumWeight = 0.0f;
const float4 depthCenter = d_input[y*width+x];
if (depthCenter.x != MINF) {
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float4 currentDepth = d_input[n*width+m];
if (currentDepth.x != MINF) {
const float weight = gaussD(sigmaD, m-x, n-y)*gaussR(sigmaR, length(currentDepth-depthCenter));
sum += weight*currentDepth;
sumWeight += weight;
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void bilateralFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
bilateralFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Gauss Filter Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gaussFilterFloatMapDevice(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
d_output[y*width+x] = MINF;
float sum = 0.0f;
float sumWeight = 0.0f;
const float depthCenter = d_input[y*width+x];
if(depthCenter != MINF)
{
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float currentDepth = d_input[n*width+m];
if(currentDepth != MINF && fabs(depthCenter-currentDepth) < sigmaR)
{
const float weight = gaussD(sigmaD, m-x, n-y);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void gaussFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
gaussFilterFloatMapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Gauss Filter Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void gaussFilterFloat4MapDevice(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const int kernelRadius = (int)ceil(2.0*sigmaD);
//d_output[y*width+x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
float sumWeight = 0.0f;
const float4 depthCenter = d_input[y*width+x];
if (depthCenter.x != MINF) {
for(int m = x-kernelRadius; m <= x+kernelRadius; m++)
{
for(int n = y-kernelRadius; n <= y+kernelRadius; n++)
{
if(m >= 0 && n >= 0 && m < width && n < height)
{
const float4 currentDepth = d_input[n*width+m];
if (currentDepth.x != MINF) {
if(length(depthCenter-currentDepth) < sigmaR)
{
const float weight = gaussD(sigmaD, m-x, n-y);
sumWeight += weight;
sum += weight*currentDepth;
}
}
}
}
}
}
if(sumWeight > 0.0f) d_output[y*width+x] = sum / sumWeight;
}
extern "C" void gaussFilterFloat4Map(float4* d_output, float4* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
gaussFilterFloat4MapDevice<<<gridSize, blockSize>>>(d_output, d_input, sigmaD, sigmaR, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Normal Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeNormalsDevice(float4* d_output, float3* d_input, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < width-1 && y > 0 && y < height-1)
{
const float3 CC = d_input[(y+0)*width+(x+0)];
const float3 PC = d_input[(y+1)*width+(x+0)];
const float3 CP = d_input[(y+0)*width+(x+1)];
const float3 MC = d_input[(y-1)*width+(x+0)];
const float3 CM = d_input[(y+0)*width+(x-1)];
if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF)
{
const float3 n = cross(PC-MC, CP-CM);
const float l = length(n);
if(l > 0.0f)
{
d_output[y*width+x] = make_float4(n/-l, 1.0f);
}
}
}
}
extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeNormalsDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
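//computeNormalsDevice above estimates the surface normal by central differences,
//n = (P(y+1)-P(y-1)) x (P(x+1)-P(x-1)), normalised and negated to face the camera;
//pixels with any invalid neighbour stay MINF. The variant below does the same with
//one-sided differences using only the (y-1) and (x-1) neighbours.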
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Normal Map 2
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeNormalsDevice2(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < width-1 && y > 0 && y < height-1)
{
const float4 CC = d_input[(y+0)*width+(x+0)];
const float4 MC = d_input[(y-1)*width+(x+0)];
const float4 CM = d_input[(y+0)*width+(x-1)];
if(CC.x != MINF && MC.x != MINF && CM.x != MINF)
{
const float3 n = cross(make_float3(MC)-make_float3(CC), make_float3(CM)-make_float3(CC));
const float l = length(n);
if(l > 0.0f)
{
d_output[y*width+x] = make_float4(n/-l, 1.0f);
}
}
}
}
extern "C" void computeNormals2(float4* d_output, float4* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeNormalsDevice2<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Shading Value
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ void evaluateLightingModelTerms(float* d_out, float4 n)
{
d_out[0] = 1.0;
d_out[1] = n.y;
d_out[2] = n.z;
d_out[3] = n.x;
d_out[4] = n.x*n.y;
d_out[5] = n.y*n.z;
d_out[6] = 3*n.z*n.z - 1;
d_out[7] = n.z*n.x;
d_out[8] = n.x*n.x-n.y*n.y;
}
inline __device__ float evaluateLightingModel(float* d_lit, float4 n)
{
float tmp[9];
evaluateLightingModelTerms(tmp, n);
float sum = 0.0f;
for(unsigned int i = 0; i<9; i++) sum += tmp[i]*d_lit[i];
return sum;
}
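//The nine terms above are the second-order spherical-harmonics basis evaluated
//at the normal direction, so the shading model is s(n) = sum_i lit[i]*H_i(n);
//computeShadingValueDevice below scales this by a per-cluster albedo estimate
//and clamps the result to [0,1].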
__global__ void computeShadingValueDevice(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 Intrinsic, float* d_litcoeff, unsigned int width, unsigned int height)
{
const unsigned int posx = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int posy = blockIdx.y*blockDim.y + threadIdx.y;
if(posx >= width || posy >= height) return;
d_outShading[posy*width+posx] = 0;
if(posx > 0 && posx < width-1 && posy > 0 && posy < height-1)
{
float4 n = d_normals[posy*width+posx];
if(n.x != MINF)
{
n.x = -n.x; // Change handedness
n.z = -n.z;
float albedo = d_albedoEstimates[(unsigned int)(d_clusterIDs[posy*width+posx]+0.5f)];
float shadingval = albedo*evaluateLightingModel(d_litcoeff, n);
if(shadingval<0.0f) shadingval = 0.0f;
if(shadingval>1.0f) shadingval = 1.0f;
d_outShading[posy*width+posx] = shadingval;
}
}
}
extern "C" void computeShadingValue(float* d_outShading, float* d_indepth, float4* d_normals, float* d_clusterIDs, float* d_albedoEstimates, float4x4 &Intrinsic, float* d_lighting, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeShadingValueDevice<<<gridSize, blockSize>>>(d_outShading, d_indepth, d_normals, d_clusterIDs, d_albedoEstimates, Intrinsic, d_lighting, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Simple Segmentation
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeSimpleSegmentationDevice(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
const float inputDepth = d_input[y*width+x];
if(inputDepth != MINF && inputDepth < depthThres) d_output[y*width+x] = inputDepth;
else d_output[y*width+x] = MINF;
}
extern "C" void computeSimpleSegmentation(float* d_output, float* d_input, float depthThres, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeSimpleSegmentationDevice<<<gridSize, blockSize>>>(d_output, d_input, depthThres, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Edge Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeMaskEdgeMapFloat4Device(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = 1;
d_output[width*height+y*width+x] = 1;
const float thre = threshold *threshold *3.0f;
if(x > 0 && y > 0 && x < width-1 && y < height-1)
{
if(d_indepth[y*width+x] == MINF)
{
d_output[y*width+x] = 0;
d_output[y*width+x-1] = 0;
d_output[width*height+y*width+x] = 0;
d_output[width*height+(y-1)*width+x] = 0;
return;
}
const float4& p0 = d_input[(y+0)*width+(x+0)];
const float4& p1 = d_input[(y+0)*width+(x+1)];
const float4& p2 = d_input[(y+1)*width+(x+0)];
float dU = sqrt(((p1.x-p0.x)*(p1.x-p0.x) + (p1.y-p0.y) * (p1.y-p0.y) + (p1.z-p0.z)*(p1.z-p0.z))/3.0f);
float dV = sqrt(((p2.x-p0.x)*(p2.x-p0.x) + (p2.y-p0.y) * (p2.y-p0.y) + (p2.z-p0.z)*(p2.z-p0.z))/3.0f);
//float dgradx = abs(d_indepth[y*width+x-1] + d_indepth[y*width+x+1] - 2.0f * d_indepth[y*width+x]);
//float dgrady = abs(d_indepth[y*width+x-width] + d_indepth[y*width+x+width] - 2.0f * d_indepth[y*width+x]);
if(dU > thre ) d_output[y*width+x] = 0;
if(dV > thre ) d_output[width*height+y*width+x] = 0;
//remove depth discontinuities
const int r = 1;
const float thres = 0.01f;
const float pCC = d_indepth[y*width+x];
for(int i = -r; i<=r; i++)
{
for(int j = -r; j<=r; j++)
{
int currentX = x+j;
int currentY = y+i;
if(currentX >= 0 && currentX < width && currentY >= 0 && currentY < height)
{
float d = d_indepth[currentY*width+currentX];
if(d != MINF && abs(pCC-d) > thres)
{
d_output[y*width+x] = 0;
d_output[width*height+y*width+x] = 0;
return;
}
}
}
}
}
}
extern "C" void computeMaskEdgeMapFloat4(unsigned char* d_output, float4* d_input, float* d_indepth, float threshold, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeMaskEdgeMapFloat4Device<<<gridSize, blockSize>>>(d_output, d_input, d_indepth, threshold,width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Clear Decision Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void clearDecissionArrayPatchDepthMaskDevice(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight) d_output[y*inputWidth+x] = 0;
}
extern "C" void clearDecissionArrayPatchDepthMask(int* d_output, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
clearDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Decision Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeDecissionArrayPatchDepthMaskDevice(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight)
{
const int patchId_x = x/patchSize;
const int patchId_y = y/patchSize;
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const float d = d_input[y*inputWidth+x];
if(d != MINF) atomicMax(&d_output[patchId_y*nPatchesWidth+patchId_x], 1);
}
}
extern "C" void computeDecissionArrayPatchDepthMask(int* d_output, float* d_input, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeDecissionArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, patchSize, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute Remapping Array for Patch Depth Mask
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void computeRemappingArrayPatchDepthMaskDevice(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < inputWidth && y >= 0 && y < inputHeight)
{
const int patchId_x = x/patchSize;
const int patchId_y = y/patchSize;
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const float d = d_input[y*inputWidth+x];
if(d != MINF) d_output[d_prefixSum[patchId_y*nPatchesWidth+patchId_x]-1] = patchId_y*nPatchesWidth+patchId_x;
}
}
extern "C" void computeRemappingArrayPatchDepthMask(int* d_output, float* d_input, int* d_prefixSum, unsigned int patchSize, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((inputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (inputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeRemappingArrayPatchDepthMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, d_prefixSum, patchSize, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Debug Patch Remap Array
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void DebugPatchRemapArrayDevice(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
if(x < numElements)
{
int patchID = d_remapArray[x];
const int nPatchesWidth = (inputWidth+patchSize-1)/patchSize;
const int patchId_x = patchID%nPatchesWidth;
const int patchId_y = patchID/nPatchesWidth;
for(unsigned int i = 0; i<patchSize; i++)
{
for(unsigned int j = 0; j<patchSize; j++)
{
const int pixel_x = patchId_x*patchSize;
const int pixel_y = patchId_y*patchSize;
d_mask[(pixel_y+i)*inputWidth+(pixel_x+j)] = 3.0f;
}
}
}
}
extern "C" void DebugPatchRemapArray(float* d_mask, int* d_remapArray, unsigned int patchSize, unsigned int numElements, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((numElements + T_PER_BLOCK*T_PER_BLOCK - 1)/(T_PER_BLOCK*T_PER_BLOCK));
const dim3 blockSize(T_PER_BLOCK*T_PER_BLOCK);
DebugPatchRemapArrayDevice<<<gridSize, blockSize>>>(d_mask, d_remapArray, patchSize, numElements, inputWidth, inputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float bilinearInterpolationFloat(float x, float y, float* d_input, unsigned int imageWidth, unsigned int imageHeight)
{
const int2 p00 = make_int2((int)floor(x), (int)floor(y));
const int2 p01 = p00 + make_int2(0, 1);
const int2 p10 = p00 + make_int2(1, 0);
const int2 p11 = p00 + make_int2(1, 1);
const float alpha = x - p00.x;
const float beta = y - p00.y;
float s0 = 0.0f; float w0 = 0.0f;
if(p00.x < imageWidth && p00.y < imageHeight) { float v00 = d_input[p00.y*imageWidth + p00.x]; if(v00 != MINF) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } }
if(p10.x < imageWidth && p10.y < imageHeight) { float v10 = d_input[p10.y*imageWidth + p10.x]; if(v10 != MINF) { s0 += alpha *v10; w0 += alpha ; } }
float s1 = 0.0f; float w1 = 0.0f;
if(p01.x < imageWidth && p01.y < imageHeight) { float v01 = d_input[p01.y*imageWidth + p01.x]; if(v01 != MINF) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} }
if(p11.x < imageWidth && p11.y < imageHeight) { float v11 = d_input[p11.y*imageWidth + p11.x]; if(v11 != MINF) { s1 += alpha *v11; w1 += alpha ;} }
const float p0 = s0/w0;
const float p1 = s1/w1;
float ss = 0.0f; float ww = 0.0f;
if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); }
if(w1 > 0.0f) { ss += beta *p1; ww += beta ; }
if(ww > 0.0f) return ss/ww;
else return MINF;
}
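// Note: the lookup above drops invalid (MINF) samples and renormalises the
// remaining bilinear weights, returning sum(w_i * v_i) / sum(w_i) over the
// valid neighbours only, so a single missing pixel does not invalidate the
// whole interpolation.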
__global__ void resampleFloatMapDevice(float* d_colorMapResampledFloat, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < outputWidth && y < outputHeight)
{
const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1);
const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1);
const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f);
const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f);
if(xInput < inputWidth && yInput < inputHeight)
{
d_colorMapResampledFloat[y*outputWidth+x] = bilinearInterpolationFloat(x*scaleWidth, y*scaleHeight, d_colorMapFloat, inputWidth, inputHeight);
}
}
}
extern "C" void resampleFloatMap(float* d_colorMapResampledFloat, unsigned int outputWidth, unsigned int outputHeight, float* d_colorMapFloat, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
resampleFloatMapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat, d_colorMapFloat, inputWidth, inputHeight, outputWidth, outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
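// Example (host-side sketch, not from the original file; buffer names are
// illustrative only): halving a 640x480 depth map.
//   float *d_depthIn = nullptr, *d_depthOut = nullptr;
//   cudaMalloc(&d_depthIn,  640 * 480 * sizeof(float));
//   cudaMalloc(&d_depthOut, 320 * 240 * sizeof(float));
//   // ... upload depth into d_depthIn ...
//   resampleFloatMap(d_depthOut, 320, 240, d_depthIn, 640, 480);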
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Float4 Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ float4 bilinearInterpolationFloat4(float x, float y, float4* d_input, unsigned int imageWidth, unsigned int imageHeight)
{
const int2 p00 = make_int2((int)floor(x), (int)floor(y));
const int2 p01 = p00 + make_int2(0, 1);
const int2 p10 = p00 + make_int2(1, 0);
const int2 p11 = p00 + make_int2(1, 1);
const float alpha = x - p00.x;
const float beta = y - p00.y;
//const float INVALID = 0.0f;
const float INVALID = MINF;
float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f;
if(p00.x < imageWidth && p00.y < imageHeight) { float4 v00 = d_input[p00.y*imageWidth + p00.x]; if(v00.x != INVALID && v00.y != INVALID && v00.z != INVALID) { s0 += (1.0f-alpha)*v00; w0 += (1.0f-alpha); } }
if(p10.x < imageWidth && p10.y < imageHeight) { float4 v10 = d_input[p10.y*imageWidth + p10.x]; if(v10.x != INVALID && v10.y != INVALID && v10.z != INVALID) { s0 += alpha *v10; w0 += alpha ; } }
float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f;
if(p01.x < imageWidth && p01.y < imageHeight) { float4 v01 = d_input[p01.y*imageWidth + p01.x]; if(v01.x != INVALID && v01.y != INVALID && v01.z != INVALID) { s1 += (1.0f-alpha)*v01; w1 += (1.0f-alpha);} }
if(p11.x < imageWidth && p11.y < imageHeight) { float4 v11 = d_input[p11.y*imageWidth + p11.x]; if(v11.x != INVALID && v11.y != INVALID && v11.z != INVALID) { s1 += alpha *v11; w1 += alpha ;} }
const float4 p0 = s0/w0;
const float4 p1 = s1/w1;
float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f;
if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); }
if(w1 > 0.0f) { ss += beta *p1; ww += beta ; }
if(ww > 0.0f) return ss/ww;
else return make_float4(MINF, MINF, MINF, MINF);
}
__global__ void resampleFloat4MapDevice(float4* d_colorMapResampledFloat4, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x < outputWidth && y < outputHeight)
{
const float scaleWidth = (float)(inputWidth-1) /(float)(outputWidth-1);
const float scaleHeight = (float)(inputHeight-1)/(float)(outputHeight-1);
const unsigned int xInput = (unsigned int)(x*scaleWidth +0.5f);
const unsigned int yInput = (unsigned int)(y*scaleHeight+0.5f);
if(xInput < inputWidth && yInput < inputHeight)
{
d_colorMapResampledFloat4[y*outputWidth+x] = bilinearInterpolationFloat4(x*scaleWidth, y*scaleHeight, d_colorMapFloat4, inputWidth, inputHeight);
}
}
}
extern "C" void resampleFloat4Map(float4* d_colorMapResampledFloat4, unsigned int outputWidth, unsigned int outputHeight, float4* d_colorMapFloat4, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
resampleFloat4MapDevice<<<gridSize, blockSize>>>(d_colorMapResampledFloat4, d_colorMapFloat4, inputWidth, inputHeight, outputWidth, outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Resample Unsigned Char Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void downsampleUnsignedCharMapDevice(unsigned char* d_MapResampled, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight, unsigned int outputWidth, unsigned int outputHeight, unsigned int layerOffsetInput, unsigned int layerOffsetOutput)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= outputWidth || y >= outputHeight) return;
unsigned char res = 0;
const unsigned int inputX = 2*x;
const unsigned int inputY = 2*y;
if((inputY+0) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+0))];
if((inputY+0) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+0)*inputWidth + (inputX+1))];
if((inputY+1) < inputHeight && (inputX+0) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+0))];
if((inputY+1) < inputHeight && (inputX+1) < inputWidth) res += d_Map[layerOffsetInput + ((inputY+1)*inputWidth + (inputX+1))];
if(res == 4) d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 1;
else d_MapResampled[layerOffsetOutput+(y*outputWidth+x)] = 0;
}
extern "C" void downsampleUnsignedCharMap(unsigned char* d_MapResampled, unsigned int outputWidth, unsigned int outputHeight, unsigned char* d_Map, unsigned int inputWidth, unsigned int inputHeight)
{
const dim3 gridSize((outputWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (outputHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, 0, 0);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
downsampleUnsignedCharMapDevice<<<gridSize, blockSize>>>(d_MapResampled, d_Map, inputWidth, inputHeight, outputWidth, outputHeight, inputWidth*inputHeight, outputWidth*outputHeight);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
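// The wrapper above launches the kernel twice because the edge mask stores two
// layers (the dU and dV tests) back to back; the second launch offsets input
// and output by one full layer. A downsampled pixel stays valid only when all
// four source pixels are valid (res == 4), i.e. a logical AND over each 2x2
// block.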
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert Edge Mask to Float Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void convertEdgeMaskToFloatDevice(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= width || y >= height) return;
d_output[y*width+x] = min(d_input[y*width+x], d_input[width*height+y*width+x]);
}
extern "C" void convertEdgeMaskToFloat(float* d_output, unsigned char* d_input, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
convertEdgeMaskToFloatDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Dilate Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void dilateDepthMapDevice(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
float sum = 0.0f;
float count = 0.0f;
float oldDepth = d_inputOrig[y*width+x];
if(oldDepth != MINF && oldDepth != 0)
{
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
const float d = d_input[(y+i)*width+(x+j)];
if(d != MINF && d != 0.0f && fabs(d-oldDepth) < 0.05f)
{
sum += d;
count += 1.0f;
}
}
}
}
}
if(count > ((2*structureSize+1)*(2*structureSize+1))/36) d_output[y*width+x] = 1.0f;
else d_output[y*width+x] = MINF;
}
}
extern "C" void dilateDepthMapMask(float* d_output, float* d_input, float* d_inputOrig, int structureSize, int width, int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
dilateDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, d_inputOrig, structureSize, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mean Filter Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void removeDevMeanMapMaskDevice(float* d_output, float* d_input, int structureSize, int width, int height)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
// Copy moved inside the bounds check to avoid an out-of-bounds write
d_output[y*width+x] = d_input[y*width+x];
float oldDepth = d_input[y*width+x];
float mean = 0.0f;
float meanSquared = 0.0f;
float count = 0.0f;
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
float depth = d_input[(y+i)*width+(x+j)];
if(depth == MINF)
{
depth = 8.0f;
}
if(depth > 0.0f)
{
mean += depth;
meanSquared += depth*depth;
count += 1.0f;
}
}
}
}
mean/=count;
meanSquared/=count;
float stdDev = sqrt(meanSquared-mean*mean);
if(fabs(oldDepth-mean) > 0.5f*stdDev)// || stdDev> 0.005f)
{
d_output[y*width+x] = MINF;
}
}
}
extern "C" void removeDevMeanMapMask(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
removeDevMeanMapMaskDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
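// The rejection test above uses the identity Var(d) = E[d^2] - (E[d])^2 over
// the (2*structureSize+1)^2 window, with invalid (MINF) samples clamped to 8m
// so that holes drag the local mean away from real surfaces; pixels further
// than 0.5 * stdDev from the window mean are invalidated.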
// Nearest neighbour
inline __device__ bool getValueNearestNeighbourNoCheck(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
{
const int u = (int)(p.x + 0.5f);
const int v = (int)(p.y + 0.5f);
if(u < 0 || u >= imageWidth || v < 0 || v >= imageHeight) return false;
*outValue = inputMap[v*imageWidth + u];
return true;
}
inline __device__ bool getValueNearestNeighbour(const float2& p, const float4* inputMap, unsigned int imageWidth, unsigned int imageHeight, float4* outValue)
{
bool valid = getValueNearestNeighbourNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
return valid && (outValue->x != MINF && outValue->y != MINF && outValue->z != MINF);
}
// Nearest neighbour
inline __device__ bool getValueNearestNeighbourFloatNoCheck(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
{
const int u = (int)(p.x + 0.5f);
const int v = (int)(p.y + 0.5f);
if(u < 0 || u >= imageWidth || v < 0 || v >= imageHeight) return false;
*outValue = inputMap[v*imageWidth + u];
return true;
}
inline __device__ bool getValueNearestNeighbourFloat(const float2& p, const float* inputMap, unsigned int imageWidth, unsigned int imageHeight, float* outValue)
{
bool valid = getValueNearestNeighbourFloatNoCheck(p, inputMap, imageWidth, imageHeight, outValue);
return valid && (*outValue != MINF);
}
/////////////////////////////////////////////
// Compute derivatives in camera space
/////////////////////////////////////////////
__global__ void computeDerivativesCameraSpaceDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_positionsDU[index] = make_float4(MINF, MINF, MINF, MINF);
d_positionsDV[index] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1)
{
float4 pos00; bool valid00 = getValueNearestNeighbour(make_float2(x-1, y-1), d_positions, imageWidth, imageHeight, &pos00); if(!valid00) return;
float4 pos01; bool valid01 = getValueNearestNeighbour(make_float2(x-1, y-0), d_positions, imageWidth, imageHeight, &pos01); if(!valid01) return;
float4 pos02; bool valid02 = getValueNearestNeighbour(make_float2(x-1, y+1), d_positions, imageWidth, imageHeight, &pos02); if(!valid02) return;
float4 pos10; bool valid10 = getValueNearestNeighbour(make_float2(x-0, y-1), d_positions, imageWidth, imageHeight, &pos10); if(!valid10) return;
float4 pos11; bool valid11 = getValueNearestNeighbour(make_float2(x-0, y-0), d_positions, imageWidth, imageHeight, &pos11); if(!valid11) return;
float4 pos12; bool valid12 = getValueNearestNeighbour(make_float2(x-0, y+1), d_positions, imageWidth, imageHeight, &pos12); if(!valid12) return;
float4 pos20; bool valid20 = getValueNearestNeighbour(make_float2(x+1, y-1), d_positions, imageWidth, imageHeight, &pos20); if(!valid20) return;
float4 pos21; bool valid21 = getValueNearestNeighbour(make_float2(x+1, y-0), d_positions, imageWidth, imageHeight, &pos21); if(!valid21) return;
float4 pos22; bool valid22 = getValueNearestNeighbour(make_float2(x+1, y+1), d_positions, imageWidth, imageHeight, &pos22); if(!valid22) return;
float4 resU = (-1.0f)*pos00 + (1.0f)*pos20 +
(-2.0f)*pos01 + (2.0f)*pos21 +
(-1.0f)*pos02 + (1.0f)*pos22;
resU /= 8.0f;
float4 resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 +
( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22;
resV /= 8.0f;
//if(mat3x1(make_float3(resU)).norm1D() > 0.02f) return;
//if(mat3x1(make_float3(resV)).norm1D() > 0.02f) return;
d_positionsDU[index] = resU;
d_positionsDV[index] = resV;
}
}
}
extern "C" void computeDerivativesCameraSpace(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_positionsDU, float4* d_positionsDV)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeDerivativesCameraSpaceDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_positionsDU, d_positionsDV);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
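// Both derivative kernels in this file apply the 3x3 Sobel stencils
//   d/du: [-1 0 1; -2 0 2; -1 0 1] / 8
//   d/dv: [-1 -2 -1; 0 0 0; 1 2 1] / 8
// where the 1/8 normalisation makes the response match a per-pixel central
// difference.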
/////////////////////////////////////////////
// Compute Intensity and Derivatives
/////////////////////////////////////////////
__global__ void computeIntensityAndDerivativesDevice(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_intensityAndDerivatives[index] = make_float4(MINF, MINF, MINF, MINF);
if(x > 0 && x < imageWidth - 1 && y > 0 && y < imageHeight - 1)
{
float pos00; bool valid00 = getValueNearestNeighbourFloat(make_float2(x-1, y-1), d_intensity, imageWidth, imageHeight, &pos00); if(!valid00) return;
float pos01; bool valid01 = getValueNearestNeighbourFloat(make_float2(x-1, y-0), d_intensity, imageWidth, imageHeight, &pos01); if(!valid01) return;
float pos02; bool valid02 = getValueNearestNeighbourFloat(make_float2(x-1, y+1), d_intensity, imageWidth, imageHeight, &pos02); if(!valid02) return;
float pos10; bool valid10 = getValueNearestNeighbourFloat(make_float2(x-0, y-1), d_intensity, imageWidth, imageHeight, &pos10); if(!valid10) return;
float pos11; bool valid11 = getValueNearestNeighbourFloat(make_float2(x-0, y-0), d_intensity, imageWidth, imageHeight, &pos11); if(!valid11) return;
float pos12; bool valid12 = getValueNearestNeighbourFloat(make_float2(x-0, y+1), d_intensity, imageWidth, imageHeight, &pos12); if(!valid12) return;
float pos20; bool valid20 = getValueNearestNeighbourFloat(make_float2(x+1, y-1), d_intensity, imageWidth, imageHeight, &pos20); if(!valid20) return;
float pos21; bool valid21 = getValueNearestNeighbourFloat(make_float2(x+1, y-0), d_intensity, imageWidth, imageHeight, &pos21); if(!valid21) return;
float pos22; bool valid22 = getValueNearestNeighbourFloat(make_float2(x+1, y+1), d_intensity, imageWidth, imageHeight, &pos22); if(!valid22) return;
float resU = (-1.0f)*pos00 + (1.0f)*pos20 +
(-2.0f)*pos01 + (2.0f)*pos21 +
(-1.0f)*pos02 + (1.0f)*pos22;
resU /= 8.0f;
float resV = (-1.0f)*pos00 + (-2.0f)*pos10 + (-1.0f)*pos20 +
( 1.0f)*pos02 + ( 2.0f)*pos12 + ( 1.0f)*pos22;
resV /= 8.0f;
d_intensityAndDerivatives[index] = make_float4(pos11, resU, resV, 1.0f);
}
}
}
extern "C" void computeIntensityAndDerivatives(float* d_intensity, unsigned int imageWidth, unsigned int imageHeight, float4* d_intensityAndDerivatives)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeIntensityAndDerivativesDevice<<<gridSize, blockSize>>>(d_intensity, imageWidth, imageHeight, d_intensityAndDerivatives);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////
// Compute gradient intensity magnitude
/////////////////////////////////////////////
__global__ void computeGradientIntensityMagnitudeDevice(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= imageWidth || y >= imageHeight) return; // guard against out-of-bounds writes in partial blocks
const int index = y*imageWidth+x;
d_output[index] = make_float4(MINF, MINF, MINF, MINF);
float4 DU = d_inputDU[index];
float4 DV = d_inputDV[index];
if(DU.x != MINF && DV.x != MINF)
{
float m = sqrtf(DU.x*DU.x+DV.x*DV.x);
if(m > 0.005f)
{
d_output[index] = make_float4(m, m, m, 1.0f);
}
}
}
extern "C" void computeGradientIntensityMagnitude(float4* d_inputDU, float4* d_inputDV, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
computeGradientIntensityMagnitudeDevice<<<gridSize, blockSize>>>(d_inputDU, d_inputDV, imageWidth, imageHeight, d_output);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////
// Transform
/////////////////////////////////////////////
__global__ void transformCameraSpaceMapDevice(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
const int index = y*imageWidth+x;
if(x >= 0 && x < imageWidth && y >= 0 && y < imageHeight)
{
d_output[index] = d_positions[index];
if(d_positions[index].x != MINF && d_positions[index].y != MINF && d_positions[index].z != MINF)
{
d_output[index] = d_positions[index]+make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
}
}
extern "C" void transformCameraSpaceMap(float4* d_positions, unsigned int imageWidth, unsigned int imageHeight, float4* d_output)
{
const dim3 gridSize((imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
transformCameraSpaceMapDevice<<<gridSize, blockSize>>>(d_positions, imageWidth, imageHeight, d_output);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
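// Note: as written, transformCameraSpaceMapDevice is an identity pass-through
// (it adds a zero offset to every valid position), presumably a stub where a
// real rigid transform could later be applied.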
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Erode Depth Map
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void erodeDepthMapDevice(float* d_output, float* d_input, int structureSize, int width, int height, float dThresh, float fracReq)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if(x >= 0 && x < width && y >= 0 && y < height)
{
unsigned int count = 0;
float oldDepth = d_input[y*width+x];
for(int i = -structureSize; i<=structureSize; i++)
{
for(int j = -structureSize; j<=structureSize; j++)
{
if(x+j >= 0 && x+j < width && y+i >= 0 && y+i < height)
{
float depth = d_input[(y+i)*width+(x+j)];
if(depth == MINF || depth == 0.0f || fabs(depth-oldDepth) > dThresh)
{
count++;
//d_output[y*width+x] = MINF;
//return;
}
}
}
}
unsigned int sum = (2*structureSize+1)*(2*structureSize+1);
if ((float)count/(float)sum >= fracReq) {
d_output[y*width+x] = MINF;
} else {
d_output[y*width+x] = d_input[y*width+x];
}
}
}
extern "C" void erodeDepthMap(float* d_output, float* d_input, int structureSize, unsigned int width, unsigned int height, float dThresh, float fracReq)
{
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
erodeDepthMapDevice<<<gridSize, blockSize>>>(d_output, d_input, structureSize, width, height, dThresh, fracReq);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
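// Example (host-side sketch, parameter values are illustrative only): mark a
// pixel invalid when at least 20% of its 5x5 neighbourhood (structureSize = 2)
// is missing or jumps by more than 3cm:
//   erodeDepthMap(d_depthOut, d_depthIn, 2, width, height, 0.03f, 0.2f);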
////////////////////////////////////
// Depth to HSV map conversion /////
////////////////////////////////////
__device__ float3 convertHSVToRGB(const float3& hsv) {
float H = hsv.x;
float S = hsv.y;
float V = hsv.z;
float hd = H/60.0f;
unsigned int h = (unsigned int)hd;
float f = hd-h;
float p = V*(1.0f-S);
float q = V*(1.0f-S*f);
float t = V*(1.0f-S*(1.0f-f));
if(h == 0 || h == 6)
{
return make_float3(V, t, p);
}
else if(h == 1)
{
return make_float3(q, V, p);
}
else if(h == 2)
{
return make_float3(p, V, t);
}
else if(h == 3)
{
return make_float3(p, q, V);
}
else if(h == 4)
{
return make_float3(t, p, V);
}
else
{
return make_float3(V, p, q);
}
}
__device__ float3 convertDepthToRGB(float depth, float depthMin, float depthMax) {
float depthZeroOne = (depth - depthMin)/(depthMax - depthMin);
float x = 1.0f-depthZeroOne;
if (x < 0.0f) x = 0.0f;
if (x > 1.0f) x = 1.0f;
//return convertHSVToRGB(make_float3(240.0f*x, 1.0f, 0.5f));
x = 360.0f*x - 120.0f;
if (x < 0.0f) x += 359.0f;
return convertHSVToRGB(make_float3(x, 1.0f, 0.5f));
}
__global__ void depthToHSVDevice(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth)
{
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= 0 && x < width && y >= 0 && y < height) {
float depth = d_input[y*width + x];
if (depth != MINF && depth != 0.0f && depth >= minDepth && depth <= maxDepth) {
float3 c = convertDepthToRGB(depth, minDepth, maxDepth);
d_output[y*width + x] = make_float4(c, 1.0f);
} else {
d_output[y*width + x] = make_float4(0.0f);
}
}
}
extern "C" void depthToHSV(float4* d_output, float* d_input, unsigned int width, unsigned int height, float minDepth, float maxDepth) {
const dim3 gridSize((width + T_PER_BLOCK - 1)/T_PER_BLOCK, (height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
depthToHSVDevice<<<gridSize, blockSize>>>(d_output, d_input, width, height, minDepth, maxDepth);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
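// Worked example: with minDepth = 1m and maxDepth = 3m, a point at 2m gives
// depthZeroOne = 0.5, hence x = 0.5 and a hue of 360*0.5 - 120 = 60 degrees
// (yellow); the nearest depths map to blue (240 degrees) and the hue sweeps
// through green and red, wrapping around towards the far end of the range.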
#endif // _CAMERA_UTIL_
\ No newline at end of file
@@ -7,6 +7,9 @@
#include <glog/logging.h>
#include <ftl/config.h>
#include <ftl/configuration.hpp>
+#include <ftl/depth_camera.hpp>
+#include <ftl/scene_rep_hash_sdf.hpp>
+#include <ftl/ray_cast_sdf.hpp>
#include <ftl/rgbd.hpp>
// #include <zlib.h>
@@ -67,6 +70,7 @@ using cv::Mat;
namespace ftl {
namespace rgbd {
PointCloud<PointXYZRGB>::Ptr createPointCloud(RGBDSource *src);
+PointCloud<PointXYZRGB>::Ptr createPointCloud(RGBDSource *src, const cv::Point3_<uchar> &col);
}
}
@@ -87,14 +91,14 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src) {
for(int i=0;i<rgb.rows;i++) {
const float *sptr = depth.ptr<float>(i);
for(int j=0;j<rgb.cols;j++) {
-float d = sptr[j] * 1000.0f;
+float d = sptr[j]; // * 1000.0f;
pcl::PointXYZRGB point;
point.x = (((double)j + CX) / FX) * d;
point.y = (((double)i + CY) / FY) * d;
point.z = d;
-if (point.x == INFINITY || point.y == INFINITY || point.z > 20000.0f || point.z < 0.04f) {
+if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) {
point.x = 0.0f; point.y = 0.0f; point.z = 0.0f;
}
@@ -109,6 +113,44 @@ PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src) {
return point_cloud_ptr;
}
+PointCloud<PointXYZRGB>::Ptr ftl::rgbd::createPointCloud(RGBDSource *src, const cv::Point3_<uchar> &prgb) {
+const double CX = src->getParameters().cx;
+const double CY = src->getParameters().cy;
+const double FX = src->getParameters().fx;
+const double FY = src->getParameters().fy;
+cv::Mat rgb;
+cv::Mat depth;
+src->getRGBD(rgb, depth);
+pcl::PointCloud<pcl::PointXYZRGB>::Ptr point_cloud_ptr(new pcl::PointCloud<pcl::PointXYZRGB>);
+point_cloud_ptr->width = rgb.cols * rgb.rows;
+point_cloud_ptr->height = 1;
+for(int i=0;i<rgb.rows;i++) {
+const float *sptr = depth.ptr<float>(i);
+for(int j=0;j<rgb.cols;j++) {
+float d = sptr[j]; // * 1000.0f;
+pcl::PointXYZRGB point;
+point.x = (((double)j + CX) / FX) * d;
+point.y = (((double)i + CY) / FY) * d;
+point.z = d;
+if (point.x == INFINITY || point.y == INFINITY || point.z > 20.0f || point.z < 0.04f) {
+point.x = 0.0f; point.y = 0.0f; point.z = 0.0f;
+}
+uint32_t rgb = (static_cast<uint32_t>(prgb.z) << 16 | static_cast<uint32_t>(prgb.y) << 8 | static_cast<uint32_t>(prgb.x));
+point.rgb = *reinterpret_cast<float*>(&rgb);
+point_cloud_ptr -> points.push_back(point);
+}
+}
+return point_cloud_ptr;
+}
#ifdef HAVE_PCL
#include <pcl/filters/uniform_sampling.h>
@@ -181,8 +223,14 @@ void saveRegistration(std::map<string, Eigen::Matrix4f> registration) {
file << save;
}
+struct Cameras {
+RGBDSource *source;
+DepthCameraData gpu;
+DepthCameraParams params;
+};
template <template<class> class Container>
-std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<RGBDSource*> &inputs) {
+std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Container<Cameras> &inputs) {
std::map<string, Eigen::Matrix4f> registration;
@@ -205,7 +253,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
size_t ref_i;
bool found = false;
for (size_t i = 0; i < inputs.size(); i++) {
-if (inputs[i]->getConfig()["uri"] == ref_uri) {
+if (inputs[i].source->getConfig()["uri"] == ref_uri) {
ref_i = i;
found = true;
break;
@@ -215,7 +263,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
if (!found) { LOG(ERROR) << "Reference input not found!"; return registration; }
for (auto &input : inputs) {
-cv::namedWindow("Registration: " + (string)input->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL);
+cv::namedWindow("Registration: " + (string)input.source->getConfig()["uri"], cv::WINDOW_KEEPRATIO|cv::WINDOW_NORMAL);
}
// vector for every input: vector of point clouds of patterns
@@ -227,7 +275,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
vector<PointCloud<PointXYZ>::Ptr> patterns_iter(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
-RGBDSource *input = inputs[i];
+RGBDSource *input = inputs[i].source;
Mat rgb, depth;
input->getRGBD(rgb, depth);
@@ -250,7 +298,7 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
else { LOG(WARNING) << "Pattern not detected on all inputs";}
}
-for (auto &input : inputs) { cv::destroyWindow("Registration: " + (string)input->getConfig()["uri"]); }
+for (auto &input : inputs) { cv::destroyWindow("Registration: " + (string)input.source->getConfig()["uri"]); }
for (size_t i = 0; i < inputs.size(); i++) {
Eigen::Matrix4f T;
@@ -260,13 +308,48 @@ std::map<string, Eigen::Matrix4f> runRegistration(ftl::net::Universe &net, Conta
else {
T = ftl::registration::findTransformation(patterns[i], patterns[ref_i]);
}
-registration[(string)inputs[i]->getConfig()["uri"]] = T;
+registration[(string)inputs[i].source->getConfig()["uri"]] = T;
}
saveRegistration(registration);
return registration;
}
#endif
+template<class T>
+Eigen::Matrix<T,4,4> lookAt
+(
+Eigen::Matrix<T,3,1> const & eye,
+Eigen::Matrix<T,3,1> const & center,
+Eigen::Matrix<T,3,1> const & up
+)
+{
+typedef Eigen::Matrix<T,4,4> Matrix4;
+typedef Eigen::Matrix<T,3,1> Vector3;
+Vector3 f = (center - eye).normalized();
+Vector3 u = up.normalized();
+Vector3 s = f.cross(u).normalized();
+u = s.cross(f);
+Matrix4 res;
+res << s.x(),s.y(),s.z(),-s.dot(eye),
+u.x(),u.y(),u.z(),-u.dot(eye),
+-f.x(),-f.y(),-f.z(),f.dot(eye),
+0,0,0,1;
+return res;
+}
+using MouseAction = std::function<void(int, int, int, int)>;
+static void setMouseAction(const std::string& winName, const MouseAction &action)
+{
+cv::setMouseCallback(winName,
+[] (int event, int x, int y, int flags, void* userdata) {
+(*(MouseAction*)userdata)(event, x, y, flags);
+}, (void*)&action);
+}
static void run() {
Universe net(config["net"]);
@@ -278,8 +361,8 @@ static void run() {
return;
}
-std::deque<RGBDSource*> inputs;
-std::vector<Display> displays;
+std::deque<Cameras> inputs;
+//std::vector<Display> displays;
// TODO Allow for non-net source types
for (auto &src : config["sources"]) {
@@ -287,9 +370,20 @@ static void run() {
if (!in) {
LOG(ERROR) << "Unrecognised source: " << src;
} else {
-inputs.push_back(in);
-displays.emplace_back(config["display"], src["uri"]);
-LOG(INFO) << (string)src["uri"] << " loaded";
+auto &cam = inputs.emplace_back();
+cam.source = in;
+cam.params.fx = in->getParameters().fx;
+cam.params.fy = in->getParameters().fy;
+cam.params.mx = -in->getParameters().cx;
+cam.params.my = -in->getParameters().cy;
+cam.params.m_imageWidth = in->getParameters().width;
+cam.params.m_imageHeight = in->getParameters().height;
+cam.params.m_sensorDepthWorldMax = in->getParameters().maxDepth;
+cam.params.m_sensorDepthWorldMin = in->getParameters().minDepth;
+cam.gpu.alloc(cam.params);
+//displays.emplace_back(config["display"], src["uri"]);
+LOG(INFO) << (string)src["uri"] << " loaded " << cam.params.m_imageWidth;
}
}
@@ -297,63 +391,126 @@
// load point cloud transformations
-#ifdef HAVE_PCL
std::map<string, Eigen::Matrix4f> registration;
if (config["registration"]["calibration"]["run"]) {
registration = runRegistration(net, inputs);
}
else {
+LOG(INFO) << "LOAD REG";
registration = loadRegistration();
}
+LOG(INFO) << "Assigning poses";
vector<Eigen::Matrix4f> T;
for (auto &input : inputs) {
-Eigen::Matrix4f RT = (registration.count((string)input->getConfig()["uri"]) > 0) ? registration[(string)input->getConfig()["uri"]] : registration["default"];
+LOG(INFO) << (unsigned long long)input.source;
+Eigen::Matrix4f RT = (registration.count(input.source->getConfig()["uri"].get<string>()) > 0) ? registration[(string)input.source->getConfig()["uri"]] : registration["default"];
T.push_back(RT);
+input.source->setPose(RT);
}
-//
-vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size());
+//vector<PointCloud<PointXYZRGB>::Ptr> clouds(inputs.size());
Display display_merged(config["display"], "Merged"); // todo
+CUDARayCastSDF rays(config["voxelhash"]);
+ftl::voxhash::SceneRep scene(config["voxelhash"]);
+cv::Mat colour_array(cv::Size(rays.getRayCastParams().m_width,rays.getRayCastParams().m_height), CV_8UC3);
+// Force window creation
+display_merged.render(colour_array);
+display_merged.wait(1);
+unsigned char frameCount = 0;
+bool paused = false;
+float cam_x = 0.0f;
+float cam_z = 0.0f;
+Eigen::Vector3f eye(0.0f, 0.0f, 0.0f);
+Eigen::Vector3f centre(0.0f, 0.0f, -4.0f);
+Eigen::Vector3f up(0,1.0f,0);
+Eigen::Vector3f lookPoint(0.0f,1.0f,-4.0f);
+Eigen::Matrix4f viewPose;
+float lerpSpeed = 0.4f;
+// Keyboard camera controls
+display_merged.onKey([&paused,&eye,&centre](int key) {
+LOG(INFO) << "Key = " << key;
+if (key == 32) paused = !paused;
+else if (key == 81) eye[0] += 0.02f;
+else if (key == 83) eye[0] -= 0.02f;
+else if (key == 84) eye[2] += 0.02f;
+else if (key == 82) eye[2] -= 0.02f;
+});
+// TODO(Nick) Calculate "camera" properties of viewport.
+MouseAction mouseact = [&inputs,&lookPoint,&viewPose]( int event, int ux, int uy, int) {
+LOG(INFO) << "Mouse " << ux << "," << uy;
+if (event == 1) { // click
+const float x = ((float)ux-inputs[0].params.mx) / inputs[0].params.fx;
+const float y = ((float)uy-inputs[0].params.my) / inputs[0].params.fy;
+const float depth = -4.0f;
+Eigen::Vector4f camPos(x*depth,y*depth,depth,1.0);
+Eigen::Vector4f worldPos = viewPose * camPos;
+lookPoint = Eigen::Vector3f(worldPos[0],worldPos[1],worldPos[2]);
+}
+};
+::setMouseAction("Image", mouseact);
-int active = displays.size();
+int active = inputs.size();
while (active > 0 && display_merged.active()) {
active = 0;
+if (!paused) {
net.broadcast("grab"); // To sync cameras
+scene.nextFrame();
-PointCloud<PointXYZRGB>::Ptr cloud(new PointCloud<PointXYZRGB>);
for (size_t i = 0; i < inputs.size(); i++) {
+//if (i == 1) continue;
//Display &display = displays[i];
-RGBDSource *input = inputs[i];
+RGBDSource *input = inputs[i].source;
Mat rgb, depth;
+//LOG(INFO) << "GetRGB";
input->getRGBD(rgb,depth);
//if (!display.active()) continue;
active += 1;
-clouds[i] = ftl::rgbd::createPointCloud(input);
+//clouds[i] = ftl::rgbd::createPointCloud(input);
//display.render(rgb, depth,input->getParameters());
//display.render(clouds[i]);
//display.wait(5);
-}
-for (size_t i = 0; i < clouds.size(); i++) {
-pcl::transformPointCloud(*clouds[i], *clouds[i], T[i]);
-*cloud += *clouds[i];
+//LOG(INFO) << "Data size: " << depth.cols << "," << depth.rows;
+if (depth.cols == 0) continue;
+Mat rgba;
+cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA);
+inputs[i].params.flags = frameCount;
+// Send to GPU and merge view into scene
+inputs[i].gpu.updateParams(inputs[i].params);
+inputs[i].gpu.updateData(depth, rgba);
+scene.integrate(inputs[i].source->getPose(), inputs[i].gpu, inputs[i].params, nullptr);
+}
+} else {
+active = 1;
}
-pcl::UniformSampling<PointXYZRGB> uniform_sampling;
-uniform_sampling.setInputCloud(cloud);
-uniform_sampling.setRadiusSearch(0.1f);
-uniform_sampling.filter(*cloud);
+frameCount++;
+// Set virtual camera transformation matrix
+//Eigen::Affine3f transform(Eigen::Translation3f(cam_x,0.0f,cam_z));
+centre += (lookPoint - centre) * (lerpSpeed * 0.1f);
+viewPose = lookAt<float>(eye,centre,up); // transform.matrix();
-display_merged.render(cloud);
-display_merged.wait(50);
+// Call GPU renderer and download result into OpenCV mat
+rays.render(scene.getHashData(), scene.getHashParams(), inputs[0].gpu, viewPose);
+rays.getRayCastData().download(nullptr, (uchar3*)colour_array.data, rays.getRayCastParams());
+display_merged.render(colour_array);
+display_merged.wait(1);
}
-#endif
-// TODO non-PCL (?)
}
int main(int argc, char **argv) {
...
//#include <stdafx.h>
#include <ftl/voxel_hash.hpp>
//#include "Util.h"
#include <ftl/ray_cast_sdf.hpp>
extern "C" void renderCS(
const ftl::voxhash::HashData& hashData,
const RayCastData &rayCastData,
const DepthCameraData &cameraData,
const RayCastParams &rayCastParams);
extern "C" void computeNormals(float4* d_output, float3* d_input, unsigned int width, unsigned int height);
extern "C" void convertDepthFloatToCameraSpaceFloat3(float3* d_output, float* d_input, float4x4 intrinsicsInv, unsigned int width, unsigned int height, const DepthCameraData& depthCameraData);
extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params);
extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData,
const RayCastData &rayCastData, const RayCastParams &rayCastParams);
extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &params);
void CUDARayCastSDF::create(const RayCastParams& params)
{
m_params = params;
m_data.allocate(m_params);
//m_rayIntervalSplatting.OnD3D11CreateDevice(DXUTGetD3D11Device(), params.m_width, params.m_height);
}
void CUDARayCastSDF::destroy(void)
{
m_data.free();
//m_rayIntervalSplatting.OnD3D11DestroyDevice();
}
void CUDARayCastSDF::render(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform)
{
rayIntervalSplatting(hashData, hashParams, cameraData, lastRigidTransform);
//m_data.d_rayIntervalSplatMinArray = m_rayIntervalSplatting.mapMinToCuda();
//m_data.d_rayIntervalSplatMaxArray = m_rayIntervalSplatting.mapMaxToCuda();
if (hash_render_) nickRenderCUDA(hashData, hashParams, m_data, cameraData, m_params);
else renderCS(hashData, m_data, cameraData, m_params);
//convertToCameraSpace(cameraData);
if (!m_params.m_useGradients)
{
computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height);
}
//m_rayIntervalSplatting.unmapCuda();
}
void CUDARayCastSDF::convertToCameraSpace(const DepthCameraData& cameraData)
{
convertDepthFloatToCameraSpaceFloat3(m_data.d_depth3, m_data.d_depth, m_params.m_intrinsicsInverse, m_params.m_width, m_params.m_height, cameraData);
if(!m_params.m_useGradients) {
computeNormals(m_data.d_normals, m_data.d_depth3, m_params.m_width, m_params.m_height);
}
}
void CUDARayCastSDF::rayIntervalSplatting(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& cameraData, const Eigen::Matrix4f& lastRigidTransform)
{
if (hashParams.m_numOccupiedBlocks == 0) return;
if (m_params.m_maxNumVertices <= 6*hashParams.m_numOccupiedBlocks) { // 6 verts (2 triangles) per block
throw std::runtime_error("not enough space for vertex buffer for ray interval splatting");
}
m_params.m_numOccupiedSDFBlocks = hashParams.m_numOccupiedBlocks;
m_params.m_viewMatrix = MatrixConversion::toCUDA(lastRigidTransform.inverse());
m_params.m_viewMatrixInverse = MatrixConversion::toCUDA(lastRigidTransform);
m_data.updateParams(m_params); // !!! debugging
//don't use ray interval splatting (cf. CUDARayCastSDF.cu -> line 40)
//m_rayIntervalSplatting.rayIntervalSplatting(DXUTGetD3D11DeviceContext(), hashData, cameraData, m_data, m_params, m_params.m_numOccupiedSDFBlocks*6);
}
\ No newline at end of file
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include <ftl/depth_camera.hpp>
#include <ftl/voxel_hash.hpp>
#include <ftl/ray_cast_util.hpp>
#define T_PER_BLOCK 8
#define NUM_GROUPS_X 1024
//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMinTextureRef;
//texture<float, cudaTextureType2D, cudaReadModeElementType> rayMaxTextureRef;
__global__ void renderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
const RayCastParams& rayCastParams = c_rayCastParams;
if (x < rayCastParams.m_width && y < rayCastParams.m_height) {
rayCastData.d_depth[y*rayCastParams.m_width+x] = MINF;
rayCastData.d_depth3[y*rayCastParams.m_width+x] = make_float3(MINF,MINF,MINF);
rayCastData.d_normals[y*rayCastParams.m_width+x] = make_float4(MINF,MINF,MINF,MINF);
rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0);
float3 camDir = normalize(cameraData.kinectProjToCamera(x, y, 1.0f));
float3 worldCamPos = rayCastParams.m_viewMatrixInverse * make_float3(0.0f, 0.0f, 0.0f);
float4 w = rayCastParams.m_viewMatrixInverse * make_float4(camDir, 0.0f);
float3 worldDir = normalize(make_float3(w.x, w.y, w.z));
////use ray interval splatting
//float minInterval = tex2D(rayMinTextureRef, x, y);
//float maxInterval = tex2D(rayMaxTextureRef, x, y);
//don't use ray interval splatting
float minInterval = rayCastParams.m_minDepth;
float maxInterval = rayCastParams.m_maxDepth;
//if (minInterval == 0 || minInterval == MINF) minInterval = rayCastParams.m_minDepth;
//if (maxInterval == 0 || maxInterval == MINF) maxInterval = rayCastParams.m_maxDepth;
//TODO MATTHIAS: shouldn't this return in the case no interval is found?
if (minInterval == 0 || minInterval == MINF) return;
if (maxInterval == 0 || maxInterval == MINF) return;
// debugging
//if (maxInterval < minInterval) {
// printf("ERROR (%d,%d): [ %f, %f ]\n", x, y, minInterval, maxInterval);
//}
rayCastData.traverseCoarseGridSimpleSampleAll(hashData, cameraData, worldCamPos, worldDir, camDir, make_int3(x,y,1), minInterval, maxInterval);
}
}
extern "C" void renderCS(const ftl::voxhash::HashData& hashData, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &rayCastParams)
{
const dim3 gridSize((rayCastParams.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (rayCastParams.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
//cudaBindTextureToArray(rayMinTextureRef, rayCastData.d_rayIntervalSplatMinArray, channelDesc);
//cudaBindTextureToArray(rayMaxTextureRef, rayCastData.d_rayIntervalSplatMaxArray, channelDesc);
//printf("Ray casting render...\n");
renderKernel<<<gridSize, blockSize>>>(hashData, rayCastData, cameraData);
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
//cutilCheckMsg(__FUNCTION__);
#endif
}
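// Launch-configuration note: (width + T_PER_BLOCK - 1) / T_PER_BLOCK is the
// usual ceiling division, e.g. a 640x480 target with T_PER_BLOCK = 8 gives an
// 80x60 grid of 8x8 blocks; the in-kernel bounds test covers any partial
// blocks when the resolution is not a multiple of 8.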
////////////////////////////////////////////////////////////////////////////////
// Nicks render approach
////////////////////////////////////////////////////////////////////////////////
__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData)
{
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
const RayCastParams& rayCastParams = c_rayCastParams;
if (x < rayCastParams.m_width && y < rayCastParams.m_height) {
rayCastData.d_depth_i[y*rayCastParams.m_width+x] = 0x7FFFFFFF; //PINF;
rayCastData.d_colors[y*rayCastParams.m_width+x] = make_uchar3(0,0,0);
}
}
#define SDF_BLOCK_SIZE_PAD 9
#define SDF_BLOCK_BUFFER 1024 // > 9x9x9
#define SDF_DX 1
#define SDF_DY SDF_BLOCK_SIZE_PAD
#define SDF_DZ (SDF_BLOCK_SIZE_PAD*SDF_BLOCK_SIZE_PAD)
__device__
float frac(float val) {
return (val - floorf(val));
}
__device__
float3 frac(const float3& val) {
return make_float3(frac(val.x), frac(val.y), frac(val.z));
}
__host__ size_t nickSharedMem() {
return sizeof(float)*SDF_BLOCK_BUFFER +
sizeof(uchar)*SDF_BLOCK_BUFFER +
sizeof(float)*SDF_BLOCK_BUFFER +
sizeof(float3)*SDF_BLOCK_BUFFER;
}
/*__device__ void loadVoxel(const ftl::voxhash::HashData &hash, const int3 &vox, float *sdf, uint *weight, float3 *colour) {
ftl::voxhash::Voxel &v = hashData.getVoxel(vox);
*sdf = v.sdf;
*weight = v.weight;
*colour = v.color;
}*/
//! computes the (local) virtual voxel pos of an index within a padded block; idx in [0;728]
__device__
int3 pdelinVoxelIndex(uint idx) {
int x = idx % SDF_BLOCK_SIZE_PAD;
int y = (idx % (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD)) / SDF_BLOCK_SIZE_PAD;
int z = idx / (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD);
return make_int3(x,y,z);
}
//! computes the linearized index of a local virtual voxel pos within the padded block; pos in [0;8]^3
__device__
uint plinVoxelPos(const int3& virtualVoxelPos) {
return
virtualVoxelPos.z * SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.y * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.x;
}
__device__
void deleteVoxel(ftl::voxhash::Voxel& v) {
v.color = make_uchar3(0,0,0);
v.weight = 0;
v.sdf = PINF;
}
__device__ inline int3 blockDelinear(const int3 &base, uint i) {
// Extract one bit per axis so each offset is 0 or 1; the raw masks
// (i & 0x2) and (i & 0x4) would have offset by 2 and 4 blocks respectively.
return make_int3(base.x + (i & 0x1), base.y + ((i & 0x2) >> 1), base.z + ((i & 0x4) >> 2));
}
__device__ inline uint blockLinear(int x, int y, int z) {
return x + (y << 1) + (z << 2);
}
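// blockLinear/blockDelinear map between an index i in [0;7] and one of the
// eight SDF blocks in the +x/+y/+z corner neighbourhood, one bit per axis
// (bit 0 = x, bit 1 = y, bit 2 = z); blockDelinear is the inverse of
// blockLinear.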
__device__ inline void trilinearInterp(const ftl::voxhash::HashData &hashData, const ftl::voxhash::Voxel *voxels, const uint *ix, const float3 &pos, float &depth, uchar3 &colour) {
float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(pos)); // Should be world position of ray, not voxel??
float dist = 0.0f;
dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
depth = dist;
colour = make_uchar3(colorFloat);
}
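// trilinearInterp blends the eight voxels of the interpolation cell with the
// usual trilinear weights w = frac(voxel-space position), i.e. each corner
// (dx,dy,dz) in {0,1}^3 contributes
//   (dx ? w.x : 1-w.x) * (dy ? w.y : 1-w.y) * (dz ? w.z : 1-w.z)
// times its SDF value and colour.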
__global__ void nickRenderKernel(ftl::voxhash::HashData hashData, RayCastData rayCastData, DepthCameraData cameraData, RayCastParams params) {
// TODO(Nick) Reduce bank conflicts by aligning these
__shared__ ftl::voxhash::Voxel voxels[SDF_BLOCK_BUFFER];
__shared__ ftl::voxhash::HashEntry blocks[8];
const uint i = threadIdx.x; //inside of an SDF block
//TODO (Nick) Either don't use compactified or re-run compacitification using render cam frustrum
if (i == 0) blocks[0] = hashData.d_hashCompactified[blockIdx.x];
// blocks[0].pos must be visible to every thread before the neighbour lookups
__syncthreads();
if (i >= 1 && i <= 7) blocks[i] = hashData.getHashEntryForSDFBlockPos(blockDelinear(blocks[0].pos, i));
// Make sure all hash entries are cached
__syncthreads();
const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(blocks[0].pos);
const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
const int3 pi = pi_base + vp;
const uint j = plinVoxelPos(vp); // Padded linear index
const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
// Load distances and colours into shared memory + padding
const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[blocks[0].ptr + i];
voxels[j] = v;
// TODO (Nick) Coalesce memory better?
uint imod = i & 0x7;
bool v_do = imod < 3;
if (v_do) {
const uint v_a = (i >> 3) & 0x7;
const uint v_c = (i >> 6);
const uint v_b = (imod >= 1) ? v_c : v_a;
const int3 v_cache = make_int3(((imod == 0) ? 8 : v_a), ((imod == 1) ? 8 : v_b), ((imod == 2) ? 8 : v_c));
const int3 v_ii = make_int3((imod == 0) ? 0 : v_a, (imod == 1) ? 0 : v_b, (imod == 2) ? 0 : v_c);
const int v_block = blockLinear((imod == 0) ? 1 : 0, (imod == 1) ? 1 : 0, (imod == 2) ? 1 : 0);
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(v_cache)];
const uint ii = hashData.linearizeVoxelPos(v_ii);
if (blocks[v_block].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[v_block].ptr + ii];
else deleteVoxel(padVox);
}
/*if (vp.x == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x+1,vp.y,vp.z))];
const uint ii = hashData.linearizeVoxelPos(make_int3(0,vp.y,vp.z));
//padVox = hashData.getVoxel(make_int3(pi.x+1,pi.y,pi.z));
if (blocks[blockLinear(1,0,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(1,0,0)].ptr + ii];
else deleteVoxel(padVox);
}
if (vp.y == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y+1,vp.z))];
const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,0,vp.z));
//padVox = hashData.getVoxel(make_int3(pi.x,pi.y+1,pi.z));
if (blocks[blockLinear(0,1,0)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,1,0)].ptr + ii];
else deleteVoxel(padVox);
}
if (vp.z == 7) {
ftl::voxhash::Voxel &padVox = voxels[plinVoxelPos(make_int3(vp.x,vp.y,vp.z+1))];
const uint ii = hashData.linearizeVoxelPos(make_int3(vp.x,vp.y,0));
//padVox = hashData.getVoxel(make_int3(pi.x,pi.y,pi.z+1));
if (blocks[blockLinear(0,0,1)].ptr != ftl::voxhash::FREE_ENTRY) padVox = hashData.d_SDFBlocks[blocks[blockLinear(0,0,1)].ptr + ii];
else deleteVoxel(padVox);
}*/
// Indexes of the 8 neighbor voxels in one direction
const uint ix[8] = {
j, j+SDF_DX, j+SDF_DY, j+SDF_DZ, j+SDF_DX+SDF_DY, j+SDF_DY+SDF_DZ,
j+SDF_DX+SDF_DZ, j+SDF_DX+SDF_DY+SDF_DZ
};
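// ix[] lists the 8 corners of the interpolation cell in the order
// (0,0,0),(1,0,0),(0,1,0),(0,0,1),(1,1,0),(0,1,1),(1,0,1),(1,1,1),
// matching the weight terms in trilinearInterp() above.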
__syncthreads();
// If any weight is 0, skip this voxel
const bool missweight = voxels[ix[0]].weight == 0 || voxels[ix[1]].weight == 0 || voxels[ix[2]].weight == 0 ||
voxels[ix[3]].weight == 0 || voxels[ix[4]].weight == 0 || voxels[ix[5]].weight == 0 ||
voxels[ix[6]].weight == 0 || voxels[ix[7]].weight == 0;
if (missweight) return;
// Trilinear Interpolation (simple and fast)
/*float3 colorFloat = make_float3(0.0f, 0.0f, 0.0f);
const float3 weight = frac(hashData.worldToVirtualVoxelPosFloat(worldPos)); // Should be world position of ray, not voxel??
float dist = 0.0f;
dist+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[0]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[0]].color);
dist+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*voxels[ix[1]].sdf; colorFloat+= weight.x *(1.0f-weight.y)*(1.0f-weight.z)*make_float3(voxels[ix[1]].color);
dist+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*voxels[ix[2]].sdf; colorFloat+= (1.0f-weight.x)* weight.y *(1.0f-weight.z)*make_float3(voxels[ix[2]].color);
dist+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *voxels[ix[3]].sdf; colorFloat+= (1.0f-weight.x)*(1.0f-weight.y)* weight.z *make_float3(voxels[ix[3]].color);
dist+= weight.x * weight.y *(1.0f-weight.z)*voxels[ix[4]].sdf; colorFloat+= weight.x * weight.y *(1.0f-weight.z)*make_float3(voxels[ix[4]].color);
dist+= (1.0f-weight.x)* weight.y * weight.z *voxels[ix[5]].sdf; colorFloat+= (1.0f-weight.x)* weight.y * weight.z *make_float3(voxels[ix[5]].color);
dist+= weight.x *(1.0f-weight.y)* weight.z *voxels[ix[6]].sdf; colorFloat+= weight.x *(1.0f-weight.y)* weight.z *make_float3(voxels[ix[6]].color);
dist+= weight.x * weight.y * weight.z *voxels[ix[7]].sdf; colorFloat+= weight.x * weight.y * weight.z *make_float3(voxels[ix[7]].color);
// Must finish using colours before updating colours
__syncthreads();
//voxels[j].color = make_uchar3(colorFloat);
//voxels[j].sdf = dist;
// What happens if filtered voxel is put back?
//hashData.d_SDFBlocks[blocks[0].ptr + i] = voxels[j];
//return;*/
bool is_surface = false;
// Identify surface voxels by a sign change of the SDF. Since we only check
// in the positive direction, any sign change among the cell corners marks a crossing.
#pragma unroll
for (int u=0; u<=1; u++) {
for (int v=0; v<=1; v++) {
for (int w=0; w<=1; w++) {
const int3 uvi = make_int3(vp.x+u,vp.y+v,vp.z+w);
// Skip the edge/corner cases: diagonal padding voxels were never loaded above
if ((uvi.x == 8 && uvi.y == 8) || (uvi.x == 8 && uvi.z == 8) || (uvi.y == 8 && uvi.z == 8)) continue;
if (signbit(voxels[j].sdf) != signbit(voxels[plinVoxelPos(uvi)].sdf)) {
is_surface = true;
break;
}
}
}
}
// Only for surface voxels, work out screen coordinates
// TODO Could adjust weights, strengthen on surface, weaken otherwise??
if (!is_surface) return;
const float3 camPos = params.m_viewMatrix * worldPos;
const float2 screenPosf = cameraData.cameraToKinectScreenFloat(camPos);
const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f)
/*if (screenPos.x < params.m_width && screenPos.y < params.m_height &&
rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] > camPos.z) {
rayCastData.d_depth[(screenPos.y)*params.m_width+screenPos.x] = camPos.z;
rayCastData.d_colors[(screenPos.y)*params.m_width+screenPos.x] = voxels[j].color;
}*/
//return;
// For this voxel in hash, get its screen position and check it is on screen
// Convert depth map to int by x1000 and use atomicMin
//const int pixsize = static_cast<int>(c_hashParams.m_virtualVoxelSize*c_depthCameraParams.fx/camPos.z)+1;
int pixsizeX = 10; // Max voxel pixels
int pixsizeY = 10;
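// Splat this surface voxel over at most a 10x10 pixel window; pixsizeX shrinks
// as soon as a pixel's back-projection falls outside the current voxel.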
for (int y=0; y<pixsizeY; y++) {
for (int x=0; x<pixsizeX; x++) {
// TODO(Nick) Within a window, check pixels that have same voxel id
// Then trilinear interpolate between current voxel and neighbors.
const float3 pixelWorldPos = params.m_viewMatrixInverse * cameraData.kinectDepthToSkeleton(screenPos.x+x,screenPos.y+y, camPos.z);
const float3 posInVoxel = (pixelWorldPos - worldPos) / make_float3(c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize,c_hashParams.m_virtualVoxelSize);
if (posInVoxel.x >= 1.0f || posInVoxel.y >= 1.0f || posInVoxel.z >= 1.0f) {
pixsizeX = x;
continue;
}
/*float depth;
uchar3 col;
trilinearInterp(hashData, voxels, ix, pixelWorldPos, depth, col);*/
int idepth = static_cast<int>(camPos.z * 100.0f);
// Note (Nick): made atomic below via atomicMin on the integer depth buffer.
/*if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height &&
rayCastData.d_depth_i[(screenPos.y+y)*params.m_width+screenPos.x+x] > idepth) {
rayCastData.d_depth[(screenPos.y+y)*params.m_width+screenPos.x+x] = idepth;
rayCastData.d_colors[(screenPos.y+y)*params.m_width+screenPos.x+x] = voxels[j].color;
}*/
if (screenPos.x+x < params.m_width && screenPos.y+y < params.m_height) {
int index = (screenPos.y+y)*params.m_width+screenPos.x+x;
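// atomicMin returns the previous value, so comparing it with idepth tells us
// whether this thread actually lowered the stored depth.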
if (rayCastData.d_depth_i[index] > idepth && atomicMin(&rayCastData.d_depth_i[index], idepth) > idepth) {
//rayCastData.d_depth[index] = idepth;
rayCastData.d_colors[index] = voxels[j].color;
}
}
}
if (pixsizeX == 0) break;
}
}
extern "C" void nickRenderCUDA(const ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const RayCastData &rayCastData, const DepthCameraData &cameraData, const RayCastParams &params)
{
const dim3 clear_gridSize((params.m_width + T_PER_BLOCK - 1)/T_PER_BLOCK, (params.m_height + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
clearDepthKernel<<<clear_gridSize, clear_blockSize>>>(hashData, rayCastData, cameraData);
const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
const dim3 gridSize(hashParams.m_numOccupiedBlocks, 1);
const dim3 blockSize(threadsPerBlock, 1);
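// One CUDA block per occupied SDF block, one thread per voxel (8*8*8 = 512).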
if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated)
nickRenderKernel << <gridSize, blockSize >> >(hashData, rayCastData, cameraData, params);
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
/////////////////////////////////////////////////////////////////////////
// ray interval splatting
/////////////////////////////////////////////////////////////////////////
__global__ void resetRayIntervalSplatKernel(RayCastData data)
{
uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X;
data.point_cloud_[idx] = make_float3(MINF);
}
extern "C" void resetRayIntervalSplatCUDA(RayCastData& data, const RayCastParams& params)
{
const dim3 gridSize(NUM_GROUPS_X, (params.m_maxNumVertices + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1); // ! todo check if need third dimension?
const dim3 blockSize(1, 1, 1);
resetRayIntervalSplatKernel<<<gridSize, blockSize>>>(data);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
__global__ void rayIntervalSplatKernel(ftl::voxhash::HashData hashData, DepthCameraData depthCameraData, RayCastData rayCastData, DepthCameraData cameraData)
{
uint idx = blockIdx.x + blockIdx.y * NUM_GROUPS_X;
const ftl::voxhash::HashEntry& entry = hashData.d_hashCompactified[idx];
if (entry.ptr != ftl::voxhash::FREE_ENTRY) {
//if (!hashData.isSDFBlockInCameraFrustumApprox(entry.pos)) return;
const RayCastParams &params = c_rayCastParams;
const float4x4& viewMatrix = params.m_viewMatrix;
float3 worldCurrentVoxel = hashData.SDFBlockToWorld(entry.pos);
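// Project the 8 corners of the block's AABB into screen space; the min/max
// tree reductions below give a conservative screen-space bound and depth interval.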
float3 minv = worldCurrentVoxel - c_hashParams.m_virtualVoxelSize / 2.0f;
float3 maxv = minv + SDF_BLOCK_SIZE*c_hashParams.m_virtualVoxelSize;
float3 proj000 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, minv.y, minv.z));
float3 proj100 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, minv.y, minv.z));
float3 proj010 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, maxv.y, minv.z));
float3 proj001 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, minv.y, maxv.z));
float3 proj110 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, minv.z));
float3 proj011 = cameraData.cameraToKinectProj(viewMatrix * make_float3(minv.x, maxv.y, maxv.z));
float3 proj101 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, minv.y, maxv.z));
float3 proj111 = cameraData.cameraToKinectProj(viewMatrix * make_float3(maxv.x, maxv.y, maxv.z));
// Tree Reduction Min
float3 min00 = fminf(proj000, proj100);
float3 min01 = fminf(proj010, proj001);
float3 min10 = fminf(proj110, proj011);
float3 min11 = fminf(proj101, proj111);
float3 min0 = fminf(min00, min01);
float3 min1 = fminf(min10, min11);
float3 minFinal = fminf(min0, min1);
// Tree Reduction Max
float3 max00 = fmaxf(proj000, proj100);
float3 max01 = fmaxf(proj010, proj001);
float3 max10 = fmaxf(proj110, proj011);
float3 max11 = fmaxf(proj101, proj111);
float3 max0 = fmaxf(max00, max01);
float3 max1 = fmaxf(max10, max11);
float3 maxFinal = fmaxf(max0, max1);
float depth = maxFinal.z;
if(params.m_splatMinimum == 1) {
depth = minFinal.z;
}
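// m_splatMinimum selects whether the splat stores the near (min) or far (max)
// depth of the block's projected interval.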
float depthWorld = cameraData.kinectProjToCameraZ(depth);
//uint addr = idx*4;
//rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
//rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld);
// Note (Nick) : Changed to create point cloud instead of vertex.
uint addr = idx;
rayCastData.point_cloud_[addr] = make_float3(maxFinal.x, maxFinal.y, depth);
//printf("Ray: %f\n", depth);
/*uint addr = idx*6;
rayCastData.d_vertexBuffer[addr] = make_float4(maxFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+1] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+2] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+3] = make_float4(minFinal.x, minFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+4] = make_float4(maxFinal.x, maxFinal.y, depth, depthWorld);
rayCastData.d_vertexBuffer[addr+5] = make_float4(minFinal.x, maxFinal.y, depth, depthWorld);*/
}
}
extern "C" void rayIntervalSplatCUDA(const ftl::voxhash::HashData& hashData, const DepthCameraData& cameraData, const RayCastData &rayCastData, const RayCastParams &rayCastParams)
{
//printf("Ray casting...\n");
const dim3 gridSize(NUM_GROUPS_X, (rayCastParams.m_numOccupiedSDFBlocks + NUM_GROUPS_X - 1) / NUM_GROUPS_X, 1);
const dim3 blockSize(1, 1, 1);
rayIntervalSplatKernel<<<gridSize, blockSize>>>(hashData, cameraData, rayCastData, cameraData);
#ifdef _DEBUG
cutilSafeCall(cudaDeviceSynchronize());
cutilCheckMsg(__FUNCTION__);
#endif
}
...@@ -101,7 +101,7 @@ PointCloud<PointXYZ>::Ptr cornersToPointCloud(const vector<cv::Point2f> &corners
for (int i = 0; i < corners_len; i++) {
double x = corners[i].x;
double y = corners[i].y;
double d = disp.at<float>((int) y, (int) x) * 1000.0f; // todo: better estimation
double d = disp.at<float>((int) y, (int) x); // * 1000.0f; // todo: better estimation
//cv::Vec4d homg_pt = Q_ * cv::Vec4d(x, y, d, 1.0);
//cv::Vec3d p = cv::Vec3d(homg_pt.val) / homg_pt[3];
...