diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt index 4db3fbf4285e8022c7af9f908d87ef1966e9ecd4..00e44a444ba45f6cb7b4f68ed2441a1c3ac4fd4c 100644 --- a/applications/reconstruct/CMakeLists.txt +++ b/applications/reconstruct/CMakeLists.txt @@ -9,15 +9,16 @@ set(REPSRC src/compactors.cu src/garbage.cu src/integrators.cu - src/ray_cast_sdf.cu + #src/ray_cast_sdf.cu src/splat_render.cu src/camera_util.cu src/voxel_hash.cu src/voxel_hash.cpp - src/ray_cast_sdf.cpp + #src/ray_cast_sdf.cpp src/registration.cpp - src/virtual_source.cpp + #src/virtual_source.cpp src/splat_render.cpp + src/dibr.cu ) add_executable(ftl-reconstruct ${REPSRC}) diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp index 21e109b89bdc98f6c504648eee0536875270c806..b7a3b44426bf6fcf490af67f38461ca849ade428 100644 --- a/applications/reconstruct/include/ftl/cuda_operators.hpp +++ b/applications/reconstruct/include/ftl/cuda_operators.hpp @@ -587,6 +587,14 @@ inline __host__ __device__ int3 make_int3(int i) int3 t; t.x = i; t.y = i; t.z = i; return t; } +__device__ inline int3 make_int3(short3 s) { + return make_int3(s.x,s.y,s.z); +} + +__device__ inline int3 make_int3(short4 s) { + return make_int3(s.x,s.y,s.z); +} + // negate inline __host__ __device__ int3 operator-(int3 &a) { diff --git a/applications/reconstruct/include/ftl/depth_camera.hpp b/applications/reconstruct/include/ftl/depth_camera.hpp index 641a9c251c7518f8b8bbe486ca2af32e008557d8..f40581b65cd71b9f239b2c0e82dfd616deb7ff28 100644 --- a/applications/reconstruct/include/ftl/depth_camera.hpp +++ b/applications/reconstruct/include/ftl/depth_camera.hpp @@ -12,19 +12,31 @@ #include <ftl/depth_camera_params.hpp> +#define MAX_CAMERAS 20 + //extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params); //extern __constant__ DepthCameraParams c_depthCameraParams; +namespace ftl { +namespace voxhash { + +struct 
__align__(16) DepthCameraCUDA { + cudaTextureObject_t depth; + cudaTextureObject_t colour; + DepthCameraParams params; + float4x4 pose; + float4x4 poseInverse; +}; -struct DepthCameraData { +struct DepthCamera { /////////////// // Host part // /////////////// - __device__ __host__ - DepthCameraData() { + __host__ + DepthCamera() { /*d_depthData = NULL; d_colorData = NULL; d_depthArray = NULL; @@ -42,8 +54,9 @@ struct DepthCameraData { colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4); depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_); colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_); - depth_obj_ = depth_tex_->cudaTexture(); - colour_obj_ = colour_tex_->cudaTexture(); + data.depth = depth_tex_->cudaTexture(); + data.colour = colour_tex_->cudaTexture(); + data.params = params; } //__host__ @@ -71,6 +84,10 @@ struct DepthCameraData { cv::cuda::GpuMat *colour_mat_; ftl::cuda::TextureObject<float> *depth_tex_; ftl::cuda::TextureObject<uchar4> *colour_tex_; - cudaTextureObject_t depth_obj_; - cudaTextureObject_t colour_obj_; + //cudaTextureObject_t depth_obj_; + //cudaTextureObject_t colour_obj_; + + DepthCameraCUDA data; }; +} +} diff --git a/applications/reconstruct/include/ftl/voxel_hash.hpp b/applications/reconstruct/include/ftl/voxel_hash.hpp index 4a8e8ee8051705546a94afc035ef57c0415d1246..c0a228834337e60f5807313735eedee25eb51dcb 100644 --- a/applications/reconstruct/include/ftl/voxel_hash.hpp +++ b/applications/reconstruct/include/ftl/voxel_hash.hpp @@ -37,6 +37,7 @@ typedef signed char schar; #include <ftl/depth_camera.hpp> #define SDF_BLOCK_SIZE 8 +#define SDF_BLOCK_SIZE_OLAP 7 #ifndef MINF #define MINF __int_as_float(0xff800000) @@ -54,22 +55,39 @@ namespace voxhash { //status flags for hash entries static const int LOCK_ENTRY = -1; -static const int FREE_ENTRY = -2; +static const int FREE_ENTRY = -2147483648; static const int NO_OFFSET = 0; 
+static const uint kFlagSurface = 0x00000001; + +struct __align__(16) HashEntryHead { + union { + short4 posXYZ; // hash position (lower left corner of SDFBlock)) + uint64_t pos; + }; + int offset; // offset for collisions + uint flags; +}; + struct __align__(16) HashEntry { - int3 pos; //hash position (lower left corner of SDFBlock)) - int ptr; //pointer into heap to SDFBlock - uint offset; //offset for collisions - uint flags; //for interframe checks (Nick) + HashEntryHead head; + uint voxels[16]; // 512 bits, 1 bit per voxel - __device__ void operator=(const struct HashEntry& e) { + /*__device__ void operator=(const struct HashEntry& e) { ((long long*)this)[0] = ((const long long*)&e)[0]; ((long long*)this)[1] = ((const long long*)&e)[1]; //((int*)this)[4] = ((const int*)&e)[4]; ((long long*)this)[2] = ((const long long*)&e)[2]; - } + ((long long*)this)[2] = ((const long long*)&e)[3]; + ((long long*)this)[2] = ((const long long*)&e)[4]; + ((long long*)this)[2] = ((const long long*)&e)[5]; + ((long long*)this)[2] = ((const long long*)&e)[6]; + ((long long*)this)[2] = ((const long long*)&e)[7]; + ((long long*)this)[2] = ((const long long*)&e)[8]; + ((long long*)this)[2] = ((const long long*)&e)[9]; + ((long long*)this)[2] = ((const long long*)&e)[10]; + }*/ }; struct __align__(8) Voxel { @@ -95,14 +113,14 @@ struct HashData { __device__ __host__ HashData() { - d_heap = NULL; - d_heapCounter = NULL; + //d_heap = NULL; + //d_heapCounter = NULL; d_hash = NULL; d_hashDecision = NULL; d_hashDecisionPrefix = NULL; d_hashCompactified = NULL; d_hashCompactifiedCounter = NULL; - d_SDFBlocks = NULL; + //d_SDFBlocks = NULL; d_hashBucketMutex = NULL; m_bIsOnGPU = false; } @@ -166,18 +184,18 @@ struct HashData { //out.color = 0.5f * (v0.color + v1.color); //exponential running average - //float3 c0 = make_float3(v0.color.x, v0.color.y, v0.color.z); - //float3 c1 = make_float3(v1.color.x, v1.color.y, v1.color.z); + float3 c0 = make_float3(v0.color.x, v0.color.y, v0.color.z); + 
float3 c1 = make_float3(v1.color.x, v1.color.y, v1.color.z); //float3 res = (c0.x+c0.y+c0.z == 0) ? c1 : 0.5f*c0 + 0.5f*c1; //float3 res = (c0+c1)/2; - //float3 res = (c0 * (float)v0.weight + c1 * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); + float3 res = (c0 * (float)v0.weight + c1 * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); //float3 res = c1; - //out.color.x = (uchar)(res.x+0.5f); out.color.y = (uchar)(res.y+0.5f); out.color.z = (uchar)(res.z+0.5f); + out.color.x = (uchar)(res.x+0.5f); out.color.y = (uchar)(res.y+0.5f); out.color.z = (uchar)(res.z+0.5f); // Nick: reduces colour flicker but not ideal.. - out.color = v1.color; + //out.color = v1.color; // Option 3 (Nick): Use colour with minimum SDF since it should be closest to surface. // Results in stable but pixelated output @@ -220,20 +238,20 @@ struct HashData { __device__ int3 virtualVoxelPosToSDFBlock(int3 virtualVoxelPos) const { - if (virtualVoxelPos.x < 0) virtualVoxelPos.x -= SDF_BLOCK_SIZE-1; - if (virtualVoxelPos.y < 0) virtualVoxelPos.y -= SDF_BLOCK_SIZE-1; - if (virtualVoxelPos.z < 0) virtualVoxelPos.z -= SDF_BLOCK_SIZE-1; + if (virtualVoxelPos.x < 0) virtualVoxelPos.x -= SDF_BLOCK_SIZE_OLAP-1; + if (virtualVoxelPos.y < 0) virtualVoxelPos.y -= SDF_BLOCK_SIZE_OLAP-1; + if (virtualVoxelPos.z < 0) virtualVoxelPos.z -= SDF_BLOCK_SIZE_OLAP-1; return make_int3( - virtualVoxelPos.x/SDF_BLOCK_SIZE, - virtualVoxelPos.y/SDF_BLOCK_SIZE, - virtualVoxelPos.z/SDF_BLOCK_SIZE); + virtualVoxelPos.x/SDF_BLOCK_SIZE_OLAP, + virtualVoxelPos.y/SDF_BLOCK_SIZE_OLAP, + virtualVoxelPos.z/SDF_BLOCK_SIZE_OLAP); } // Computes virtual voxel position of corner sample position __device__ int3 SDFBlockToVirtualVoxelPos(const int3& sdfBlock) const { - return sdfBlock*SDF_BLOCK_SIZE; + return sdfBlock*SDF_BLOCK_SIZE_OLAP; } __device__ @@ -300,7 +318,7 @@ struct HashData { //! 
returns the hash entry for a given worldPos; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set __device__ - HashEntry getHashEntry(const float3& worldPos) const { + int getHashEntry(const float3& worldPos) const { //int3 blockID = worldToSDFVirtualVoxelPos(worldPos)/SDF_BLOCK_SIZE; //position of sdf block int3 blockID = worldToSDFBlock(worldPos); return getHashEntryForSDFBlockPos(blockID); @@ -314,15 +332,15 @@ struct HashData { __device__ void deleteHashEntry(HashEntry& hashEntry) { - hashEntry.pos = make_int3(0); - hashEntry.offset = 0; - hashEntry.ptr = FREE_ENTRY; + hashEntry.head.pos = 0; + hashEntry.head.offset = FREE_ENTRY; + for (int i=0; i<16; ++i) hashEntry.voxels[i] = 0; } __device__ bool voxelExists(const float3& worldPos) const { - HashEntry hashEntry = getHashEntry(worldPos); - return (hashEntry.ptr != FREE_ENTRY); + int hashEntry = getHashEntry(worldPos); + return (hashEntry != -1); } __device__ @@ -331,48 +349,44 @@ struct HashData { v.weight = 0; v.sdf = 0.0f; } - __device__ - void deleteVoxel(uint id) { - deleteVoxel(d_SDFBlocks[id]); - } __device__ - Voxel getVoxel(const float3& worldPos) const { - HashEntry hashEntry = getHashEntry(worldPos); - Voxel v; - if (hashEntry.ptr == FREE_ENTRY) { - deleteVoxel(v); + bool getVoxel(const float3& worldPos) const { + int hashEntry = getHashEntry(worldPos); + if (hashEntry == -1) { + return false; } else { int3 virtualVoxelPos = worldToVirtualVoxelPos(worldPos); - v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)]; + int ix = virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos); + return d_hash[hashEntry].voxels[ix/32] & (0x1 << (ix % 32)); } - return v; } __device__ - Voxel getVoxel(const int3& virtualVoxelPos) const { - HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); - Voxel v; - if (hashEntry.ptr == FREE_ENTRY) { - deleteVoxel(v); + bool getVoxel(const int3& virtualVoxelPos) const { + int 
hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); + if (hashEntry == -1) { + return false; } else { - v = d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)]; + int ix = virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos); + return d_hash[hashEntry].voxels[ix >> 5] & (0x1 << (ix & 0x1F)); } - return v; } - __device__ - void setVoxel(const int3& virtualVoxelPos, Voxel& voxelInput) const { - HashEntry hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); - if (hashEntry.ptr != FREE_ENTRY) { + /*__device__ + void setVoxel(const int3& virtualVoxelPos, bool voxelInput) const { + int hashEntry = getHashEntryForSDFBlockPos(virtualVoxelPosToSDFBlock(virtualVoxelPos)); + if (hashEntry == -1) { d_SDFBlocks[hashEntry.ptr + virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos)] = voxelInput; + int ix = virtualVoxelPosToLocalSDFBlockIndex(virtualVoxelPos); + d_hash[hashEntry].voxels[ix >> 5] |= (0x1 << (ix & 0x1F)); } - } + }*/ //! returns the hash entry for a given sdf block id; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set __device__ - HashEntry getHashEntryForSDFBlockPos(const int3& sdfBlock) const; + int getHashEntryForSDFBlockPos(const int3& sdfBlock) const; //for histogram (no collision traversal) __device__ @@ -383,22 +397,9 @@ struct HashData { unsigned int getNumHashLinkedList(unsigned int bucketID); - __device__ - uint consumeHeap() { - uint addr = atomicSub(&d_heapCounter[0], 1); - //TODO MATTHIAS check some error handling? - return d_heap[addr]; - } - __device__ - void appendHeap(uint ptr) { - uint addr = atomicAdd(&d_heapCounter[0], 1); - //TODO MATTHIAS check some error handling? 
- d_heap[addr+1] = ptr; - } - //pos in SDF block coordinates __device__ - void allocBlock(const int3& pos, const uchar frame); + void allocBlock(const int3& pos); //!inserts a hash entry without allocating any memory: used by streaming: TODO MATTHIAS check the atomics in this function __device__ @@ -410,14 +411,11 @@ struct HashData { #endif //CUDACC - uint* d_heap; //heap that manages free memory - uint* d_heapCounter; //single element; used as an atomic counter (points to the next free block) int* d_hashDecision; // int* d_hashDecisionPrefix; // HashEntry* d_hash; //hash that stores pointers to sdf blocks - HashEntry* d_hashCompactified; //same as before except that only valid pointers are there + HashEntry** d_hashCompactified; //same as before except that only valid pointers are there int* d_hashCompactifiedCounter; //atomic counter to add compactified entries atomically - Voxel* d_SDFBlocks; //sub-blocks that contain 8x8x8 voxels (linearized); are allocated by heap int* d_hashBucketMutex; //binary flag per hash bucket; used for allocation to atomically lock a bucket bool m_bIsOnGPU; //the class be be used on both cpu and gpu diff --git a/applications/reconstruct/include/ftl/voxel_scene.hpp b/applications/reconstruct/include/ftl/voxel_scene.hpp index e75a1b72429d642349281095712c83e3426bccbc..d594d479f1f35ffc9cece087b867be2f4b781ee0 100644 --- a/applications/reconstruct/include/ftl/voxel_scene.hpp +++ b/applications/reconstruct/include/ftl/voxel_scene.hpp @@ -17,7 +17,7 @@ namespace voxhash { struct Cameras { ftl::rgbd::Source *source; - DepthCameraData gpu; + ftl::voxhash::DepthCamera gpu; DepthCameraParams params; cv::cuda::Stream stream; }; @@ -58,13 +58,14 @@ class SceneRep : public ftl::Configurable { //! resets the hash to the initial state (i.e., clears all data) void reset(); + int cameraCount() const { return (int)cameras_.size(); } + ftl::voxhash::HashData& getHashData() { return m_hashData; } HashParams& getHashParams() { return m_hashParams; } - //! 
debug only! - unsigned int getHeapFreeCount(); + unsigned int getOccupiedCount(); //! debug only! void debugHash(); @@ -78,13 +79,14 @@ class SceneRep : public ftl::Configurable { HashParams _parametersFromConfig(); void _create(const HashParams& params); void _destroy(); - void _alloc(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t); + void _alloc(int camid, cudaStream_t); void _compactifyVisible(const DepthCameraParams &camera); void _compactifyAllocated(); - void _integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams); + //void _integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams); + void _integrateDepthMaps(); void _garbageCollect(); - + void _updateCameraConstant(); HashParams m_hashParams; ftl::voxhash::HashData m_hashData; diff --git a/applications/reconstruct/src/compactors.cu b/applications/reconstruct/src/compactors.cu index 373a71fc241afab2925931e21f586ca5a75ed551..b7cdd5028f0f5ec78de47d8bf6f9099c5448a494 100644 --- a/applications/reconstruct/src/compactors.cu +++ b/applications/reconstruct/src/compactors.cu @@ -63,7 +63,7 @@ using ftl::voxhash::FREE_ENTRY; #endif }*/ -__global__ void compactifyVisibleKernel(HashData hashData, HashParams hashParams, DepthCameraParams camera) +/*__global__ void compactifyVisibleKernel(HashData hashData, HashParams hashParams, DepthCameraParams camera) { //const HashParams& hashParams = c_hashParams; const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; @@ -122,7 +122,7 @@ void ftl::cuda::compactifyVisible(HashData& hashData, const HashParams& hashPara //cutilCheckMsg(__FUNCTION__); #endif //return res; -} +}*/ __global__ void compactifyAllocatedKernel(HashData hashData) { @@ -130,9 +130,9 @@ __global__ void compactifyAllocatedKernel(HashData hashData) const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; #ifdef COMPACTIFY_HASH_SIMPLE if (idx < 
hashParams.m_hashNumBuckets) { - if (hashData.d_hash[idx].ptr != FREE_ENTRY) { + if (hashData.d_hash[idx].head.offset != FREE_ENTRY) { int addr = atomicAdd(hashData.d_hashCompactifiedCounter, 1); - hashData.d_hashCompactified[addr] = hashData.d_hash[idx]; + hashData.d_hashCompactified[addr] = &hashData.d_hash[idx]; } } #else @@ -142,7 +142,7 @@ __global__ void compactifyAllocatedKernel(HashData hashData) int addrLocal = -1; if (idx < hashParams.m_hashNumBuckets) { - if (hashData.d_hash[idx].ptr != FREE_ENTRY) { + if (hashData.d_hash[idx].head.offset != FREE_ENTRY) { addrLocal = atomicAdd(&localCounter, 1); } } @@ -157,7 +157,7 @@ __global__ void compactifyAllocatedKernel(HashData hashData) if (addrLocal != -1) { const unsigned int addr = addrGlobal + addrLocal; - hashData.d_hashCompactified[addr] = hashData.d_hash[idx]; + hashData.d_hashCompactified[addr] = &hashData.d_hash[idx]; } #endif } @@ -178,3 +178,59 @@ void ftl::cuda::compactifyAllocated(HashData& hashData, const HashParams& hashPa #endif //return res; } + + +__global__ void compactifyOccupiedKernel(HashData hashData) +{ + const HashParams& hashParams = c_hashParams; + const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; +#ifdef COMPACTIFY_HASH_SIMPLE + if (idx < hashParams.m_hashNumBuckets) { + if (hashData.d_hash[idx].head.offset != FREE_ENTRY && hashData.d_hash[idx].head.flags & ftl::voxhash::kFlagSurface) { + int addr = atomicAdd(hashData.d_hashCompactifiedCounter, 1); + hashData.d_hashCompactified[addr] = &hashData.d_hash[idx]; + } + } +#else + __shared__ int localCounter; + if (threadIdx.x == 0) localCounter = 0; + __syncthreads(); + + int addrLocal = -1; + if (idx < hashParams.m_hashNumBuckets) { + if (hashData.d_hash[idx].head.offset != FREE_ENTRY && (hashData.d_hash[idx].head.flags & ftl::voxhash::kFlagSurface)) { // TODO:(Nick) Check voxels for all 0 or all 1 + addrLocal = atomicAdd(&localCounter, 1); + } + } + + __syncthreads(); + + __shared__ int addrGlobal; + if (threadIdx.x == 0 && 
localCounter > 0) {
+		addrGlobal = atomicAdd(hashData.d_hashCompactifiedCounter, localCounter);
+	}
+	__syncthreads();
+
+	if (addrLocal != -1) {
+		const unsigned int addr = addrGlobal + addrLocal;
+		hashData.d_hashCompactified[addr] = &hashData.d_hash[idx];
+	}
+#endif
+}
+
+void ftl::cuda::compactifyOccupied(HashData& hashData, const HashParams& hashParams, cudaStream_t stream) {
+	const unsigned int threadsPerBlock = COMPACTIFY_HASH_THREADS_PER_BLOCK;
+	const dim3 gridSize((hashParams.m_hashNumBuckets + threadsPerBlock - 1) / threadsPerBlock, 1);
+	const dim3 blockSize(threadsPerBlock, 1);
+
+	cudaSafeCall(cudaMemsetAsync(hashData.d_hashCompactifiedCounter, 0, sizeof(int), stream));
+	compactifyOccupiedKernel << <gridSize, blockSize, 0, stream >> >(hashData);
+	//unsigned int res = 0;
+	//cudaSafeCall(cudaMemcpyAsync(&res, hashData.d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+	//cutilCheckMsg(__FUNCTION__);
+#endif
+	//return res;
+}
diff --git a/applications/reconstruct/src/compactors.hpp b/applications/reconstruct/src/compactors.hpp
index a633cfd22d3ec593c6fb9e3be06977d514eacd84..6c61985eea8448a078b8abe3e821992519c9425f 100644
--- a/applications/reconstruct/src/compactors.hpp
+++ b/applications/reconstruct/src/compactors.hpp
@@ -7,12 +7,13 @@ namespace ftl {
 namespace cuda {
 
 // Compact visible
-void compactifyVisible(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraParams &camera, cudaStream_t);
+//void compactifyVisible(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraParams &camera, cudaStream_t);
 
 // Compact allocated
 void compactifyAllocated(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t);
 
 // Compact visible surfaces
+void compactifyOccupied(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t stream);
 
 }
 }
diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
new file mode 100644
index 0000000000000000000000000000000000000000..575e3c45f9348c37c8397683db36d87edda677e4
--- /dev/null
+++ b/applications/reconstruct/src/dibr.cu
@@ -0,0 +1,118 @@
+#include "splat_render_cuda.hpp"
+#include <cuda_runtime.h>
+
+#include <ftl/cuda_matrix_util.hpp>
+
+#include "splat_params.hpp"
+#include <ftl/depth_camera.hpp>
+
+#define T_PER_BLOCK 8
+
+using ftl::cuda::TextureObject;
+using ftl::render::SplatParams;
+
+extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS];
+
+__global__ void clearColourKernel(TextureObject<uchar4> colour) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < colour.width() && y < colour.height()) {
+		//depth(x,y) = 0x7f800000; //PINF;
+		colour(x,y) = make_uchar4(76,76,82,0);
+	}
+}
+
+__global__ void dibr_kernel(
+	TextureObject<float> depth_in,
+	TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
+
+	const int i = threadIdx.y*blockDim.y + threadIdx.x;
+	const int bx = blockIdx.x*blockDim.x;
+	const int by = blockIdx.y*blockDim.y;
+	const int x = bx + threadIdx.x;
+	const int y = by + threadIdx.y;
+
+	for (int j=0; j<numcams; ++j) {
+		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+
+		float d = tex2D<float>(camera.depth, x, y);
+		if (d < 0.01f) continue;
+		if (d >= params.camera.m_sensorDepthWorldMax) continue;
+
+		const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
+
+		const float3 camPos = params.m_viewMatrix * worldPos;
+		const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+		const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+
+		const unsigned int cx = screenPos.x;
+		const unsigned int cy = screenPos.y;
+
+
+		if (cx < colour_out.width() 
&& cy < colour_out.height()) { + float camd = depth_in.tex2D((int)cx,(int)cy); + //atomicMin(&depth(x,y), idepth); + float camdiff = fabs(camPos.z-camd); + if (camdiff < 0.1f) { + colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); + } else { + //colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255); + } + } + } +} + +__global__ void dibr_kernel_rev( + TextureObject<float> depth_in, + TextureObject<uchar4> colour_out, int numcams, SplatParams params) { + + const int i = threadIdx.y*blockDim.y + threadIdx.x; + const int bx = blockIdx.x*blockDim.x; + const int by = blockIdx.y*blockDim.y; + const int x = bx + threadIdx.x; + const int y = by + threadIdx.y; + + float camd = depth_in.tex2D((int)x,(int)y); + if (camd < 0.01f) return; + if (camd >= params.camera.m_sensorDepthWorldMax) return; + + const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd); + + for (int j=0; j<numcams; ++j) { + const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; + + const float3 camPos = camera.poseInverse * worldPos; + const float2 screenPosf = camera.params.cameraToKinectScreenFloat(camPos); + const uint2 screenPos = make_uint2(make_int2(screenPosf)); + + if (camPos.z < params.camera.m_sensorDepthWorldMin) continue; + + const unsigned int cx = screenPos.x; + const unsigned int cy = screenPos.y; + + if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) { + float d = tex2D<float>(camera.depth, (int)cx, (int)cy); + float camdiff = fabs(camPos.z-d); + if (camdiff < 0.1f) { + colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy); + } else { + //colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255); + } + } + } +} + +void ftl::cuda::dibr(const TextureObject<float> &depth_in, + const TextureObject<uchar4> &colour_out, int numcams, const SplatParams ¶ms, cudaStream_t stream) { + + const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 
blockSize(T_PER_BLOCK, T_PER_BLOCK); + + clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + + dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_in, colour_out, numcams, params); + cudaSafeCall( cudaGetLastError() ); +} diff --git a/applications/reconstruct/src/garbage.cu b/applications/reconstruct/src/garbage.cu index b6226e58826b4ba0ef594a87a8201562209df32d..b685e9e6b7d94434ff425eff268699a715261522 100644 --- a/applications/reconstruct/src/garbage.cu +++ b/applications/reconstruct/src/garbage.cu @@ -6,7 +6,7 @@ using namespace ftl::voxhash; #define T_PER_BLOCK 8 #define NUM_CUDA_BLOCKS 10000 -__global__ void starveVoxelsKernel(HashData hashData) { +/*__global__ void starveVoxelsKernel(HashData hashData) { int ptr; // Stride over all allocated blocks @@ -32,21 +32,26 @@ void ftl::cuda::starveVoxels(HashData& hashData, const HashParams& hashParams, c cudaSafeCall(cudaDeviceSynchronize()); //cutilCheckMsg(__FUNCTION__); #endif -} +}*/ + +#define ENTRIES_PER_BLOCK 4 __global__ void clearVoxelsKernel(HashData hashData) { + const int lane = threadIdx.x % 16; + const int halfWarp = threadIdx.x / 16; // Stride over all allocated blocks - for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { + for (int bi=blockIdx.x+halfWarp; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS*ENTRIES_PER_BLOCK) { - const HashEntry& entry = hashData.d_hashCompactified[bi]; - hashData.d_SDFBlocks[entry.ptr + threadIdx.x].weight = 0; + HashEntry *entry = hashData.d_hashCompactified[bi]; + //hashData.d_SDFBlocks[entry.ptr + threadIdx.x].weight = 0; + entry->voxels[lane] = 0; } } void ftl::cuda::clearVoxels(HashData& hashData, const HashParams& hashParams) { - const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; + const unsigned int threadsPerBlock = 16 * ENTRIES_PER_BLOCK; const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); @@ -54,65 +59,20 @@ void 
ftl::cuda::clearVoxels(HashData& hashData, const HashParams& hashParams) { } -__shared__ float shared_MinSDF[SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE / 2]; -__shared__ uint shared_MaxWeight[SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE / 2]; - - __global__ void garbageCollectIdentifyKernel(HashData hashData) { + const int lane = threadIdx.x % 16; + const int halfWarp = threadIdx.x / 16; // Stride over all allocated blocks - for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { - - const HashEntry& entry = hashData.d_hashCompactified[bi]; - - // Entire block was not touched in this frame, so remove (Nick) - /*if (entry.flags != cameraParams.flags & 0xFF) { - hashData.d_hashDecision[hashIdx] = 1; - return; - }*/ - - //uint h = hashData.computeHashPos(entry.pos); - //hashData.d_hashDecision[hashIdx] = 1; - //if (hashData.d_hashBucketMutex[h] == LOCK_ENTRY) return; - - //if (entry.ptr == FREE_ENTRY) return; //should never happen since we did compactify before - //const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; + for (int bi=blockIdx.x+halfWarp; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS * ENTRIES_PER_BLOCK) { - const unsigned int idx0 = entry.ptr + 2*threadIdx.x+0; - const unsigned int idx1 = entry.ptr + 2*threadIdx.x+1; + const HashEntry *entry = hashData.d_hashCompactified[bi]; - Voxel v0 = hashData.d_SDFBlocks[idx0]; - Voxel v1 = hashData.d_SDFBlocks[idx1]; + const uint v = entry->voxels[lane]; + const uint mask = (halfWarp & 0x1) ? 
0xFFFF0000 : 0x0000FFFF; + uint ballot_result = __ballot_sync(mask, v == 0 || v == 0xFFFFFFFF); - if (v0.weight == 0) v0.sdf = PINF; - if (v1.weight == 0) v1.sdf = PINF; - - shared_MinSDF[threadIdx.x] = min(fabsf(v0.sdf), fabsf(v1.sdf)); //init shared memory - shared_MaxWeight[threadIdx.x] = max(v0.weight, v1.weight); - -#pragma unroll 1 - for (uint stride = 2; stride <= blockDim.x; stride <<= 1) { - __syncthreads(); - if ((threadIdx.x & (stride-1)) == (stride-1)) { - shared_MinSDF[threadIdx.x] = min(shared_MinSDF[threadIdx.x-stride/2], shared_MinSDF[threadIdx.x]); - shared_MaxWeight[threadIdx.x] = max(shared_MaxWeight[threadIdx.x-stride/2], shared_MaxWeight[threadIdx.x]); - } - } - - __syncthreads(); - - if (threadIdx.x == blockDim.x - 1) { - float minSDF = shared_MinSDF[threadIdx.x]; - uint maxWeight = shared_MaxWeight[threadIdx.x]; - - float t = hashData.getTruncation(5.0f); // NICK should not be hardcoded //MATTHIAS TODO check whether this is a reasonable metric - - if (minSDF >= t || maxWeight == 0) { - hashData.d_hashDecision[bi] = 1; - } else { - hashData.d_hashDecision[bi] = 0; - } - } + if (lane == 0) hashData.d_hashDecision[bi] = (ballot_result == mask) ? 
1 : 0; } } @@ -138,18 +98,20 @@ __global__ void garbageCollectFreeKernel(HashData hashData) { // Stride over all allocated blocks for (int bi=blockIdx.x*blockDim.x + threadIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS*blockDim.x) { - if (hashData.d_hashDecision[bi] != 0) { //decision to delete the hash entry + HashEntry *entry = hashData.d_hashCompactified[bi]; - const HashEntry& entry = hashData.d_hashCompactified[bi]; - //if (entry.ptr == FREE_ENTRY) return; //should never happen since we did compactify before + if ((entry->head.flags & ftl::voxhash::kFlagSurface) == 0) { //decision to delete the hash entry + + + //if (entry->head.offset == FREE_ENTRY) return; //should never happen since we did compactify before - if (hashData.deleteHashEntryElement(entry.pos)) { //delete hash entry from hash (and performs heap append) - const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; + int3 posI3 = make_int3(entry->head.posXYZ.x, entry->head.posXYZ.y, entry->head.posXYZ.z); - #pragma unroll 1 - for (uint i = 0; i < linBlockSize; i++) { //clear sdf block: CHECK TODO another kernel? - hashData.deleteVoxel(entry.ptr + i); - } + if (hashData.deleteHashEntryElement(posI3)) { //delete hash entry from hash (and performs heap append) + //#pragma unroll + //for (uint i = 0; i < 16; i++) { //clear sdf block: CHECK TODO another kernel? 
+ // entry->voxels[i] = 0; + //} } } diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu index d34c021254a66b956ff612e4b646a5be7bd19d9e..1259210b4ea1b292f1adf00dfd0517bf04734c40 100644 --- a/applications/reconstruct/src/integrators.cu +++ b/applications/reconstruct/src/integrators.cu @@ -3,6 +3,8 @@ #include <vector_types.h> #include <cuda_runtime.h> #include <ftl/cuda_matrix_util.hpp> +#include <ftl/cuda_util.hpp> +#include <ftl/cuda_common.hpp> #define T_PER_BLOCK 8 @@ -10,6 +12,7 @@ using ftl::voxhash::HashData; using ftl::voxhash::HashParams; using ftl::voxhash::Voxel; using ftl::voxhash::HashEntry; +using ftl::voxhash::HashEntryHead; using ftl::voxhash::FREE_ENTRY; __device__ float4 make_float4(uchar4 c) { @@ -58,21 +61,26 @@ __device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { return x*x + y*y + z*z; } +__device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) { + float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z; + float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z; + float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2); + float p_2 = xv_2 / v_2; + return sqrt(x_2 - p_2) < epsilon; +} + #define NUM_CUDA_BLOCKS 10000 -__global__ void integrateDepthMapKernel(HashData hashData, HashParams hashParams, DepthCameraParams cameraParams, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { - //const HashParams& hashParams = c_hashParams; - //const DepthCameraParams& cameraParams = c_depthCameraParams; + + +/*__global__ void integrateRegistrationKernel(HashData hashData, HashParams hashParams, DepthCameraParams cameraParams, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { // Stride over all allocated blocks for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { //TODO check if we should load this in shared memory HashEntry& entry = hashData.d_hashCompactified[bi]; - //if (entry.ptr == FREE_ENTRY) { - // printf("invliad 
integrate"); - // return; //should never happen since we did the compactification before - //} + int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(entry.pos); @@ -91,13 +99,7 @@ __global__ void integrateDepthMapKernel(HashData hashData, HashParams hashParams //if (depth > 20.0f) return; uchar4 color = make_uchar4(0, 0, 0, 0); - //if (cameraData.d_colorData) { - color = tex2D<uchar4>(colourT, screenPos.x, screenPos.y); - //color = bilinearFilterColor(cameraData.cameraToKinectScreenFloat(pf)); - //} - - //printf("screen pos %d\n", color.x); - //return; + color = tex2D<uchar4>(colourT, screenPos.x, screenPos.y); // Depth is within accepted max distance from camera if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) @@ -107,165 +109,180 @@ __global__ void integrateDepthMapKernel(HashData hashData, HashParams hashParams float sdf = depth - pf.z; float truncation = hashData.getTruncation(depth); - // Is this voxel close enough to cam for depth map value - // CHECK Nick: If is too close then free space violation so remove? - // This isn't enough if the disparity has occlusions that don't cause violations - // Could RGB changes also cause removals if depth can't be confirmed? 
- /*if (sdf > truncation) { - uint idx = entry.ptr + i; - hashData.d_SDFBlocks[idx].weight = 0; - //hashData.d_SDFBlocks[idx].sdf = PINF; - hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,0); - }*/ - if (sdf > -truncation) // && depthZeroOne >= 0.0f && depthZeroOne <= 1.0f) //check if in truncation range should already be made in depth map computation - { - /*if (sdf >= 0.0f) { - sdf = fminf(truncation, sdf); - } else { - sdf = fmaxf(-truncation, sdf); - }*/ - - - //printf("SDF: %f\n", sdf); - //float weightUpdate = g_WeightSample; - //weightUpdate = (1-depthZeroOne)*5.0f + depthZeroOne*0.05f; - //weightUpdate *= g_WeightSample; + if (sdf > -truncation) { float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f); Voxel curr; //construct current voxel curr.sdf = sdf; curr.weight = weightUpdate; curr.color = make_uchar3(color.x, color.y, color.z); - uint idx = entry.ptr + i; - - //if (entry.flags != cameraParams.flags & 0xFF) { - // entry.flags = cameraParams.flags & 0xFF; - //hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,0); - //} - Voxel newVoxel; - //if (color.x == MINF) hashData.combineVoxelDepthOnly(hashData.d_SDFBlocks[idx], curr, newVoxel); - //else hashData.combineVoxel(hashData.d_SDFBlocks[idx], curr, newVoxel); - hashData.combineVoxel(hashData.d_SDFBlocks[idx], curr, newVoxel); + Voxel out; + const Voxel &v1 = curr; + const Voxel &v0 = hashData.d_SDFBlocks[idx]; - hashData.d_SDFBlocks[idx] = newVoxel; + float redshift = (v0.weight > 0) ? 
1.0f - ((v1.sdf - v0.sdf) / hashParams.m_truncation)*0.5f : 1.0f; + + out.color.x = min(max(v1.color.x*redshift,0.0f),255.0f); + out.color.y = min(max(v1.color.y*redshift,0.0f),255.0f); + out.color.z = min(max(v1.color.z*(1.0f / redshift),0.0f),255.0f); + + out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); + out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight); + + hashData.d_SDFBlocks[idx] = out; - //Voxel prev = getVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx); - //Voxel newVoxel = combineVoxel(curr, prev); - //setVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx, newVoxel); } - } else { - // Depth is invalid so what to do here? - // TODO(Nick) Use past voxel if available (set weight from 0 to 1) - - //uint idx = entry.ptr + i; - //float coldist = colourDistance(color, hashData.d_SDFBlocks[idx].color); - //if ((depth > 39.99f || depth < 0.01f) && coldist > 100.0f) { - //hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,(uchar)(coldist)); - // hashData.d_SDFBlocks[idx].weight = hashData.d_SDFBlocks[idx].weight >> 1; - //} } } - } -} - - -void ftl::cuda::integrateDepthMap(HashData& hashData, const HashParams& hashParams, - const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream) { - const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; - const dim3 gridSize(NUM_CUDA_BLOCKS, 1); - const dim3 blockSize(threadsPerBlock, 1); + } // Stride loop +}*/ - //if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) - integrateDepthMapKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, depthCameraParams, depthCameraData.depth_obj_, depthCameraData.colour_obj_); - //} +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; - //cudaSafeCall( cudaGetLastError() ); -#ifdef 
_DEBUG - cudaSafeCall(cudaDeviceSynchronize()); - //cutilCheckMsg(__FUNCTION__); -#endif -} +#define WARP_SIZE 32 +__global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParams, int numcams) { + __shared__ uint all_warp_ballot; + __shared__ uint voxels[16]; -__global__ void integrateRegistrationKernel(HashData hashData, HashParams hashParams, DepthCameraParams cameraParams, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { + const uint i = threadIdx.x; //inside of an SDF block + const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); // Stride over all allocated blocks for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { //TODO check if we should load this in shared memory - HashEntry& entry = hashData.d_hashCompactified[bi]; + //HashEntryHead entry = hashData.d_hashCompactified[bi]->head; + int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ)); - int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(entry.pos); + //uint idx = entry.offset + i; + int3 pi = pi_base + po; + float3 pfb = hashData.virtualVoxelPosToWorld(pi); + int count = 0; + //float camdepths[MAX_CAMERAS]; - uint i = threadIdx.x; //inside of an SDF block - int3 pi = pi_base + make_int3(hashData.delinearizeVoxelIndex(i)); - float3 pf = hashData.virtualVoxelPosToWorld(pi); + Voxel oldVoxel; // = hashData.d_SDFBlocks[idx]; + hashData.deleteVoxel(oldVoxel); - pf = hashParams.m_rigidTransformInverse * pf; - uint2 screenPos = make_uint2(cameraParams.cameraToKinectScreenInt(pf)); + for (uint cam=0; cam<numcams; ++cam) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + float3 pf = camera.poseInverse * pfb; + uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); - // For this voxel in hash, get its screen position and check it is on screen - if (screenPos.x < cameraParams.m_imageWidth && screenPos.y < cameraParams.m_imageHeight) { //on screen + // For this 
voxel in hash, get its screen position and check it is on screen + if (screenPos.x < camera.params.m_imageWidth && screenPos.y < camera.params.m_imageHeight) { //on screen - //float depth = g_InputDepth[screenPos]; - float depth = tex2D<float>(depthT, screenPos.x, screenPos.y); - //if (depth > 20.0f) return; - - uchar4 color = make_uchar4(0, 0, 0, 0); - color = tex2D<uchar4>(colourT, screenPos.x, screenPos.y); - - // Depth is within accepted max distance from camera - if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) - float depthZeroOne = cameraParams.cameraToKinectProjZ(depth); - - // Calculate SDF of this voxel wrt the depth map value - float sdf = depth - pf.z; - float truncation = hashData.getTruncation(depth); - - if (sdf > -truncation) { - float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f); + //float depth = g_InputDepth[screenPos]; + float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y); + //if (depth > 20.0f) return; - Voxel curr; //construct current voxel - curr.sdf = sdf; - curr.weight = weightUpdate; - curr.color = make_uchar3(color.x, color.y, color.z); - - uint idx = entry.ptr + i; - - Voxel out; - const Voxel &v1 = curr; - const Voxel &v0 = hashData.d_SDFBlocks[idx]; + //uchar4 color = make_uchar4(0, 0, 0, 0); + //if (cameraData.d_colorData) { + //color = (cam == 0) ? make_uchar4(255,0,0,255) : make_uchar4(0,0,255,255); + //color = tex2D<uchar4>(camera.colour, screenPos.x, screenPos.y); + //color = bilinearFilterColor(cameraData.cameraToKinectScreenFloat(pf)); + //} - float redshift = (v0.weight > 0) ? 
1.0f - ((v1.sdf - v0.sdf) / hashParams.m_truncation)*0.5f : 1.0f; + //printf("screen pos %d\n", color.x); + //return; + + // Depth is within accepted max distance from camera + if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) + //camdepths[count] = depth; + ++count; + + // Calculate SDF of this voxel wrt the depth map value + float sdf = depth - pf.z; + float truncation = hashData.getTruncation(depth); + float depthZeroOne = camera.params.cameraToKinectProjZ(depth); + + // Is this voxel close enough to cam for depth map value + // CHECK Nick: If is too close then free space violation so remove? + if (sdf > -truncation) // && depthZeroOne >= 0.0f && depthZeroOne <= 1.0f) //check if in truncation range should already be made in depth map computation + { + float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f); + + Voxel curr; //construct current voxel + curr.sdf = sdf; + curr.weight = weightUpdate; + //curr.color = make_uchar3(color.x, color.y, color.z); + + + //if (entry.flags != cameraParams.flags & 0xFF) { + // entry.flags = cameraParams.flags & 0xFF; + //hashData.d_SDFBlocks[idx].color = make_uchar3(0,0,0); + //} + + Voxel newVoxel; + //if (color.x == MINF) hashData.combineVoxelDepthOnly(hashData.d_SDFBlocks[idx], curr, newVoxel); + //else hashData.combineVoxel(hashData.d_SDFBlocks[idx], curr, newVoxel); + hashData.combineVoxel(oldVoxel, curr, newVoxel); + + oldVoxel = newVoxel; + + //Voxel prev = getVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx); + //Voxel newVoxel = combineVoxel(curr, prev); + //setVoxel(g_SDFBlocksSDFUAV, g_SDFBlocksRGBWUAV, idx, newVoxel); + } + } else { + // Depth is invalid so what to do here? 
+ // TODO(Nick) Use past voxel if available (set weight from 0 to 1) + + // Naive: need to know if this is a foreground voxel + //bool coldist = colordiff(color, hashData.d_SDFBlocks[idx].color, 5.0f); + //if (!coldist) ++count; - out.color.x = min(max(v1.color.x*redshift,0.0f),255.0f); - out.color.y = min(max(v1.color.y*redshift,0.0f),255.0f); - out.color.z = min(max(v1.color.z*(1.0f / redshift),0.0f),255.0f); + } + } + } - out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); - out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight); + // Calculate voxel sign values across a warp + int warpNum = i / WARP_SIZE; + uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); - hashData.d_SDFBlocks[idx] = out; + // Aggregate each warp result into voxel mask + if (i % WARP_SIZE == 0) { + voxels[warpNum] = ballot_result; + } - } + __syncthreads(); + + // Work out if block is occupied or not and save voxel masks + if (i < 16) { + const uint v = voxels[i]; + hashData.d_hashCompactified[bi]->voxels[i] = v; + const uint mask = 0x0000FFFF; + uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF); + uint b2 = __ballot_sync(mask, v == 0); + if (i == 0) { + if (b1 != mask && b2 != mask) hashData.d_hashCompactified[bi]->head.flags |= ftl::voxhash::kFlagSurface; + else hashData.d_hashCompactified[bi]->head.flags &= ~ftl::voxhash::kFlagSurface; } } - } // Stride loop + } } -void ftl::cuda::integrateRegistration(HashData& hashData, const HashParams& hashParams, - const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream) { - const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; - const dim3 gridSize(NUM_CUDA_BLOCKS, 1); - const dim3 blockSize(threadsPerBlock, 1); - integrateRegistrationKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, depthCameraParams, 
depthCameraData.depth_obj_, depthCameraData.colour_obj_); +void ftl::cuda::integrateDepthMaps(HashData& hashData, const HashParams& hashParams, int numcams, cudaStream_t stream) { +const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; +const dim3 gridSize(NUM_CUDA_BLOCKS, 1); +const dim3 blockSize(threadsPerBlock, 1); + +//if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) + integrateDepthMapsKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); +//} -} \ No newline at end of file +//cudaSafeCall( cudaGetLastError() ); +#ifdef _DEBUG +cudaSafeCall(cudaDeviceSynchronize()); +//cutilCheckMsg(__FUNCTION__); +#endif +} diff --git a/applications/reconstruct/src/integrators.hpp b/applications/reconstruct/src/integrators.hpp index 756ff2ea45e556592714665af504191108880f3d..789551dd1fa7347bf02c518c8c5a73f6ae4269b4 100644 --- a/applications/reconstruct/src/integrators.hpp +++ b/applications/reconstruct/src/integrators.hpp @@ -7,11 +7,14 @@ namespace ftl { namespace cuda { -void integrateDepthMap(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, +/*void integrateDepthMap(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream); void integrateRegistration(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream); +*/ + +void integrateDepthMaps(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, int numcams, cudaStream_t stream); } } diff --git a/applications/reconstruct/src/main.cpp b/applications/reconstruct/src/main.cpp index 650c74b61f7a3ebdaf79d396ee425b43a6fffba2..4f05ef2d32644a728d9d31874de29c8e2a87087d 100644 --- 
a/applications/reconstruct/src/main.cpp +++ b/applications/reconstruct/src/main.cpp @@ -126,11 +126,12 @@ static void run(ftl::Configurable *root) { //stream->wait(); cudaSafeCall(cudaStreamSynchronize(scene->getIntegrationStream())); - //LOG(INFO) << "Heap: " << scene->getHeapFreeCount(); // Merge new frames into the voxel structure scene->integrate(); + //LOG(INFO) << "Allocated: " << scene->getOccupiedCount(); + // Remove any redundant voxels scene->garbage(); diff --git a/applications/reconstruct/src/scene_rep_hash_sdf.cu b/applications/reconstruct/src/scene_rep_hash_sdf.cu index 4f57bf370b355148662444b393d29622cb0c792b..5c5ebf6fdb73be59bf028d850ce4fa41e6986925 100644 --- a/applications/reconstruct/src/scene_rep_hash_sdf.cu +++ b/applications/reconstruct/src/scene_rep_hash_sdf.cu @@ -26,6 +26,7 @@ using ftl::voxhash::FREE_ENTRY; __device__ __constant__ HashParams c_hashParams; __device__ __constant__ RayCastParams c_rayCastParams; //__device__ __constant__ DepthCameraParams c_depthCameraParams; +__device__ __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; extern "C" void updateConstantHashParams(const HashParams& params) { @@ -40,7 +41,7 @@ extern "C" void updateConstantHashParams(const HashParams& params) { } -extern "C" void updateConstantRayCastParams(const RayCastParams& params) { +/*extern "C" void updateConstantRayCastParams(const RayCastParams& params) { //printf("Update ray cast params\n"); size_t size; cudaSafeCall(cudaGetSymbolSize(&size, c_rayCastParams)); @@ -51,30 +52,23 @@ extern "C" void updateConstantRayCastParams(const RayCastParams& params) { //cutilCheckMsg(__FUNCTION__); #endif -} +}*/ -/*extern "C" void updateConstantDepthCameraParams(const DepthCameraParams& params) { +extern "C" void updateCUDACameraConstant(ftl::voxhash::DepthCameraCUDA *data, int count) { + if (count == 0 || count >= MAX_CAMERAS) return; //printf("Update depth camera params\n"); size_t size; - cudaSafeCall(cudaGetSymbolSize(&size, c_depthCameraParams)); 
- cudaSafeCall(cudaMemcpyToSymbol(c_depthCameraParams, &params, size, 0, cudaMemcpyHostToDevice));
1)/(T_PER_BLOCK*T_PER_BLOCK), 1); + //const dim3 blockSize((T_PER_BLOCK*T_PER_BLOCK), 1); - resetHeapKernel<<<gridSize, blockSize>>>(hashData); + //resetHeapKernel<<<gridSize, blockSize>>>(hashData); - #ifdef _DEBUG - cudaSafeCall(cudaDeviceSynchronize()); + //#ifdef _DEBUG + // cudaSafeCall(cudaDeviceSynchronize()); //cutilCheckMsg(__FUNCTION__); - #endif + //#endif - } + //} { //resetting the hash @@ -212,17 +206,18 @@ bool isSDFBlockStreamedOut(const int3& sdfBlock, const HashData& hashData, const // Note: bitMask used for Streaming out code... could be set to nullptr if not streaming out // Note: Allocations might need to be around fat rays since multiple voxels could correspond // to same depth map pixel at larger distances. -__global__ void allocKernel(HashData hashData, DepthCameraData cameraData, HashParams hashParams, DepthCameraParams cameraParams) +__global__ void allocKernel(HashData hashData, HashParams hashParams, int camnum) { //const HashParams& hashParams = c_hashParams; - //const DepthCameraParams& cameraParams = c_depthCameraParams; + const ftl::voxhash::DepthCameraCUDA camera = c_cameras[camnum]; + const DepthCameraParams &cameraParams = camera.params; const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; if (x < cameraParams.m_imageWidth && y < cameraParams.m_imageHeight) { - float d = tex2D<float>(cameraData.depth_obj_, x, y); + float d = tex2D<float>(camera.depth, x, y); //if (d == MINF || d < cameraParams.m_sensorDepthWorldMin || d > cameraParams.m_sensorDepthWorldMax) return; if (d == MINF || d == 0.0f) return; @@ -239,10 +234,10 @@ __global__ void allocKernel(HashData hashData, DepthCameraData cameraData, HashP // camera intrinsics? Same as what reprojectTo3D does in OpenCV? float3 rayMin = cameraParams.kinectDepthToSkeleton(x, y, minDepth); // Is the rigid transform then the estimated camera pose? 
- rayMin = hashParams.m_rigidTransform * rayMin; + rayMin = camera.pose * rayMin; //printf("Ray min: %f,%f,%f\n", rayMin.x, rayMin.y, rayMin.z); float3 rayMax = cameraParams.kinectDepthToSkeleton(x, y, maxDepth); - rayMax = hashParams.m_rigidTransform * rayMax; + rayMax = camera.pose * rayMax; float3 rayDir = normalize(rayMax - rayMin); @@ -278,7 +273,7 @@ __global__ void allocKernel(HashData hashData, DepthCameraData cameraData, HashP //check if it's in the frustum and not checked out if (hashData.isSDFBlockInCameraFrustumApprox(hashParams, cameraParams, idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { - hashData.allocBlock(idCurrentVoxel, cameraParams.flags & 0xFF); + hashData.allocBlock(idCurrentVoxel); //printf("Allocate block: %d\n",idCurrentVoxel.x); } @@ -304,7 +299,7 @@ __global__ void allocKernel(HashData hashData, DepthCameraData cameraData, HashP } } -extern "C" void allocCUDA(HashData& hashData, const HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream) +extern "C" void allocCUDA(HashData& hashData, const HashParams& hashParams, int camnum, const DepthCameraParams &depthCameraParams, cudaStream_t stream) { //printf("Allocating: %d\n",depthCameraParams.m_imageWidth); @@ -312,7 +307,7 @@ extern "C" void allocCUDA(HashData& hashData, const HashParams& hashParams, cons const dim3 gridSize((depthCameraParams.m_imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthCameraParams.m_imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - allocKernel<<<gridSize, blockSize, 0, stream>>>(hashData, depthCameraData, hashParams, depthCameraParams); + allocKernel<<<gridSize, blockSize, 0, stream>>>(hashData, hashParams, camnum); #ifdef _DEBUG diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp index 
cb513c1da0b4b6952ffebb8a186a9d062070bbfe..a9d5e57dc712829b592f56e8226abe0f563dc665 100644 --- a/applications/reconstruct/src/splat_render.cpp +++ b/applications/reconstruct/src/splat_render.cpp @@ -47,13 +47,15 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) { params.camera.m_sensorDepthWorldMax = camera.maxDepth; params.camera.m_sensorDepthWorldMin = camera.minDepth; - ftl::cuda::compactifyVisible(scene_->getHashData(), scene_->getHashParams(), params.camera, stream); - ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, colour1_, params, stream); - ftl::cuda::splat_points(depth1_, colour1_, depth2_, colour2_, params, stream); + ftl::cuda::compactifyAllocated(scene_->getHashData(), scene_->getHashParams(), stream); + LOG(INFO) << "Occupied: " << scene_->getOccupiedCount(); + ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream); + ftl::cuda::splat_points(depth1_, depth2_, params, stream); + ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream); // TODO: Second pass - src->writeFrames(colour2_, depth2_, stream); + src->writeFrames(colour1_, depth2_, stream); } void Splatter::setOutputDevice(int device) { diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/splat_render.cu index 12be18e048443ce05d3907311c0a2f0c43e9cda3..0e3598f129bf40c72294ab19501b1fd3c95bb1e9 100644 --- a/applications/reconstruct/src/splat_render.cu +++ b/applications/reconstruct/src/splat_render.cu @@ -13,13 +13,13 @@ using ftl::cuda::TextureObject; using ftl::render::SplatParams; -__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, TextureObject<uchar4> colour) { +__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; if (x < depth.width() && y < depth.height()) { depth(x,y) = 
0x7f800000; //PINF; - colour(x,y) = make_uchar4(76,76,82,0); + //colour(x,y) = make_uchar4(76,76,82,0); } } @@ -72,10 +72,14 @@ __device__ inline uint blockLinear(int x, int y, int z) { return x + (y << 1) + (z << 2); } -__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, TextureObject<uchar4> colour, SplatParams params) { +__device__ inline bool getVoxel(uint *voxels, int ix) { + return voxels[ix/32] & (0x1 << (ix % 32)); +} + +__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) { // TODO:(Nick) Reduce bank conflicts by aligning these - __shared__ ftl::voxhash::Voxel voxels[SDF_BLOCK_BUFFER]; - __shared__ ftl::voxhash::HashEntry block; + __shared__ uint voxels[16]; + __shared__ ftl::voxhash::HashEntryHead block; // Stride over all allocated blocks for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { @@ -83,34 +87,38 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture const uint i = threadIdx.x; //inside of an SDF block - if (i == 0) block = hashData.d_hashCompactified[bi]; + if (i == 0) block = hashData.d_hashCompactified[bi]->head; + if (i < 16) voxels[i] = hashData.d_hashCompactified[bi]->voxels[i]; // Make sure all hash entries are cached __syncthreads(); - const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(block.pos); + const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ)); const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i)); const int3 pi = pi_base + vp; - const uint j = plinVoxelPos(vp); // Padded linear index + //const uint j = plinVoxelPos(vp); // Padded linear index const float3 worldPos = hashData.virtualVoxelPosToWorld(pi); // Load distances and colours into shared memory + padding - const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[block.ptr + i]; - voxels[j] = v; + //const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[block.ptr + 
i]; + //voxels[j] = v; + const bool v = getVoxel(voxels, i); - __syncthreads(); + //__syncthreads(); - if (voxels[j].weight == 0) continue; + //if (voxels[j].weight == 0) continue; + if (vp.x == 7 || vp.y == 7 || vp.z == 7) continue; int edgeX = (vp.x == 0 ) ? 1 : 0; int edgeY = (vp.y == 0 ) ? 1 : 0; int edgeZ = (vp.z == 0 ) ? 1 : 0; - bool is_surface = ((params.m_flags & kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2); - if (is_surface) voxels[j].color = make_uchar3(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0); + uchar4 color = make_uchar4(255,0,0,255); + bool is_surface = false; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2); + //if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255); - if (!is_surface && voxels[j].sdf <= 0.0f) continue; + if (!is_surface && v) continue; //if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0); @@ -125,10 +133,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture const int3 uvi = make_int3(vp.x+u,vp.y+v,vp.z+w); // Skip these cases since we didn't load voxels properly - if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue; + //if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue; - const auto &vox = voxels[plinVoxelPos(uvi)]; - if (vox.weight > 0 && vox.sdf <= 0.0f) { + const bool vox = getVoxel(voxels, hashData.linearizeVoxelPos(uvi)); + if (vox) { is_surface = true; // Should break but is slower? 
} @@ -139,6 +147,8 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture // Only for surface voxels, work out screen coordinates if (!is_surface) continue; + // TODO: For each original camera, render a new depth map + const float3 camPos = params.m_viewMatrix * worldPos; const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f) @@ -157,27 +167,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture // See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds if (x < depth.width() && y < depth.height()) { - // TODO:(Nick) Would warp and shared mem considerations improve things? - // Probably not given we now have thin surfaces... - - // FIXME: Needs a lock here to ensure correct colour is written. - if (atomicMin(&depth(x,y), idepth) > idepth) { - colour(x,y) = make_uchar4(voxels[j].color.x, voxels[j].color.y, voxels[j].color.z, 255); - } - - /*bool p = false; - while (!p) { - int ld = atomicExch(&depth(x,y), LOCKED); - if (ld != LOCKED) { - p = true; - if (ld > idepth) { - colour(x,y) = make_uchar4(voxels[j].color.x, voxels[j].color.y, voxels[j].color.z, 255); - depth(x,y) = idepth; - } else { - depth(x,y) = ld; - } - } - }*/ + atomicMin(&depth(x,y), idepth); } } // Stride @@ -185,13 +175,12 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData, const TextureObject<uint> &depth, - const TextureObject<uchar4> &colour, const SplatParams ¶ms, cudaStream_t stream) { const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); - clearDepthKernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(hashData, depth, colour); + clearDepthKernel<<<clear_gridSize, 
clear_blockSize, 0, stream>>>(hashData, depth); //cudaSafeCall( cudaDeviceSynchronize() ); @@ -199,7 +188,7 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData, const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); - isosurface_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, colour, params); + isosurface_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params); cudaSafeCall( cudaGetLastError() ); //cudaSafeCall( cudaDeviceSynchronize() ); @@ -221,9 +210,7 @@ __device__ float distance2(float3 a, float3 b) { __global__ void splatting_kernel( TextureObject<uint> depth_in, - TextureObject<uchar4> colour_in, - TextureObject<float> depth_out, - TextureObject<uchar4> colour_out, SplatParams params) { + TextureObject<float> depth_out, SplatParams params) { // Read an NxN region and // - interpolate a depth value for this pixel // - interpolate an rgb value for this pixel @@ -291,11 +278,11 @@ __global__ void splatting_kernel( if (minidx == -1) { depth_out(x,y) = 0.0f; - colour_out(x,y) = make_uchar4(76,76,82,255); + //colour_out(x,y) = make_uchar4(76,76,82,255); return; } - float3 colour = make_float3(0.0f, 0.0f, 0.0f); + //float3 colour = make_float3(0.0f, 0.0f, 0.0f); float depth = 0.0f; float contrib = 0.0f; float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth); // TODO:(Nick) Mindepth assumption is poor choice. 
@@ -314,32 +301,32 @@ __global__ void splatting_kernel( // Fast and simple trilinear interpolation float c = fabs((1.0f - delta.x) * (1.0f - delta.y) * (1.0f - delta.z)); - uchar4 col = colour_in.tex2D((int)sx, (int)sy); - colour.x += col.x*c; - colour.y += col.y*c; - colour.z += col.z*c; + //uchar4 col = colour_in.tex2D((int)sx, (int)sy); + //colour.x += col.x*c; + //colour.y += col.y*c; + //colour.z += col.z*c; contrib += c; depth += posp.z * c; } } // Normalise - colour.x /= contrib; - colour.y /= contrib; - colour.z /= contrib; + //colour.x /= contrib; + //colour.y /= contrib; + //colour.z /= contrib; depth /= contrib; depth_out(x,y) = depth; - colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255); + //colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255); } -void ftl::cuda::splat_points(const TextureObject<uint> &depth_in, const TextureObject<uchar4> &colour_in, - const TextureObject<float> &depth_out, const TextureObject<uchar4> &colour_out, const SplatParams ¶ms, cudaStream_t stream) +void ftl::cuda::splat_points(const TextureObject<uint> &depth_in, + const TextureObject<float> &depth_out, const SplatParams ¶ms, cudaStream_t stream) { const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, colour_in, depth_out, colour_out, params); + splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params); cudaSafeCall( cudaGetLastError() ); } diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp index 1e03801349fe0049b5eab1000a1566541383e35f..a36e80ad833f258eced3acd63560e6b315995391 100644 --- a/applications/reconstruct/src/splat_render.hpp +++ b/applications/reconstruct/src/splat_render.hpp @@ -5,7 +5,7 @@ #include <ftl/rgbd/source.hpp> #include <ftl/depth_camera.hpp> #include <ftl/voxel_scene.hpp> 
-#include <ftl/ray_cast_util.hpp> +//#include <ftl/ray_cast_util.hpp> #include <ftl/cuda_common.hpp> #include "splat_params.hpp" diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp index c4bd724add53d72c478cd169c323edcc20ad3d9e..deb702e1661ca8f36fcea7aa6b4b6a115439cc7b 100644 --- a/applications/reconstruct/src/splat_render_cuda.hpp +++ b/applications/reconstruct/src/splat_render_cuda.hpp @@ -3,7 +3,7 @@ #include <ftl/depth_camera.hpp> #include <ftl/voxel_hash.hpp> -#include <ftl/ray_cast_util.hpp> +//#include <ftl/ray_cast_util.hpp> #include "splat_params.hpp" @@ -16,7 +16,6 @@ namespace cuda { */ void isosurface_point_image(const ftl::voxhash::HashData& hashData, const ftl::cuda::TextureObject<uint> &depth, - const ftl::cuda::TextureObject<uchar4> &colour, const ftl::render::SplatParams ¶ms, cudaStream_t stream); //void isosurface_point_image_stereo(const ftl::voxhash::HashData& hashData, @@ -27,9 +26,11 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData, // TODO: isosurface_point_cloud void splat_points(const ftl::cuda::TextureObject<uint> &depth_in, - const ftl::cuda::TextureObject<uchar4> &colour_in, const ftl::cuda::TextureObject<float> &depth_out, - const ftl::cuda::TextureObject<uchar4> &colour_out, + const ftl::render::SplatParams ¶ms, cudaStream_t stream); + +void dibr(const ftl::cuda::TextureObject<float> &depth_in, + const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams, const ftl::render::SplatParams ¶ms, cudaStream_t stream); } diff --git a/applications/reconstruct/src/voxel_hash.cpp b/applications/reconstruct/src/voxel_hash.cpp index d939556cfe364190597e3adc41295bed83ac41cf..14cb75078652001275404e87c09f77f0b9d19385 100644 --- a/applications/reconstruct/src/voxel_hash.cpp +++ b/applications/reconstruct/src/voxel_hash.cpp @@ -6,24 +6,18 @@ using ftl::voxhash::HashParams; void HashData::allocate(const HashParams& params, bool dataOnGPU) { m_bIsOnGPU = dataOnGPU; 
if (m_bIsOnGPU) { - cudaSafeCall(cudaMalloc(&d_heap, sizeof(unsigned int) * params.m_numSDFBlocks)); - cudaSafeCall(cudaMalloc(&d_heapCounter, sizeof(unsigned int))); cudaSafeCall(cudaMalloc(&d_hash, sizeof(HashEntry)* params.m_hashNumBuckets)); cudaSafeCall(cudaMalloc(&d_hashDecision, sizeof(int)* params.m_hashNumBuckets)); cudaSafeCall(cudaMalloc(&d_hashDecisionPrefix, sizeof(int)* params.m_hashNumBuckets)); - cudaSafeCall(cudaMalloc(&d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets)); + cudaSafeCall(cudaMalloc(&d_hashCompactified, sizeof(HashEntry*)* params.m_hashNumBuckets)); cudaSafeCall(cudaMalloc(&d_hashCompactifiedCounter, sizeof(int))); - cudaSafeCall(cudaMalloc(&d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize)); cudaSafeCall(cudaMalloc(&d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets)); } else { - d_heap = new unsigned int[params.m_numSDFBlocks]; - d_heapCounter = new unsigned int[1]; d_hash = new HashEntry[params.m_hashNumBuckets]; d_hashDecision = new int[params.m_hashNumBuckets]; d_hashDecisionPrefix = new int[params.m_hashNumBuckets]; - d_hashCompactified = new HashEntry[params.m_hashNumBuckets]; + d_hashCompactified = new HashEntry*[params.m_hashNumBuckets]; d_hashCompactifiedCounter = new int[1]; - d_SDFBlocks = new Voxel[params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize]; d_hashBucketMutex = new int[params.m_hashNumBuckets]; } @@ -38,35 +32,26 @@ void HashData::updateParams(const HashParams& params) { void HashData::free() { if (m_bIsOnGPU) { - cudaSafeCall(cudaFree(d_heap)); - cudaSafeCall(cudaFree(d_heapCounter)); cudaSafeCall(cudaFree(d_hash)); cudaSafeCall(cudaFree(d_hashDecision)); cudaSafeCall(cudaFree(d_hashDecisionPrefix)); cudaSafeCall(cudaFree(d_hashCompactified)); cudaSafeCall(cudaFree(d_hashCompactifiedCounter)); - cudaSafeCall(cudaFree(d_SDFBlocks)); cudaSafeCall(cudaFree(d_hashBucketMutex)); } else { - 
if (d_heap) delete[] d_heap; - if (d_heapCounter) delete[] d_heapCounter; if (d_hash) delete[] d_hash; if (d_hashDecision) delete[] d_hashDecision; if (d_hashDecisionPrefix) delete[] d_hashDecisionPrefix; if (d_hashCompactified) delete[] d_hashCompactified; if (d_hashCompactifiedCounter) delete[] d_hashCompactifiedCounter; - if (d_SDFBlocks) delete[] d_SDFBlocks; if (d_hashBucketMutex) delete[] d_hashBucketMutex; } d_hash = NULL; - d_heap = NULL; - d_heapCounter = NULL; d_hashDecision = NULL; d_hashDecisionPrefix = NULL; d_hashCompactified = NULL; d_hashCompactifiedCounter = NULL; - d_SDFBlocks = NULL; d_hashBucketMutex = NULL; } @@ -76,14 +61,11 @@ HashData HashData::download() const { HashData hashData; hashData.allocate(params, false); //allocate the data on the CPU - cudaSafeCall(cudaMemcpy(hashData.d_heap, d_heap, sizeof(unsigned int) * params.m_numSDFBlocks, cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(hashData.d_heapCounter, d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(hashData.d_hash, d_hash, sizeof(HashEntry)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(hashData.d_hashDecision, d_hashDecision, sizeof(int)*params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(hashData.d_hashDecisionPrefix, d_hashDecisionPrefix, sizeof(int)*params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry*)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(hashData.d_hashCompactifiedCounter, d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(hashData.d_SDFBlocks, d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize, 
cudaMemcpyDeviceToHost)); cudaSafeCall(cudaMemcpy(hashData.d_hashBucketMutex, d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets, cudaMemcpyDeviceToHost)); return hashData; @@ -95,21 +77,18 @@ HashData HashData::upload() const { HashData hashData; hashData.allocate(params, false); //allocate the data on the CPU - cudaSafeCall(cudaMemcpy(hashData.d_heap, d_heap, sizeof(unsigned int) * params.m_numSDFBlocks, cudaMemcpyHostToDevice)); - cudaSafeCall(cudaMemcpy(hashData.d_heapCounter, d_heapCounter, sizeof(unsigned int), cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hash, d_hash, sizeof(HashEntry)* params.m_hashNumBuckets, cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hashDecision, d_hashDecision, sizeof(int)*params.m_hashNumBuckets, cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hashDecisionPrefix, d_hashDecisionPrefix, sizeof(int)*params.m_hashNumBuckets, cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hashCompactified, d_hashCompactified, sizeof(HashEntry)* params.m_hashNumBuckets, cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hashCompactifiedCounter, d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyHostToDevice)); - cudaSafeCall(cudaMemcpy(hashData.d_SDFBlocks, d_SDFBlocks, sizeof(Voxel) * params.m_numSDFBlocks * params.m_SDFBlockSize*params.m_SDFBlockSize*params.m_SDFBlockSize, cudaMemcpyHostToDevice)); cudaSafeCall(cudaMemcpy(hashData.d_hashBucketMutex, d_hashBucketMutex, sizeof(int)* params.m_hashNumBuckets, cudaMemcpyHostToDevice)); return hashData; } -size_t HashData::getAllocatedBlocks() const { +/*size_t HashData::getAllocatedBlocks() const { unsigned int count; cudaSafeCall(cudaMemcpy(d_heapCounter, &count, sizeof(unsigned int), cudaMemcpyDeviceToHost)); return count; -} +}*/ diff --git a/applications/reconstruct/src/voxel_hash.cu b/applications/reconstruct/src/voxel_hash.cu index 7447a1c54eea72e7e8ee888a148fa3ee73a9473e..c2d07c391a6e48d2b45cc23dbf32b00878ffd5c9 100644 
--- a/applications/reconstruct/src/voxel_hash.cu +++ b/applications/reconstruct/src/voxel_hash.cu @@ -2,35 +2,42 @@ using namespace ftl::voxhash; -#define COLLISION_LIST_SIZE 5 +#define COLLISION_LIST_SIZE 6 + +__device__ inline uint64_t compactPosition(const int3 &pos) { + union __align__(8) { + short4 posXYZ; + uint64_t pos64; + }; + posXYZ.x = pos.x; posXYZ.y = pos.y; posXYZ.z = pos.z; posXYZ.w = 0; + return pos64; +} //! returns the hash entry for a given sdf block id; if there was no hash entry the returned entry will have a ptr with FREE_ENTRY set __device__ -HashEntry HashData::getHashEntryForSDFBlockPos(const int3& sdfBlock) const { +int HashData::getHashEntryForSDFBlockPos(const int3& sdfBlock) const { uint h = computeHashPos(sdfBlock); //hash - int3 pos = sdfBlock; + uint64_t pos = compactPosition(sdfBlock); - HashEntry curr; + HashEntryHead curr; int i = h; unsigned int maxIter = 0; #pragma unroll 2 while (maxIter < COLLISION_LIST_SIZE) { - curr = d_hash[i]; + curr = d_hash[i].head; - if (curr.pos == pos && curr.ptr != FREE_ENTRY) return curr; - if (curr.offset == 0) break; + if (curr.pos == pos && curr.offset != FREE_ENTRY) return i; + if (curr.offset == 0 || curr.offset == FREE_ENTRY) break; i += curr.offset; //go to next element in the list i %= (params().m_hashNumBuckets); //check for overflow ++maxIter; } - // Could not find so return dummy - curr.pos = pos; - curr.ptr = FREE_ENTRY; - return curr; + // Could not find + return -1; } //for histogram (collisions traversal only) @@ -39,15 +46,15 @@ unsigned int HashData::getNumHashLinkedList(unsigned int bucketID) { unsigned int listLen = 0; unsigned int i = bucketID; //start with the last entry of the current bucket - HashEntry curr; curr.offset = 0; + HashEntryHead curr; curr.offset = 0; unsigned int maxIter = 0; #pragma unroll 2 while (maxIter < COLLISION_LIST_SIZE) { - curr = d_hash[i]; + curr = d_hash[i].head; - if (curr.offset == 0) break; + if (curr.offset == 0 || curr.offset == FREE_ENTRY) 
break; i += curr.offset; //go to next element in the list i %= (params().m_hashNumBuckets); //check for overflow @@ -60,18 +67,19 @@ unsigned int HashData::getNumHashLinkedList(unsigned int bucketID) { //pos in SDF block coordinates __device__ -void HashData::allocBlock(const int3& pos, const uchar frame) { +void HashData::allocBlock(const int3& pos) { uint h = computeHashPos(pos); //hash bucket uint i = h; - HashEntry curr; curr.offset = 0; + HashEntryHead curr; //curr.offset = 0; + const uint64_t pos64 = compactPosition(pos); unsigned int maxIter = 0; #pragma unroll 2 while (maxIter < COLLISION_LIST_SIZE) { //offset = curr.offset; - curr = d_hash[i]; //TODO MATTHIAS do by reference - if (curr.pos == pos && curr.ptr != FREE_ENTRY) return; - if (curr.offset == 0) break; + curr = d_hash[i].head; //TODO MATTHIAS do by reference + if (curr.pos == pos64 && curr.offset != FREE_ENTRY) return; + if (curr.offset == 0 || curr.offset == FREE_ENTRY) break; i += curr.offset; //go to next element in the list i %= (params().m_hashNumBuckets); //check for overflow @@ -79,25 +87,32 @@ void HashData::allocBlock(const int3& pos, const uchar frame) { } // Limit reached... 
- if (curr.offset != 0) return; + //if (maxIter == COLLISION_LIST_SIZE) return; - int j = i+1; + int j = i; while (maxIter < COLLISION_LIST_SIZE) { //offset = curr.offset; - curr = d_hash[j]; //TODO MATTHIAS do by reference - if (curr.ptr == FREE_ENTRY) { + + if (curr.offset == FREE_ENTRY) { int prevValue = atomicExch(&d_hashBucketMutex[i], LOCK_ENTRY); if (prevValue != LOCK_ENTRY) { - //InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket where we have found a free entry - prevValue = atomicExch(&d_hashBucketMutex[j], LOCK_ENTRY); - if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked - HashEntry& entry = d_hash[j]; - entry.pos = pos; + if (i == j) { + HashEntryHead& entry = d_hash[j].head; + entry.pos = pos64; entry.offset = 0; - entry.flags = 0; // Flag block as valid in this frame (Nick) - entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc - d_hash[i].offset = j-i; - //setHashEntry(g_Hash, idxLastEntryInBucket, lastEntryInBucket); + entry.flags = 0; + } else { + //InterlockedExchange(g_HashBucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket where we have found a free entry + prevValue = atomicExch(&d_hashBucketMutex[j], LOCK_ENTRY); + if (prevValue != LOCK_ENTRY) { //only proceed if the bucket has been locked + HashEntryHead& entry = d_hash[j].head; + entry.pos = pos64; + entry.offset = 0; + entry.flags = 0; // Flag block as valid in this frame (Nick) + //entry.ptr = consumeHeap() * SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE; //memory alloc + d_hash[i].head.offset = j-i; + //setHashEntry(g_Hash, idxLastEntryInBucket, lastEntryInBucket); + } } } return; //bucket was already locked @@ -105,6 +120,7 @@ void HashData::allocBlock(const int3& pos, const uchar frame) { ++j; j %= (params().m_hashNumBuckets); //check for overflow + curr = d_hash[j].head; //TODO MATTHIAS do by reference ++maxIter; } } @@ -194,18 +210,19 @@ bool HashData::insertHashEntry(HashEntry 
entry) __device__ bool HashData::deleteHashEntryElement(const int3& sdfBlock) { uint h = computeHashPos(sdfBlock); //hash bucket + const uint64_t pos = compactPosition(sdfBlock); int i = h; int prev = -1; - HashEntry curr; + HashEntryHead curr; unsigned int maxIter = 0; #pragma unroll 2 while (maxIter < COLLISION_LIST_SIZE) { - curr = d_hash[i]; + curr = d_hash[i].head; //found that dude that we need/want to delete - if (curr.pos == sdfBlock && curr.ptr != FREE_ENTRY) { + if (curr.pos == pos && curr.offset != FREE_ENTRY) { //int prevValue = 0; //InterlockedExchange(bucketMutex[h], LOCK_ENTRY, prevValue); //lock the hash bucket int prevValue = atomicExch(&d_hashBucketMutex[i], LOCK_ENTRY); @@ -214,19 +231,19 @@ bool HashData::deleteHashEntryElement(const int3& sdfBlock) { prevValue = (prev >= 0) ? atomicExch(&d_hashBucketMutex[prev], LOCK_ENTRY) : 0; if (prevValue == LOCK_ENTRY) return false; if (prevValue != LOCK_ENTRY) { - const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; - appendHeap(curr.ptr / linBlockSize); + //const uint linBlockSize = SDF_BLOCK_SIZE * SDF_BLOCK_SIZE * SDF_BLOCK_SIZE; + //appendHeap(curr.ptr / linBlockSize); deleteHashEntry(i); if (prev >= 0) { - d_hash[prev].offset = curr.offset; + d_hash[prev].head.offset = curr.offset; } return true; } } } - if (curr.offset == 0) { //we have found the end of the list + if (curr.offset == 0 || curr.offset == FREE_ENTRY) { //we have found the end of the list return false; //should actually never happen because we need to find that guy before } prev = i; diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp index e392f55ee5a7f2f23ad3046dcb7b3a83f61132d5..51f9bc75b88b0d8ae73d41d8ba05fb248cb5ae00 100644 --- a/applications/reconstruct/src/voxel_scene.cpp +++ b/applications/reconstruct/src/voxel_scene.cpp @@ -17,11 +17,11 @@ using std::vector; extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& 
hashParams); extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t); -extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t); +extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, int camid, const DepthCameraParams &depthCameraParams, cudaStream_t); //extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData); //extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); //extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); -extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t); +//extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t); //extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData); @@ -97,6 +97,18 @@ void SceneRep::addSource(ftl::rgbd::Source *src) { cam.params.m_imageWidth = 0; } +extern "C" void updateCUDACameraConstant(ftl::voxhash::DepthCameraCUDA *data, int count); + +void SceneRep::_updateCameraConstant() { + std::vector<ftl::voxhash::DepthCameraCUDA> cams(cameras_.size()); + for (size_t i=0; i<cameras_.size(); ++i) { + cams[i] = cameras_[i].gpu.data; + cams[i].pose = MatrixConversion::toCUDA(cameras_[i].source->getPose().cast<float>()); + cams[i].poseInverse = 
MatrixConversion::toCUDA(cameras_[i].source->getPose().cast<float>().inverse()); + } + updateCUDACameraConstant(cams.data(), cams.size()); +} + int SceneRep::upload() { int active = 0; @@ -130,6 +142,15 @@ int SceneRep::upload() { } } + cam.params.flags = m_frameCount; + } + + _updateCameraConstant(); + //cudaSafeCall(cudaDeviceSynchronize()); + + for (size_t i=0; i<cameras_.size(); ++i) { + auto &cam = cameras_[i]; + // Get the RGB-Depth frame from input Source *input = cam.source; Mat rgb, depth; @@ -145,13 +166,11 @@ int SceneRep::upload() { Mat rgba; cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA); - cam.params.flags = m_frameCount; - // Send to GPU and merge view into scene //cam.gpu.updateParams(cam.params); cam.gpu.updateData(depth, rgba, cam.stream); - setLastRigidTransform(input->getPose().cast<float>()); + //setLastRigidTransform(input->getPose().cast<float>()); //make the rigid transform available on the GPU //m_hashData.updateParams(m_hashParams, cv::cuda::StreamAccessor::getStream(cam.stream)); @@ -159,7 +178,7 @@ int SceneRep::upload() { //if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream))); //allocate all hash blocks which are corresponding to depth map entries - _alloc(cam.gpu, cam.params, cv::cuda::StreamAccessor::getStream(cam.stream)); + _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream)); } // Must have finished all allocations and rendering before next integration @@ -169,7 +188,7 @@ int SceneRep::upload() { } void SceneRep::integrate() { - for (size_t i=0; i<cameras_.size(); ++i) { + /*for (size_t i=0; i<cameras_.size(); ++i) { auto &cam = cameras_[i]; setLastRigidTransform(cam.source->getPose().cast<float>()); @@ -179,16 +198,19 @@ void SceneRep::integrate() { _compactifyVisible(cam.params); //volumetrically integrate the depth data into the depth SDFBlocks - _integrateDepthMap(cam.gpu, cam.params); + //_integrateDepthMap(cam.gpu, cam.params); //_garbageCollect(); 
m_numIntegratedFrames++; - } + }*/ + + _compactifyAllocated(); + _integrateDepthMaps(); } void SceneRep::garbage() { - _compactifyAllocated(); + //_compactifyAllocated(); _garbageCollect(); //cudaSafeCall(cudaStreamSynchronize(integ_stream_)); @@ -248,127 +270,19 @@ void SceneRep::nextFrame() { void SceneRep::reset() { m_numIntegratedFrames = 0; - m_hashParams.m_rigidTransform.setIdentity(); - m_hashParams.m_rigidTransformInverse.setIdentity(); + //m_hashParams.m_rigidTransform.setIdentity(); + //m_hashParams.m_rigidTransformInverse.setIdentity(); m_hashParams.m_numOccupiedBlocks = 0; m_hashData.updateParams(m_hashParams); resetCUDA(m_hashData, m_hashParams); } -//! debug only! -unsigned int SceneRep::getHeapFreeCount() { +unsigned int SceneRep::getOccupiedCount() { unsigned int count; - cudaSafeCall(cudaMemcpy(&count, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); + cudaSafeCall(cudaMemcpy(&count, m_hashData.d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); return count+1; //there is one more free than the address suggests (0 would be also a valid address) } -//! debug only! 
-void SceneRep::debugHash() { - HashEntry* hashCPU = new HashEntry[m_hashParams.m_hashNumBuckets]; - unsigned int* heapCPU = new unsigned int[m_hashParams.m_numSDFBlocks]; - unsigned int heapCounterCPU; - - cudaSafeCall(cudaMemcpy(&heapCounterCPU, m_hashData.d_heapCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - heapCounterCPU++; //points to the first free entry: number of blocks is one more - - cudaSafeCall(cudaMemcpy(heapCPU, m_hashData.d_heap, sizeof(unsigned int)*m_hashParams.m_numSDFBlocks, cudaMemcpyDeviceToHost)); - cudaSafeCall(cudaMemcpy(hashCPU, m_hashData.d_hash, sizeof(HashEntry)*m_hashParams.m_hashNumBuckets, cudaMemcpyDeviceToHost)); - - //Check for duplicates - class myint3Voxel { - public: - myint3Voxel() {} - ~myint3Voxel() {} - bool operator<(const myint3Voxel& other) const { - if (x == other.x) { - if (y == other.y) { - return z < other.z; - } - return y < other.y; - } - return x < other.x; - } - - bool operator==(const myint3Voxel& other) const { - return x == other.x && y == other.y && z == other.z; - } - - int x,y,z, i; - int offset; - int ptr; - }; - - - std::unordered_set<unsigned int> pointersFreeHash; - std::vector<int> pointersFreeVec(m_hashParams.m_numSDFBlocks, 0); // CHECK Nick Changed to int from unsigned in - for (unsigned int i = 0; i < heapCounterCPU; i++) { - pointersFreeHash.insert(heapCPU[i]); - pointersFreeVec[heapCPU[i]] = FREE_ENTRY; - } - if (pointersFreeHash.size() != heapCounterCPU) { - throw std::runtime_error("ERROR: duplicate free pointers in heap array"); - } - - - unsigned int numOccupied = 0; - unsigned int numMinusOne = 0; - //unsigned int listOverallFound = 0; - - std::list<myint3Voxel> l; - //std::vector<myint3Voxel> v; - - for (unsigned int i = 0; i < m_hashParams.m_hashNumBuckets; i++) { - if (hashCPU[i].ptr == -1) { - numMinusOne++; - } - - if (hashCPU[i].ptr != -2) { - numOccupied++; // != FREE_ENTRY - myint3Voxel a; - a.x = hashCPU[i].pos.x; - a.y = hashCPU[i].pos.y; - a.z = hashCPU[i].pos.z; - 
l.push_back(a); - //v.push_back(a); - - unsigned int linearBlockSize = m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize*m_hashParams.m_SDFBlockSize; - if (pointersFreeHash.find(hashCPU[i].ptr / linearBlockSize) != pointersFreeHash.end()) { - throw std::runtime_error("ERROR: ptr is on free heap, but also marked as an allocated entry"); - } - pointersFreeVec[hashCPU[i].ptr / linearBlockSize] = LOCK_ENTRY; - } - } - - unsigned int numHeapFree = 0; - unsigned int numHeapOccupied = 0; - for (unsigned int i = 0; i < m_hashParams.m_numSDFBlocks; i++) { - if (pointersFreeVec[i] == FREE_ENTRY) numHeapFree++; - else if (pointersFreeVec[i] == LOCK_ENTRY) numHeapOccupied++; - else { - throw std::runtime_error("memory leak detected: neither free nor allocated"); - } - } - if (numHeapFree + numHeapOccupied == m_hashParams.m_numSDFBlocks) std::cout << "HEAP OK!" << std::endl; - else throw std::runtime_error("HEAP CORRUPTED"); - - l.sort(); - size_t sizeBefore = l.size(); - l.unique(); - size_t sizeAfter = l.size(); - - - std::cout << "diff: " << sizeBefore - sizeAfter << std::endl; - std::cout << "minOne: " << numMinusOne << std::endl; - std::cout << "numOccupied: " << numOccupied << "\t numFree: " << getHeapFreeCount() << std::endl; - std::cout << "numOccupied + free: " << numOccupied + getHeapFreeCount() << std::endl; - std::cout << "numInFrustum: " << m_hashParams.m_numOccupiedBlocks << std::endl; - - SAFE_DELETE_ARRAY(heapCPU); - SAFE_DELETE_ARRAY(hashCPU); - - //getchar(); -} - HashParams SceneRep::_parametersFromConfig() { //auto &cfg = ftl::config::resolve(config); HashParams params; @@ -404,15 +318,15 @@ void SceneRep::_destroy() { m_hashData.free(); } -void SceneRep::_alloc(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t stream) { +void SceneRep::_alloc(int camid, cudaStream_t stream) { // NOTE (nick): We might want this later... 
- if (false) { + /*if (false) { // TODO(Nick) Make this work without memcpy to host first - //allocate until all blocks are allocated - unsigned int prevFree = getHeapFreeCount(); + allocate until all blocks are allocated + unsigned int prevFree = 0; //getHeapFreeCount(); while (1) { resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream); - allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, stream); + allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream); unsigned int currFree = getHeapFreeCount(); @@ -424,16 +338,16 @@ void SceneRep::_alloc(const DepthCameraData& depthCameraData, const DepthCameraP } } } - else { + else {*/ //this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame) resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream); - allocCUDA(m_hashData, m_hashParams, depthCameraData, depthCameraParams, stream); - } + allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream); + //} } void SceneRep::_compactifyVisible(const DepthCameraParams &camera) { //const DepthCameraData& depthCameraData) { - ftl::cuda::compactifyVisible(m_hashData, m_hashParams, camera, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance + ftl::cuda::compactifyOccupied(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance //m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU } @@ -443,13 +357,17 @@ void SceneRep::_compactifyAllocated() { //m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU } -void SceneRep::_integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) { +/*void SceneRep::_integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) { if (!reg_mode_) 
ftl::cuda::integrateDepthMap(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_); else ftl::cuda::integrateRegistration(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_); +}*/ + +void SceneRep::_integrateDepthMaps() { + ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_); } void SceneRep::_garbageCollect() { - ftl::cuda::garbageCollectIdentify(m_hashData, m_hashParams, integ_stream_); + //ftl::cuda::garbageCollectIdentify(m_hashData, m_hashParams, integ_stream_); resetHashBucketMutexCUDA(m_hashData, m_hashParams, integ_stream_); //needed if linked lists are enabled -> for memeory deletion ftl::cuda::garbageCollectFree(m_hashData, m_hashParams, integ_stream_); } diff --git a/components/rgbd-sources/src/realsense_source.cpp b/components/rgbd-sources/src/realsense_source.cpp index a4f34c2360c3d3cf9c528277f1cc5f21278af905..64b0d328274a61b8baeca4c2706b38de8f521fe4 100644 --- a/components/rgbd-sources/src/realsense_source.cpp +++ b/components/rgbd-sources/src/realsense_source.cpp @@ -1,5 +1,7 @@ #include "realsense_source.hpp" #include <loguru.hpp> +#include <ftl/threads.hpp> +#include <ftl/rgbd/source.hpp> using ftl::rgbd::detail::RealsenseSource; using std::string; @@ -41,7 +43,7 @@ RealsenseSource::~RealsenseSource() { bool RealsenseSource::grab(int n, int b) { rs2::frameset frames = pipe_.wait_for_frames(); //rs2::align align(RS2_STREAM_DEPTH); - frames = align_to_depth_.process(frames); //align_to_depth_.process(frames); + //frames = align_to_depth_.process(frames); //align_to_depth_.process(frames); rs2::depth_frame depth = frames.get_depth_frame(); float w = depth.get_width();