diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f0b1e56ab31eb91e30f49c95463df46784496ce..75783eb36247e22217894636720e256e69c387b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,8 +96,8 @@ check_language(CUDA) if (CUDA_TOOLKIT_ROOT_DIR) enable_language(CUDA) set(CMAKE_CUDA_FLAGS "") -set(CMAKE_CUDA_FLAGS_DEBUG "-g -DDEBUG -D_DEBUG") -set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "--gpu-architecture=compute_61 -g -DDEBUG -D_DEBUG") +set(CMAKE_CUDA_FLAGS_RELEASE "--gpu-architecture=compute_61") set(HAVE_CUDA TRUE) include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) diff --git a/applications/reconstruct/include/ftl/depth_camera.hpp b/applications/reconstruct/include/ftl/depth_camera.hpp index 7df38bb006091930fc03204d58cd71999e6ed95e..a62f6f9d11287a7b665ed752eae8162827b71907 100644 --- a/applications/reconstruct/include/ftl/depth_camera.hpp +++ b/applications/reconstruct/include/ftl/depth_camera.hpp @@ -23,6 +23,7 @@ namespace voxhash { struct __align__(16) DepthCameraCUDA { cudaTextureObject_t depth; + cudaTextureObject_t depth2; cudaTextureObject_t colour; cudaTextureObject_t normal; DepthCameraParams params; @@ -47,10 +48,12 @@ struct DepthCamera { __host__ void _computeNormals(cudaStream_t stream); cv::cuda::GpuMat *depth_mat_; + cv::cuda::GpuMat *depth2_mat_; cv::cuda::GpuMat *colour_mat_; cv::cuda::GpuMat *point_mat_; cv::cuda::GpuMat *normal_mat_; ftl::cuda::TextureObject<float> *depth_tex_; + ftl::cuda::TextureObject<float> *depth2_tex_; ftl::cuda::TextureObject<uchar4> *colour_tex_; ftl::cuda::TextureObject<float4> *normal_tex_; diff --git a/applications/reconstruct/src/depth_camera.cpp b/applications/reconstruct/src/depth_camera.cpp index 93d6ab2b44592fbfc859428443922cfcaa304454..b2dd2671af057376f8de5441073a01c113ed4b69 100644 --- a/applications/reconstruct/src/depth_camera.cpp +++ b/applications/reconstruct/src/depth_camera.cpp @@ -7,20 +7,25 @@ using ftl::voxhash::DepthCameraCUDA; DepthCamera::DepthCamera() { depth_mat_ = nullptr; + depth2_mat_ = nullptr; colour_mat_ = nullptr; point_mat_ = nullptr; normal_mat_ = nullptr; depth_tex_ = nullptr; + depth2_tex_ = nullptr; colour_tex_ = nullptr; normal_tex_ = nullptr; } void DepthCamera::alloc(const DepthCameraParams& params, bool withNormals) { //! todo resizing??? depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1); + depth2_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1); colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4); depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_); + depth2_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth2_mat_); colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_); data.depth = depth_tex_->cudaTexture(); + data.depth2 = depth2_tex_->cudaTexture(); data.colour = colour_tex_->cudaTexture(); data.params = params; @@ -36,6 +41,7 @@ void DepthCamera::alloc(const DepthCameraParams& params, bool withNormals) { //! void DepthCamera::free() { if (depth_mat_) delete depth_mat_; + if (depth2_mat_) delete depth2_mat_; if (colour_mat_) delete colour_mat_; if (point_mat_) delete point_mat_; if (normal_mat_) delete normal_mat_; @@ -45,7 +51,7 @@ void DepthCamera::free() { } void DepthCamera::updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream) { - depth_mat_->upload(depth, stream); + depth2_mat_->upload(depth, stream); colour_mat_->upload(rgb, stream); if (normal_mat_) { _computeNormals(cv::cuda::StreamAccessor::getStream(stream)); diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu index 9234504e2d78b1cbf75563693b119b7975b2481e..9849ec09845e9007a134b745eb81d0cfbceba65f 100644 --- a/applications/reconstruct/src/depth_camera.cu +++ b/applications/reconstruct/src/depth_camera.cu @@ -7,6 +7,83 @@ #define MINF __int_as_float(0xff800000) using ftl::voxhash::DepthCameraCUDA; +using ftl::voxhash::HashData; +using ftl::voxhash::HashParams; + +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; + +/// ===== MLS Smooth + +// TODO:(Nick) Put this in a common location (used in integrators.cu) +extern __device__ float spatialWeighting(float r); + +#define WINDOW_RADIUS 5 + +__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float> output, HashData hashData, HashParams hashParams, int numcams, int cam) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = output.width(); + const int height = output.height(); + + const DepthCameraCUDA &mainCamera = c_cameras[cam]; + + if (x < width && y < height) { + float depth = tex2D<float>(mainCamera.depth2, x, y); + + float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + float weights = 0.0f; + + if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) { + const float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth); + + for (uint cam=0; cam<numcams; ++cam) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + const uint height = camera.params.m_imageHeight; + const uint width = camera.params.m_imageWidth; + + const float3 pf = camera.poseInverse * mPos; + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + + #pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float depth = tex2D<float>(camera.depth2, screenPos.x+u, screenPos.y+v); + + //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); + const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + const float weight = spatialWeighting(length(mPos - worldPos)); + + wpos += weight*worldPos; + //wnorm += weight*make_float3(normal); + weights += weight; + } + } + } + } + + wpos /= weights; + } + + output(x,y) = (weights >= 20.0f) ? (mainCamera.poseInverse * wpos).z : 1000.0f; + } +} + +void ftl::cuda::mls_smooth(TextureObject<float> &output, const HashData &hashData, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) { + const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashData, hashParams, numcams, cam); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + + +/// ===== Point cloud from depth ===== __global__ void point_cloud_kernel(float3* output, DepthCameraCUDA depthCameraData) { @@ -38,6 +115,7 @@ void ftl::cuda::point_cloud(float3* output, const DepthCameraCUDA &depthCameraDa /// ===== NORMALS ===== + __global__ void compute_normals_kernel(const float3 *input, ftl::cuda::TextureObject<float4> output) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; @@ -61,12 +139,12 @@ __global__ void compute_normals_kernel(const float3 *input, ftl::cuda::TextureOb if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF) { const float3 n = cross(PC-MC, CP-CM); - const float l = length(n); + //const float l = length(n); - if(l > 0.0f) - { - output(x,y) = make_float4(n/-l, 1.0f); - } + //if(l > 0.0f) + //{ + output(x,y) = make_float4(n, 1.0f); //make_float4(n/-l, 1.0f); + //} } } } diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp index 05a83a473a7a36ccf6a16fc8c699d63046eea3b1..54bc453741a154ac3bf86409ed64cc450bf1eb06 100644 --- a/applications/reconstruct/src/depth_camera_cuda.hpp +++ b/applications/reconstruct/src/depth_camera_cuda.hpp @@ -7,6 +7,8 @@ namespace ftl { namespace cuda { +void mls_smooth(TextureObject<float> &output, const ftl::voxhash::HashData &hashData, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream); + void point_cloud(float3* output, const ftl::voxhash::DepthCameraCUDA &depthCameraData, cudaStream_t stream); void compute_normals(const float3 *points, ftl::cuda::TextureObject<float4> *normals, cudaStream_t stream); diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu index 1c243a775dbf6e85b634ed8cc9d352229831d075..0b7f7936528c225099ac6fb52b5db5e3a570051e 100644 --- a/applications/reconstruct/src/integrators.cu +++ b/applications/reconstruct/src/integrators.cu @@ -159,7 +159,8 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam // Calculate voxel sign values across a warp int warpNum = i / WARP_SIZE; - uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + //uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + uint ballot_result = __ballot_sync(0xFFFFFFFF, (fabs(oldVoxel.sdf) <= hashParams.m_virtualVoxelSize && oldVoxel.weight > 0) ? 1 : 0); // Aggregate each warp result into voxel mask if (i % WARP_SIZE == 0) { @@ -186,10 +187,10 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam } #define WINDOW_RADIUS 5 +#define PATCH_SIZE 32 __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int numcams) { __shared__ uint voxels[16]; - //__shared__ uint valid[16]; const uint i = threadIdx.x; //inside of an SDF block const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); @@ -202,20 +203,20 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int //TODO check if we should load this in shared memory //HashEntryHead entry = hashData.d_hashCompactified[bi]->head; - int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ)); + const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ)); //uint idx = entry.offset + i; - int3 pi = pi_base + po; - float3 pfb = hashData.virtualVoxelPosToWorld(pi); - int count = 0; + const int3 pi = pi_base + po; + const float3 pfb = hashData.virtualVoxelPosToWorld(pi); + //int count = 0; //float camdepths[MAX_CAMERAS]; - Voxel oldVoxel; // = hashData.d_SDFBlocks[idx]; - hashData.deleteVoxel(oldVoxel); + //Voxel oldVoxel; // = hashData.d_SDFBlocks[idx]; + //hashData.deleteVoxel(oldVoxel); - float3 wpos = make_float3(0.0f); - float3 wnorm = make_float3(0.0f); - float weights = 0.0f; + float3 awpos = make_float3(0.0f); + float3 awnorm = make_float3(0.0f); + float aweights = 0.0f; // Preload depth values // 1. Find min and max screen positions @@ -234,26 +235,18 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int const float3 pf = camera.poseInverse * pfb; const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + float weights = 0.0f; + #pragma unroll for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { if (screenPos.x+u < width && screenPos.y+v < height) { //on screen - float depth; - - // Compute >= 7 only - #if __CUDA_ARCH__ >= 700 - uint posPack = ((screenPos.x+u) << 16) | (screenPos.y+v); - uint mask = __match_any_sync(__activemask(), posPack); - int lead = __ffs(mask)-1; - if (lead == lane) depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); - depth = __shfl_sync(mask, depth, lead); - #else - depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); - #endif + float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); - const float weight = spatialWeighting(length(pfb - worldPos)); wpos += weight*worldPos; @@ -262,12 +255,15 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int } } } + + awpos += wpos; + aweights += weights; } - wpos /= weights; + awpos /= aweights; //wnorm /= weights; - float sdf = length(pfb - wpos); + float sdf = length(pfb - awpos); //float sdf = wnorm.x * (pfb.x - wpos.x) + wnorm.y * (pfb.y - wpos.y) + wnorm.z * (pfb.z - wpos.z); //printf("WEIGHTS: %f\n", weights); @@ -276,9 +272,9 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int // Calculate voxel sign values across a warp int warpNum = i / WARP_SIZE; - //uint laneMask = 1 << (i % WARP_SIZE); - uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) < hashParams.m_virtualVoxelSize && weights > hashParams.m_confidenceThresh) ? 1 : 0); - //uint valid_ballot = __ballot_sync(0xFFFFFFFF, (weights >= 1.0f) ? 1 : 0); + + uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) < hashParams.m_virtualVoxelSize && aweights > hashParams.m_confidenceThresh) ? 1 : 0); + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (sdf < 0.0f ) ? 1 : 0); // Aggregate each warp result into voxel mask if (i % WARP_SIZE == 0) { @@ -314,7 +310,7 @@ const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); //if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) - integrateMLSKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); + integrateDepthMapsKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); //} //cudaSafeCall( cudaGetLastError() ); diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/splat_render.cu index 486ee7f8512aad1929be64b6d572930b514b1efa..edd29cadf33228c34ffd63e35b833846b3104d90 100644 --- a/applications/reconstruct/src/splat_render.cu +++ b/applications/reconstruct/src/splat_render.cu @@ -76,6 +76,61 @@ __device__ inline bool getVoxel(uint *voxels, int ix) { return voxels[ix/32] & (0x1 << (ix % 32)); } +__global__ void occupied_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) { + __shared__ uint voxels[16]; + __shared__ ftl::voxhash::HashEntryHead block; + + // Stride over all allocated blocks + for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { + __syncthreads(); + + const uint i = threadIdx.x; //inside of an SDF block + + if (i == 0) block = hashData.d_hashCompactified[bi]->head; + if (i < 16) { + voxels[i] = hashData.d_hashCompactified[bi]->voxels[i]; + //valid[i] = hashData.d_hashCompactified[bi]->validity[i]; + } + + // Make sure all hash entries are cached + __syncthreads(); + + const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ)); + const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i)); + const int3 pi = pi_base + vp; + const float3 worldPos = hashData.virtualVoxelPosToWorld(pi); + + const bool v = getVoxel(voxels, i); + + uchar4 color = make_uchar4(255,0,0,255); + bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2); + + + // Only for surface voxels, work out screen coordinates + if (!is_surface) continue; + + // TODO: For each original camera, render a new depth map + + const float3 camPos = params.m_viewMatrix * worldPos; + const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); + const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f) + + //printf("Worldpos: %f,%f,%f\n", camPos.x, camPos.y, camPos.z); + + if (camPos.z < params.camera.m_sensorDepthWorldMin) continue; + + const unsigned int x = screenPos.x; + const unsigned int y = screenPos.y; + const int idepth = static_cast<int>(camPos.z * 1000.0f); + + // See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds + if (x < depth.width() && y < depth.height()) { + atomicMin(&depth(x,y), idepth); + } + + } // Stride +} + __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) { // TODO:(Nick) Reduce bank conflicts by aligning these __shared__ uint voxels[16]; @@ -111,7 +166,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture //__syncthreads(); //if (voxels[j].weight == 0) continue; - //if (vp.x == 7 || vp.y == 7 || vp.z == 7) continue; + if (vp.x == 7 || vp.y == 7 || vp.z == 7) continue; int edgeX = (vp.x == 0 ) ? 1 : 0; @@ -122,7 +177,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2); //if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255); - //if (!v) continue; // !getVoxel(valid, i) + if (v) continue; // !getVoxel(valid, i) //if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0); @@ -130,7 +185,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture // it is fine to check for any sign change? -/*#pragma unroll +#pragma unroll for (int u=0; u<=1; u++) { for (int v=0; v<=1; v++) { for (int w=0; w<=1; w++) { @@ -146,7 +201,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture } } } - }*/ + } // Only for surface voxels, work out screen coordinates if (!is_surface) continue; @@ -194,7 +249,7 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData, const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); - isosurface_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params); + occupied_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params); cudaSafeCall( cudaGetLastError() ); diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp index 20b087370b909f76b11b544613b42b5cbc5489e5..6e12bd9cbcae1c4557347937ec5d582ee17c8fc7 100644 --- a/applications/reconstruct/src/voxel_scene.cpp +++ b/applications/reconstruct/src/voxel_scene.cpp @@ -2,6 +2,7 @@ #include "compactors.hpp" #include "garbage.hpp" #include "integrators.hpp" +#include "depth_camera_cuda.hpp" #include <opencv2/core/cuda_stream_accessor.hpp> @@ -354,6 +355,9 @@ void SceneRep::_compactifyAllocated() { }*/ void SceneRep::_integrateDepthMaps() { + for (size_t i=0; i<cameras_.size(); ++i) { + ftl::cuda::mls_smooth(*(cameras_[i].gpu.depth_tex_), m_hashData, m_hashParams, cameras_.size(), i, integ_stream_); + } ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_); }