From 157f290263882e5df9c95688d2d992a1f981cfe2 Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nwpope@utu.fi> Date: Mon, 22 Jul 2019 08:09:33 +0300 Subject: [PATCH] Use op warp intrinsics if available --- applications/reconstruct/src/integrators.cu | 33 ++++++++++++++++---- applications/reconstruct/src/voxel_scene.cpp | 2 ++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu index 49b7ffcea..1c243a775 100644 --- a/applications/reconstruct/src/integrators.cu +++ b/applications/reconstruct/src/integrators.cu @@ -193,6 +193,8 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int const uint i = threadIdx.x; //inside of an SDF block const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); + const int warpNum = i / WARP_SIZE; + const int lane = i % WARP_SIZE; // Stride over all allocated blocks for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { @@ -215,25 +217,44 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int float3 wnorm = make_float3(0.0f); float weights = 0.0f; + // Preload depth values + // 1. Find min and max screen positions + // 2. Subtract/Add WINDOW_RADIUS to min/max + // ... check that the buffer is not too small to cover this + // ... if buffer not big enough then don't buffer at all. + // 3. Populate shared mem depth map buffer using all threads + // 4. Adjust window lookups to use shared mem buffer + //uint cam=0; for (uint cam=0; cam<numcams; ++cam) { const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; const uint height = camera.params.m_imageHeight; const uint width = camera.params.m_imageWidth; - float3 pf = camera.poseInverse * pfb; - uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + const float3 pf = camera.poseInverse * pfb; + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); #pragma unroll for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { - // For this voxel in hash, get its screen position and check it is on screen if (screenPos.x+u < width && screenPos.y+v < height) { //on screen - float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + float depth; + + // Compute >= 7 only + #if __CUDA_ARCH__ >= 700 + uint posPack = ((screenPos.x+u) << 16) | (screenPos.y+v); + uint mask = __match_any_sync(__activemask(), posPack); + int lead = __ffs(mask)-1; + if (lead == lane) depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + depth = __shfl_sync(mask, depth, lead); + #else + depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + #endif + //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); - float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); - float weight = spatialWeighting(length(pfb - worldPos)); + const float weight = spatialWeighting(length(pfb - worldPos)); wpos += weight*worldPos; //wnorm += weight*make_float3(normal); diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp index a1cdba97e..20b087370 100644 --- a/applications/reconstruct/src/voxel_scene.cpp +++ b/applications/reconstruct/src/voxel_scene.cpp @@ -88,6 +88,8 @@ bool SceneRep::_initCUDA() { // TODO:(Nick) Check memory is sufficient // TODO:(Nick) Find out what our compute capability should be. + LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor; + return true; } -- GitLab