diff --git a/applications/reconstruct/include/ftl/voxel_hash.hpp b/applications/reconstruct/include/ftl/voxel_hash.hpp
index c0a228834337e60f5807313735eedee25eb51dcb..40b981740caef12869ad977f42293276879578cb 100644
--- a/applications/reconstruct/include/ftl/voxel_hash.hpp
+++ b/applications/reconstruct/include/ftl/voxel_hash.hpp
@@ -73,6 +73,7 @@ struct __align__(16) HashEntry {
 
 	HashEntryHead head;
 	uint voxels[16];  // 512 bits, 1 bit per voxel
+	uint validity[16];  // Is the voxel valid, 512 bit
 
 	/*__device__ void operator=(const struct HashEntry& e) {
 		((long long*)this)[0] = ((const long long*)&e)[0];
diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu
index 984fd29dce023bd1491c2637b73b98accaf303d9..9abb28c8431893f9d33ce5da54990002b8b81c13 100644
--- a/applications/reconstruct/src/integrators.cu
+++ b/applications/reconstruct/src/integrators.cu
@@ -47,10 +47,10 @@ __device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) {
  */
 __device__ float spatialWeighting(float r) {
 	const float h = c_hashParams.m_spatialSmoothing;
+	if (r >= h) return 0.0f;
 	float rh = r / h;
 	rh = 1.0f - rh*rh;
-	rh = rh*rh*rh*rh;
-	return (rh < h) ? rh : 0.0f;
+	return rh*rh*rh*rh;
 }
 
 
@@ -103,6 +103,10 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam
 		//printf("screen pos %d\n", color.x);
 		//return;
 
+		// TODO:(Nick) Accumulate weighted positions
+		// TODO:(Nick) Accumulate weighted normals
+		// TODO:(Nick) Accumulate weights
+
 		// Depth is within accepted max distance from camera
 		if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check)
 			//camdepths[count] = depth;
@@ -181,6 +185,102 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam
 	}
 }
 
+#define WINDOW_RADIUS 7
+
+__global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int numcams) {
+	__shared__ uint voxels[16];
+	__shared__ uint valid[16];
+
+	const uint i = threadIdx.x;	//inside of an SDF block
+	const int3 po = make_int3(hashData.delinearizeVoxelIndex(i));
+
+	// Stride over all allocated blocks
+	for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
+
+	//TODO check if we should load this in shared memory
+	//HashEntryHead entry = hashData.d_hashCompactified[bi]->head;
+
+	int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ));
+
+	//uint idx = entry.offset + i;
+	int3 pi = pi_base + po;
+	float3 pfb = hashData.virtualVoxelPosToWorld(pi);
+	int count = 0;
+	//float camdepths[MAX_CAMERAS];
+
+	Voxel oldVoxel; // = hashData.d_SDFBlocks[idx];
+	hashData.deleteVoxel(oldVoxel);
+
+	float3 wpos = make_float3(0.0f);
+	float3 wnorm = make_float3(0.0f);
+	float weights = 0.0f;
+
+	for (uint cam=0; cam<numcams; ++cam) {
+		const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+		const uint height = camera.params.m_imageHeight;
+		const uint width = camera.params.m_imageWidth;
+
+		float3 pf = camera.poseInverse * pfb;
+		uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+
+		for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+			for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+				// For this voxel in hash, get its screen position and check it is on screen
+				if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
+					float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+					float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
+					float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+
+					float weight = spatialWeighting(length(pfb - worldPos));
+
+					wpos += weight*worldPos;
+					wnorm += weight*make_float3(normal);
+					weights += weight;
+				}
+			}
+		}
+	}
+
+	wpos /= weights;
+	wnorm /= weights;
+
+	float sdf = length(pfb - wpos); //wnorm.x * (pfb.x - wpos.x) + wnorm.y * (pfb.y - wpos.y) + wnorm.z * (pfb.z - wpos.z);
+
+	//printf("WEIGHTS: %f\n", weights);
+
+	//if (weights < 0.00001f) sdf = 0.0f;
+
+	// Calculate voxel sign values across a warp
+	int warpNum = i / WARP_SIZE;
+	uint solid_ballot = __ballot_sync(0xFFFFFFFF, (sdf >= 0.0f) ? 0 : 1);
+	uint valid_ballot = __ballot_sync(0xFFFFFFFF, (weights >= 2.0f) ? 1 : 0);
+
+	// Aggregate each warp result into voxel mask
+	if (i % WARP_SIZE == 0) {
+		voxels[warpNum] = solid_ballot;
+		valid[warpNum] = valid_ballot;
+	}
+
+	__syncthreads();
+
+	// Work out if block is occupied or not and save voxel masks
+	// TODO:(Nick) Is it faster to do this in a separate garbage kernel?
+	if (i < 16) {
+		const uint v = voxels[i];
+		hashData.d_hashCompactified[bi]->voxels[i] = v;
+		hashData.d_hashCompactified[bi]->validity[i] = valid[i];
+		const uint mask = 0x0000FFFF;
+		uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF);
+		uint b2 = __ballot_sync(mask, v == 0);
+		if (i == 0) {
+			if (b1 != mask && b2 != mask) hashData.d_hashCompactified[bi]->head.flags |= ftl::voxhash::kFlagSurface;
+			else hashData.d_hashCompactified[bi]->head.flags &= ~ftl::voxhash::kFlagSurface;
+		}
+	}
+
+	}
+}
+
 void ftl::cuda::integrateDepthMaps(HashData& hashData, const HashParams& hashParams, int numcams, cudaStream_t stream) {
@@ -189,7 +289,7 @@
 	const dim3 gridSize(NUM_CUDA_BLOCKS, 1);
 	const dim3 blockSize(threadsPerBlock, 1);
 
 	//if (hashParams.m_numOccupiedBlocks > 0) {	//this guard is important if there is no depth in the current frame (i.e., no blocks were allocated)
-		integrateDepthMapsKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams);
+		integrateMLSKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams);
 	//}
 
 	//cudaSafeCall( cudaGetLastError() );
diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/splat_render.cu
index 0e3598f129bf40c72294ab19501b1fd3c95bb1e9..58e4bbf2a4b1fbdb87877d4deee70665661f9813 100644
--- a/applications/reconstruct/src/splat_render.cu
+++ b/applications/reconstruct/src/splat_render.cu
@@ -79,6 +79,7 @@ __device__ inline bool getVoxel(uint *voxels, int ix) {
 __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) {
 	// TODO:(Nick) Reduce bank conflicts by aligning these
 	__shared__ uint voxels[16];
+	__shared__ uint valid[16];
 	__shared__ ftl::voxhash::HashEntryHead block;
 
 	// Stride over all allocated blocks
@@ -88,7 +89,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 	const uint i = threadIdx.x;	//inside of an SDF block
 
 	if (i == 0) block = hashData.d_hashCompactified[bi]->head;
-	if (i < 16) voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+	if (i < 16) {
+		voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+		valid[i] = hashData.d_hashCompactified[bi]->validity[i];
+	}
 
 	// Make sure all hash entries are cached
 	__syncthreads();
@@ -118,7 +122,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 		bool is_surface = false; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
 		//if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255);
 
-		if (!is_surface && v) continue;
+		if (!getVoxel(valid, i)) continue;
 
 		//if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0);
@@ -136,7 +140,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 			//if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue;
 
 			const bool vox = getVoxel(voxels, hashData.linearizeVoxelPos(uvi));
-			if (vox) {
+			if (getVoxel(valid, hashData.linearizeVoxelPos(uvi))) {
 				is_surface = true;
 				// Should break but is slower?
 			}