Skip to content
Snippets Groups Projects
Commit 157f2902 authored by Nicolas Pope's avatar Nicolas Pope
Browse files

Use op warp intrinsics if available

parent e52095f7
No related branches found
No related tags found
1 merge request!71Implements #130 for optional MLS merging
Pipeline #12347 passed
...@@ -193,6 +193,8 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int ...@@ -193,6 +193,8 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int
const uint i = threadIdx.x; //inside of an SDF block const uint i = threadIdx.x; //inside of an SDF block
const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); const int3 po = make_int3(hashData.delinearizeVoxelIndex(i));
const int warpNum = i / WARP_SIZE;
const int lane = i % WARP_SIZE;
// Stride over all allocated blocks // Stride over all allocated blocks
for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
...@@ -215,25 +217,44 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int ...@@ -215,25 +217,44 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int
float3 wnorm = make_float3(0.0f); float3 wnorm = make_float3(0.0f);
float weights = 0.0f; float weights = 0.0f;
// Preload depth values
// 1. Find min and max screen positions
// 2. Subtract/Add WINDOW_RADIUS to min/max
// ... check that the buffer is not too small to cover this
// ... if buffer not big enough then don't buffer at all.
// 3. Populate shared mem depth map buffer using all threads
// 4. Adjust window lookups to use shared mem buffer
//uint cam=0; //uint cam=0;
for (uint cam=0; cam<numcams; ++cam) { for (uint cam=0; cam<numcams; ++cam) {
const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
const uint height = camera.params.m_imageHeight; const uint height = camera.params.m_imageHeight;
const uint width = camera.params.m_imageWidth; const uint width = camera.params.m_imageWidth;
float3 pf = camera.poseInverse * pfb; const float3 pf = camera.poseInverse * pfb;
uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
#pragma unroll #pragma unroll
for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
// For this voxel in hash, get its screen position and check it is on screen
if (screenPos.x+u < width && screenPos.y+v < height) { //on screen if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); float depth;
// Compute >= 7 only
#if __CUDA_ARCH__ >= 700
uint posPack = ((screenPos.x+u) << 16) | (screenPos.y+v);
uint mask = __match_any_sync(__activemask(), posPack);
int lead = __ffs(mask)-1;
if (lead == lane) depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
depth = __shfl_sync(mask, depth, lead);
#else
depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
#endif
//float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
float weight = spatialWeighting(length(pfb - worldPos)); const float weight = spatialWeighting(length(pfb - worldPos));
wpos += weight*worldPos; wpos += weight*worldPos;
//wnorm += weight*make_float3(normal); //wnorm += weight*make_float3(normal);
......
...@@ -88,6 +88,8 @@ bool SceneRep::_initCUDA() { ...@@ -88,6 +88,8 @@ bool SceneRep::_initCUDA() {
// TODO:(Nick) Check memory is sufficient // TODO:(Nick) Check memory is sufficient
// TODO:(Nick) Find out what our compute capability should be. // TODO:(Nick) Find out what our compute capability should be.
LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor;
return true; return true;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment