From 157f290263882e5df9c95688d2d992a1f981cfe2 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Mon, 22 Jul 2019 08:09:33 +0300
Subject: [PATCH] Use op warp intrinsics if available

---
 applications/reconstruct/src/integrators.cu  | 33 ++++++++++++++++----
 applications/reconstruct/src/voxel_scene.cpp |  2 ++
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu
index 49b7ffcea..1c243a775 100644
--- a/applications/reconstruct/src/integrators.cu
+++ b/applications/reconstruct/src/integrators.cu
@@ -193,6 +193,8 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int
 
 	const uint i = threadIdx.x;	//inside of an SDF block
 	const int3 po = make_int3(hashData.delinearizeVoxelIndex(i));
+	const int warpNum = i / WARP_SIZE;
+	const int lane = i % WARP_SIZE;
 
 	// Stride over all allocated blocks
 	for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
@@ -215,25 +217,44 @@ __global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int
 	float3 wnorm = make_float3(0.0f);
 	float weights = 0.0f;
 
+	// Preload depth values
+	// 1. Find min and max screen positions
+	// 2. Subtract/Add WINDOW_RADIUS to min/max
+	// ... check that the buffer is not too small to cover this
+	// ... if buffer not big enough then don't buffer at all.
+	// 3. Populate shared mem depth map buffer using all threads
+	// 4. Adjust window lookups to use shared mem buffer
+
 	//uint cam=0;
 	for (uint cam=0; cam<numcams; ++cam) {
 		const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
 		const uint height = camera.params.m_imageHeight;
 		const uint width = camera.params.m_imageWidth;
 	
-		float3 pf = camera.poseInverse * pfb;
-		uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+		const float3 pf = camera.poseInverse * pfb;
+		const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
 
 		#pragma unroll
 		for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
 			for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-				// For this voxel in hash, get its screen position and check it is on screen
 				if (screenPos.x+u < width && screenPos.y+v < height) {	//on screen
-					float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+					float depth;
+
+					// Compute >= 7 only
+					#if __CUDA_ARCH__ >= 700
+					uint posPack = ((screenPos.x+u) << 16) | (screenPos.y+v);
+					uint mask = __match_any_sync(__activemask(), posPack);
+					int lead = __ffs(mask)-1;
+					if (lead == lane) depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+					depth = __shfl_sync(mask, depth, lead);
+					#else
+					depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+					#endif
+
 					//float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
-					float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+					const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
 
-					float weight = spatialWeighting(length(pfb - worldPos));
+					const float weight = spatialWeighting(length(pfb - worldPos));
 
 					wpos += weight*worldPos;
 					//wnorm += weight*make_float3(normal);
diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp
index a1cdba97e..20b087370 100644
--- a/applications/reconstruct/src/voxel_scene.cpp
+++ b/applications/reconstruct/src/voxel_scene.cpp
@@ -88,6 +88,8 @@ bool SceneRep::_initCUDA() {
 	// TODO:(Nick) Check memory is sufficient
 	// TODO:(Nick) Find out what our compute capability should be.
 
+	LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor;
+
 	return true;
 }
 
-- 
GitLab