Nicolas Pope
--- a/applications/reconstruct/src/dibr.cu

+ 57

− 44
+++ b/applications/reconstruct/src/dibr.cu

+ 57

− 44
 @@ -13,7 +13,7 @@
 #define WARP_SIZE 32
 #define DEPTH_THRESHOLD 0.05f
 #define UPSAMPLE_MAX 60
-#define MAX_ITERATIONS 10
+#define MAX_ITERATIONS 32
 #define SPATIAL_SMOOTHING 0.005f

 using ftl::cuda::TextureObject;
 @@ -377,9 +377,19 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp
 	}
 }

-#define NEIGHBOR_RADIUS_2 2
+#define NEIGHBOR_RADIUS_2 3
 #define NEIGHBOR_WINDOW ((NEIGHBOR_RADIUS_2*2+1)*(NEIGHBOR_RADIUS_2*2+1))
-#define MAX_NEIGHBORS_2 10
+#define MAX_NEIGHBORS_2 32
+
+#define FULL_MASK 0xffffffff
+
+__device__ inline float warpMaxEnergy(float energy) {
+	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
+		const float other = __shfl_xor_sync(FULL_MASK, energy, i, WARP_SIZE);
+		energy = (energy > other) ? energy : other;
+	}
+	return energy;
+}


 /*
 @@ -394,14 +404,15 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp
 	__shared__ int minimum[2*T_PER_BLOCK];
 	__shared__ int maximum[2*T_PER_BLOCK];
 	__shared__ unsigned int nidx[2*T_PER_BLOCK];
+	__shared__ float sampleenergy[2*T_PER_BLOCK][WARP_SIZE];

 	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
 	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
 	const int y = blockIdx.y*blockDim.y + threadIdx.y;

-	const float3 camPos = params.camera.kinectDepthToSkeleton(x,y, float(point_in.tex2D(x,y)) / 1000.0f);
+	//const float3 camPos = params.camera.kinectDepthToSkeleton(x,y, float(point_in.tex2D(x,y)) / 1000.0f);

-    const float r = 1.0f; //(camera.poseInverse * worldPos).z / camera.params.fx;
+    //const float r = 1.0f; //(camera.poseInverse * worldPos).z / camera.params.fx;

 	// Get virtual camera ray for splat centre and backface cull if possible
 	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
 @@ -410,11 +421,11 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp

    // Find the virtual screen position of current point
 	//const float3 camPos = params.m_viewMatrix * worldPos;
-	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
-	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
-	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+	//if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	//if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	//const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);

-	const int upsample = 16; //min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
+	//const int upsample = 16; //min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));

 	// Not on screen so stop now...
 	//if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
 @@ -436,10 +447,10 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp
 		const int u = (i % (2*NEIGHBOR_RADIUS_2+1)) - NEIGHBOR_RADIUS_2;
 		const int v = (i / (2*NEIGHBOR_RADIUS_2+1)) - NEIGHBOR_RADIUS_2;
 		const float3 point = params.camera.kinectDepthToSkeleton(x+u, y+v, float(point_in.tex2D(x+u, y+v)) / 1000.0f);
+		const float3 camPos = params.camera.kinectDepthToSkeleton(x, y, point.z);

 		// If it is close enough...
-		// TODO: We don't have camPos so distance if this pixel takes on same depth
-		if (length(point - camPos) <= SPATIAL_SMOOTHING) {
+		if (point.z > params.camera.m_sensorDepthWorldMin && point.z < params.camera.m_sensorDepthWorldMax && length(point - camPos) <= 0.02f) {
 			// Append to neighbour list
 			unsigned int idx = atomicInc(&nidx[warp], MAX_NEIGHBORS_2-1);
 			neighborhood_cache[warp][idx] = point;
 @@ -450,48 +461,47 @@ __global__ void OLD_dibr_visibility_kernel(TextureObject<int> depth, int cam, Sp

 	__syncwarp();
 	
-	const float interval = (float(maximum[warp])/1000.0f - float(minimum[warp]) / 1000.0f) / float(MAX_ITERATIONS);
-	//if (y == 200) printf("interval: %f\n", interval);
+	const float minDepth = float(minimum[warp])/1000.0f;
+	const float maxDepth = float(maximum[warp])/1000.0f;
+	const float interval = (maxDepth - minDepth) / float(MAX_ITERATIONS);
+
+	if (minDepth >= params.camera.m_sensorDepthWorldMax) return;
+	if (maxDepth <= params.camera.m_sensorDepthWorldMin) return;
+	if (y == 200) printf("interval: %f\n", interval);
+
+	//const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);

 	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
 	// TODO: Don't do this step, simply update the current pixel to either fill or replace existing value
 	// use warp threads to do the iteration samples ... 32 samples per pixel.
 	// could iterate each thread to perform more checks within likely range.
-	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
-		const float u = (i % upsample) - (upsample / 2);
-		const float v = (i / upsample) - (upsample / 2);
+	//for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+	//	const float u = (i % upsample) - (upsample / 2);
+	//	const float v = (i / upsample) - (upsample / 2);

        // Make an initial estimate of the points location
 		// Use minimum z as first estimate
-		float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,float(minimum[warp])/1000.0f);
-		float lastenergy = 0.0f;
-		float lastdepth = nearest.z;
+		float maxenergy = 0.0f;
+		float bestdepth = -1.0f;

 		// Search for best or threshold energy
-		for (int k=0; k<MAX_ITERATIONS; ++k) {
+		for (int k=lane; k<MAX_ITERATIONS; k+=WARP_SIZE) {
+			const float3 nearest = params.camera.kinectDepthToSkeleton(x,y,minDepth+float(k)*interval);
+			const float myenergy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS_2>(neighborhood_cache[warp], nearest, nidx[warp], SPATIAL_SMOOTHING);
+			maxenergy = warpMaxEnergy(max(myenergy, maxenergy));
+			bestdepth = (myenergy == maxenergy) ? nearest.z : -1.0f;
+		}

-			const float energy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS_2>(neighborhood_cache[warp], nearest, nidx[warp], SPATIAL_SMOOTHING);
-			
-			//if (energy <= 0.0f) break;
-		
-			const float d = nearest.z;
-			nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d+interval);
-			
-			// Search for first energy maximum above a threshold
-			if (lastenergy >= 0.01f && energy < lastenergy) {
-				const unsigned int cx = screenPos.x+u;
-				const unsigned int cy = screenPos.y+v;
-				if (lastdepth > params.camera.m_sensorDepthWorldMin && lastdepth < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
-					// Transform estimated point to virtual cam space and output z
-					atomicMin(&depth(cx,cy), lastdepth * 1000.0f);
-				}
-				break;
+		// Search for first energy maximum above a threshold
+		if (bestdepth > 0.0f && maxenergy >= 0.01f) {
+			const unsigned int cx = x;
+			const unsigned int cy = y;
+			if (bestdepth > params.camera.m_sensorDepthWorldMin && bestdepth < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+				// Transform estimated point to virtual cam space and output z
+				atomicMin(&depth(cx,cy), bestdepth * 1000.0f);
 			}
-
-			lastenergy = energy;
-			lastdepth = d;
 		}
-	}
+	//}
 }

 // ===== Pass 2 and 3 : Attribute contributions ================================
 @@ -657,19 +667,22 @@ void ftl::cuda::dibr(const TextureObject<int> &depth_out,
 #endif

 	int i=3;
+
+	bool noSplatting = params.m_flags & ftl::render::kNoSplatting;
+
 	// Pass 1, gather and upsample depth maps
 	if (params.m_flags & ftl::render::kNoUpsampling) {
 		for (int i=0; i<numcams; ++i)
-			dibr_merge_kernel<<<gridSize, blockSize, 0, stream>>>(tmp_depth, i, params);
+			dibr_merge_kernel<<<gridSize, blockSize, 0, stream>>>((noSplatting) ? depth_out : tmp_depth, i, params);
 	} else {
 		for (int i=0; i<numcams; ++i)
-			dibr_merge_upsample_kernel<<<sgridSize, sblockSize, 0, stream>>>(tmp_depth, i, params);
+			dibr_merge_upsample_kernel<<<sgridSize, sblockSize, 0, stream>>>((noSplatting) ? depth_out : tmp_depth, i, params);
 	}

-	if (params.m_flags & ftl::render::kNoSplatting) {
+	if (noSplatting) {
 		// Pass 3, accumulate all point contributions to pixels
 		for (int i=0; i<numcams; ++i)
-        	dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(tmp_depth, tmp_colour, normal_out, confidence_out, i, params);
+        	dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, tmp_colour, normal_out, confidence_out, i, params);
 	} else {
 		// Pass 2
 		dibr_visibility_principal_kernel2<<<sgridSize, sblockSize, 0, stream>>>(tmp_depth, depth_out, params);