Nicolas Pope
--- a/applications/reconstruct/src/ilw/ilw.cu

+ 215

− 105
+++ b/applications/reconstruct/src/ilw/ilw.cu

+ 215

− 105
 @@ -24,120 +24,214 @@ __device__ inline float warpSum(float e) {
 	return e;
 }

-//#define COR_WIN_RADIUS 17
-//#define COR_WIN_SIZE (COR_WIN_RADIUS * COR_WIN_RADIUS)
+//==============================================================================
+
+template <int RADIUS>
+__global__ void preprocess_kernel(
+    	ftl::cuda::TextureObject<float> depth_in,
+		ftl::cuda::TextureObject<float> depth_out,
+		ftl::cuda::TextureObject<uchar4> colour,
+		ftl::cuda::TextureObject<int> mask,
+		ftl::rgbd::Camera camera,
+		ftl::cuda::ILWParams params) {
+
+    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	float d = depth_in.tex2D((int)x,(int)y);
+	uchar4 c = colour.tex2D((int)x,(int)y);
+
+	// Calculate discontinuity mask
+
+	// Fill missing depths
+	if (d < camera.minDepth || d > camera.maxDepth) {
+		float depth_accum = 0.0f;
+		float contrib = 0.0f;
+
+		for (int v=-RADIUS; v<=RADIUS; ++v) {
+			for (int u=-RADIUS; u<=RADIUS; ++u) {
+				uchar4 c2 = colour.tex2D((int)x+u,(int)y+v);
+				float d2 = depth_in.tex2D((int)x+u,(int)y+v);
+				if (d2 >= camera.minDepth && d2 <= camera.maxDepth) {
+					float w = ftl::cuda::colourWeighting(c, c2, params.colour_smooth);
+					depth_accum += d2*w;
+					contrib += w;
+				}
+			}
+		}
+
+		if (contrib >= 0.0f) d = depth_accum / contrib;
+	}
+
+	depth_out(x,y) = d;
+}
+
+void ftl::cuda::preprocess_depth(
+		ftl::cuda::TextureObject<float> &depth_in,
+		ftl::cuda::TextureObject<float> &depth_out,
+		ftl::cuda::TextureObject<uchar4> &colour,
+		ftl::cuda::TextureObject<int> &mask,
+		const ftl::rgbd::Camera &camera,
+		const ftl::cuda::ILWParams &params,
+		cudaStream_t stream) {
+
+	const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	preprocess_kernel<3><<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, colour, mask, camera, params);
+
+	cudaSafeCall( cudaGetLastError() );
+}
+
+//==============================================================================
+
+template<int FUNCTION>
+__device__ float weightFunction(const ftl::cuda::ILWParams &params, float dweight, float cweight);
+
+template <>
+__device__ inline float weightFunction<0>(const ftl::cuda::ILWParams &params, float dweight, float cweight) {
+	return (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
+}
+
+template <>
+__device__ inline float weightFunction<1>(const ftl::cuda::ILWParams &param, float dweight, float cweight) {
+	return (cweight * cweight * dweight);
+}

-template<int COR_WIN_RADIUS> 
+template <>
+__device__ inline float weightFunction<2>(const ftl::cuda::ILWParams &param, float dweight, float cweight) {
+	return (dweight * dweight * cweight);
+}
+
+template <>
+__device__ inline float weightFunction<3>(const ftl::cuda::ILWParams &params, float dweight, float cweight) {
+	return (dweight == 0.0f) ? 0.0f : (params.cost_ratio * (cweight) + (1.0f - params.cost_ratio) * dweight);
+}
+
+template <>
+__device__ inline float weightFunction<4>(const ftl::cuda::ILWParams &params, float dweight, float cweight) {
+	return cweight;
+}
+
+template<int COR_STEPS, int FUNCTION> 
 __global__ void correspondence_energy_vector_kernel(
-        TextureObject<float4> p1,
-        TextureObject<float4> p2,
+        TextureObject<float> d1,
+        TextureObject<float> d2,
        TextureObject<uchar4> c1,
        TextureObject<uchar4> c2,
-        TextureObject<float4> vout,
-        TextureObject<float> eout,
-        float4x4 pose1,  // Inverse
+        TextureObject<float> dout,
+        TextureObject<float> conf,
+        float4x4 pose1,
+        float4x4 pose1_inv,
        float4x4 pose2,  // Inverse
+        Camera cam1,
        Camera cam2, ftl::cuda::ILWParams params) {

    // Each warp picks point in p1
-    const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
-	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+    //const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x); // / WARP_SIZE;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;
    
-    const float3 world1 = make_float3(p1.tex2D(x, y));
+    //const float3 world1 = make_float3(p1.tex2D(x, y));
+    const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z;  // Initial starting depth
+	if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return;
+
+	// TODO: Temporary hack to ensure depth1 is present
+	//const float4 temp = vout.tex2D(x,y);
+	//vout(x,y) =  make_float4(depth1, 0.0f, temp.z, temp.w);
+	
+	const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1);
+
    const uchar4 colour1 = c1.tex2D(x, y);
-	if (world1.x == MINF) return;
-    const float3 camPos2 = pose2 * world1;
-    const uint2 screen2 = cam2.camToScreen<uint2>(camPos2);
-    
-    float bestcost = 1.1f;
-    float avgcost = 0.0f;
-	float3 bestpoint;
+
+	float bestdepth = 0.0f;
+	float bestweight = 0.0f;
+	float bestcolour = 0.0f;
+	float totalcolour = 0.0f;
 	int count = 0;
+	float contrib = 0.0f;
+    
+	const float step_interval = params.range / (COR_STEPS / 2);
+	
+	const float3 rayStep_world = pose1.getFloat3x3() * cam1.screenToCam(x,y,step_interval);
+	const float3 rayStart_2 = pose2 * world1;
+	const float3 rayStep_2 = pose2.getFloat3x3() * rayStep_world;

    // Project to p2 using cam2
    // Each thread takes a possible correspondence and calculates a weighting
-    const int lane = tid % WARP_SIZE;
-	for (int i=lane; i<COR_WIN_RADIUS*COR_WIN_RADIUS; i+=WARP_SIZE) {
-		const float u = (i % COR_WIN_RADIUS) - (COR_WIN_RADIUS / 2);
-        const float v = (i / COR_WIN_RADIUS) - (COR_WIN_RADIUS / 2);
-        
-        const float3 world2 = make_float3(p2.tex2D(screen2.x+u, screen2.y+v));
-        if ((params.flags & ftl::cuda::kILWFlag_IgnoreBad) && world2.x == MINF) continue;
-        const uchar4 colour2 = c2.tex2D(screen2.x+u, screen2.y+v);
-
-        // Determine degree of correspondence
-		float cost = 1.0f - ftl::cuda::spatialWeighting(world1, world2, params.spatial_smooth);
-		// Point is too far away to even count
-		if (world2.x != MINF && cost == 1.0f) continue;
-
-        // Mix ratio of colour and distance costs
-        const float ccost = 1.0f - ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
-        if ((params.flags & ftl::cuda::kILWFlag_SkipBadColour) && ccost == 1.0f) continue;
-        cost = params.cost_ratio * (ccost) + (1.0f - params.cost_ratio) * cost;
-        //cost /= 2.0f;
+    //const int lane = tid % WARP_SIZE;
+	for (int i=0; i<COR_STEPS; ++i) {
+		const int j = i - (COR_STEPS/2);
+		const float depth_adjust = (float)j * step_interval + depth1;
+
+        // Calculate adjusted depth 3D point in camera 2 space
+        const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust));
+        const float3 camPos = rayStart_2 + j * rayStep_2; //pose2 * worldPos;
+        const uint2 screen = cam2.camToScreen<uint2>(camPos);
+
+        if (screen.x >= cam2.width || screen.y >= cam2.height) continue;
+
+		// Generate a depth correspondence value
+		const float depth2 = d2.tex2D((int)screen.x, (int)screen.y);
+		const float dweight = ftl::cuda::weighting(fabs(depth2 - camPos.z), params.spatial_smooth);
+		//const float dweight = ftl::cuda::weighting(fabs(depth_adjust - depth1), 2.0f*params.range);
+		
+		// Generate a colour correspondence value
+		const uchar4 colour2 = c2.tex2D((int)screen.x, (int)screen.y);
+		const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth);
+
+		const float weight = weightFunction<FUNCTION>(params, dweight, cweight);

 		++count;
-		avgcost += cost;
-        if (world2.x != MINF && cost < bestcost) {
-            bestpoint = world2;
-            bestcost = cost;
-        }
+		contrib += weight;
+		bestcolour = max(cweight, bestcolour);
+		totalcolour += cweight;
+		if (weight > bestweight) {
+			bestweight = weight;
+			bestdepth = depth_adjust;
+		}
    }

-	count = warpSum(count);
-    const float mincost = warpMin(bestcost);
-	bool best = mincost == bestcost;
-	avgcost = warpSum(avgcost) / count;
-    const float confidence = (avgcost - mincost);
-
-    if (best && mincost < 1.0f) {
-        float3 tvecA = pose1 * bestpoint;
-        float3 tvecB = pose1 * world1;
-        if (params.flags & ftl::cuda::kILWFlag_RestrictZ) {
-            tvecA.x = tvecB.x;
-            tvecA.y = tvecB.y;
-        }
-        tvecA = (pose1.getInverse() * tvecA) - world1;
-        vout(x,y) = vout.tex2D(x, y) + make_float4(
-            tvecA.x, // * (1.0f - mincost) * confidence,
-            tvecA.y, // * (1.0f - mincost) * confidence,
-            tvecA.z, // * (1.0f - mincost) * confidence,
-			(1.0f - mincost) * confidence);
-			
-		//eout(x,y) = max(eout(x,y), (length(bestpoint-world1) / 0.04f) * 7.0f);
-		//eout(x,y) = max(eout(x,y), (1.0f - mincost) * 7.0f);
-		//eout(x,y) = max(eout(x, y), (1.0f - mincost) * confidence * (length(bestpoint-world1) / 0.04f) * 12.0f);
-		eout(x,y) = max(eout(x, y), (1.0f - mincost) * confidence * 12.0f);
-		//eout(x,y) = max(eout(x, y), confidence * 12.0f);
-    } else if (mincost >= 1.0f && lane == 0) {
-        //vout(x,y) = make_float4(0.0f);
-        //eout(x,y) = 0.0f;
+	const float avgcolour = totalcolour/(float)count;
+    const float confidence = bestcolour / totalcolour; //bestcolour - avgcolour;
+
+    if (bestweight > 0.0f) {
+        float old = conf.tex2D(x,y);
+
+        if (bestweight * confidence > old) {
+			dout(x,y) = bestdepth;
+			conf(x,y) = bestweight * confidence;
+		}
    }
 }

-void ftl::cuda::correspondence_energy_vector(
-        TextureObject<float4> &p1,
-        TextureObject<float4> &p2,
+void ftl::cuda::correspondence(
+        TextureObject<float> &d1,
+        TextureObject<float> &d2,
        TextureObject<uchar4> &c1,
        TextureObject<uchar4> &c2,
-        TextureObject<float4> &vout,
-        TextureObject<float> &eout,
+        TextureObject<float> &dout,
+        TextureObject<float> &conf,
        float4x4 &pose1,
+        float4x4 &pose1_inv,
        float4x4 &pose2,
-        const Camera &cam2, const ILWParams &params, int win,
+        const Camera &cam1,
+        const Camera &cam2, const ILWParams &params, int func,
        cudaStream_t stream) {

-    const dim3 gridSize((p1.width() + 2 - 1)/2, (p1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(2*WARP_SIZE, T_PER_BLOCK);
+	const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    //printf("COR SIZE %d,%d\n", p1.width(), p1.height());

-    switch (win) {
-    case 17     : correspondence_energy_vector_kernel<17><<<gridSize, blockSize, 0, stream>>>(p1, p2, c1, c2, vout, eout, pose1, pose2, cam2, params); break;
-    case 9      : correspondence_energy_vector_kernel<9><<<gridSize, blockSize, 0, stream>>>(p1, p2, c1, c2, vout, eout, pose1, pose2, cam2, params); break;
-    case 5      : correspondence_energy_vector_kernel<5><<<gridSize, blockSize, 0, stream>>>(p1, p2, c1, c2, vout, eout, pose1, pose2, cam2, params); break;
-    }
+	switch (func) {
+    case 0: correspondence_energy_vector_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, dout, conf, pose1, pose1_inv, pose2, cam1, cam2, params); break;
+	case 1: correspondence_energy_vector_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, dout, conf, pose1, pose1_inv, pose2, cam1, cam2, params); break;
+	case 2: correspondence_energy_vector_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, dout, conf, pose1, pose1_inv, pose2, cam1, cam2, params); break;
+	case 3: correspondence_energy_vector_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, dout, conf, pose1, pose1_inv, pose2, cam1, cam2, params); break;
+	case 4: correspondence_energy_vector_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, dout, conf, pose1, pose1_inv, pose2, cam1, cam2, params); break;
+	}
+
    cudaSafeCall( cudaGetLastError() );
 }

 @@ -147,58 +241,74 @@ void ftl::cuda::correspondence_energy_vector(

 template <int MOTION_RADIUS>
 __global__ void move_points_kernel(
-    ftl::cuda::TextureObject<float4> p,
-    ftl::cuda::TextureObject<float4> ev,
+    ftl::cuda::TextureObject<float> d_old,
+	ftl::cuda::TextureObject<float> d_new,
+	ftl::cuda::TextureObject<float> conf,
    ftl::rgbd::Camera camera,
+	float4x4 pose,
+	ftl::cuda::ILWParams params,
    float rate) {

    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float d0_new = d_new.tex2D((int)x,(int)y);
+	const float d0_old = d_old.tex2D((int)x,(int)y);
+	if (d0_new == 0.0f) return;  // No correspondence found
    
-    if (x < p.width() && y < p.height()) {
-		const float4 world = p(x,y);
-		if (world.x == MINF) return;
+    if (x < d_old.width() && y < d_old.height()) {
+		//const float4 world = p(x,y);
+		//if (world.x == MINF) return;

-		float4 vec = make_float4(0.0f, 0.0f, 0.0f, 0.0f); //ev.tex2D((int)x,(int)y);
+		float delta = 0.0f; //make_float4(0.0f, 0.0f, 0.0f, 0.0f); //ev.tex2D((int)x,(int)y);
 		float contrib = 0.0f;

 		// Calculate screen space distortion with neighbours
 		for (int v=-MOTION_RADIUS; v<=MOTION_RADIUS; ++v) {
 			for (int u=-MOTION_RADIUS; u<=MOTION_RADIUS; ++u) {
-				const float4 vecn = ev.tex2D((int)x+u,(int)y+v);
-				const float3 pn = make_float3(p.tex2D((int)x+u,(int)y+v));
-				if (pn.x == MINF) continue;
+				const float dn_new = d_new.tex2D((int)x+u,(int)y+v);
+				const float dn_old = d_old.tex2D((int)x+u,(int)y+v);
+				const float confn = conf.tex2D((int)x+u,(int)y+v);
+				//const float3 pn = make_float3(p.tex2D((int)x+u,(int)y+v));
+				//if (pn.x == MINF) continue;
+				if (dn_new == 0.0f) continue;  // Neighbour has no new correspondence

-				const float s = ftl::cuda::spatialWeighting(pn, make_float3(world), 0.01f);
-				contrib += vecn.w * s;
-				vec += vecn.w * s * vecn;
+				const float s = ftl::cuda::spatialWeighting(camera.screenToCam(x,y,d0_new), camera.screenToCam(x+u,y+v,dn_new), params.range); // ftl::cuda::weighting(fabs(d0_new - dn_new), params.range);
+				contrib += (confn+0.01f) * s;
+				delta += (confn+0.01f) * s * ((confn == 0.0f) ? dn_old : dn_new);
 			}
 		}

-        if (vec.w > 0.0f) {
-            p(x,y) = world + rate * (vec / contrib);
+        if (contrib > 0.0f) {
+            //const float3 newworld = pose * camera.screenToCam(x, y, vec0.x + rate * ((delta / contrib) - vec0.x));
+			//p(x,y) = make_float4(newworld, world.w); //world + rate * (vec / contrib);
+			
+			d_old(x,y) = d0_old + rate * ((delta / contrib) - d0_old);
        }
    }
 }


 void ftl::cuda::move_points(
-        ftl::cuda::TextureObject<float4> &p,
-        ftl::cuda::TextureObject<float4> &v,
+        ftl::cuda::TextureObject<float> &d_old,
+		ftl::cuda::TextureObject<float> &d_new,
+		ftl::cuda::TextureObject<float> &conf,
        const ftl::rgbd::Camera &camera,
+		const float4x4 &pose,
+		const ftl::cuda::ILWParams &params,
        float rate,
        int radius,
        cudaStream_t stream) {

-    const dim3 gridSize((p.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (p.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 gridSize((d_old.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d_old.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    switch (radius) {
-    case 9 : move_points_kernel<9><<<gridSize, blockSize, 0, stream>>>(p,v,camera,rate); break;
-    case 5 : move_points_kernel<5><<<gridSize, blockSize, 0, stream>>>(p,v,camera,rate); break;
-    case 3 : move_points_kernel<3><<<gridSize, blockSize, 0, stream>>>(p,v,camera,rate); break;
-    case 1 : move_points_kernel<1><<<gridSize, blockSize, 0, stream>>>(p,v,camera,rate); break;
-    case 0 : move_points_kernel<0><<<gridSize, blockSize, 0, stream>>>(p,v,camera,rate); break;
+    case 9 : move_points_kernel<9><<<gridSize, blockSize, 0, stream>>>(d_old,d_new,conf,camera, pose, params, rate); break;
+    case 5 : move_points_kernel<5><<<gridSize, blockSize, 0, stream>>>(d_old,d_new,conf,camera, pose, params, rate); break;
+    case 3 : move_points_kernel<3><<<gridSize, blockSize, 0, stream>>>(d_old,d_new,conf,camera, pose, params, rate); break;
+    case 1 : move_points_kernel<1><<<gridSize, blockSize, 0, stream>>>(d_old,d_new,conf,camera, pose, params, rate); break;
+    case 0 : move_points_kernel<0><<<gridSize, blockSize, 0, stream>>>(d_old,d_new,conf,camera, pose, params, rate); break;
    }

    cudaSafeCall( cudaGetLastError() );