From db22e69f9ef70cdcd71ba80a5428b2f3f8fb8016 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Fri, 4 Oct 2019 21:33:54 +0300
Subject: [PATCH] Initial no-warp version: one thread per point, not one warp

---
 applications/reconstruct/src/ilw/ilw.cu | 26 +++++++++++--------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/applications/reconstruct/src/ilw/ilw.cu b/applications/reconstruct/src/ilw/ilw.cu
index e9d2049f8..aa8456545 100644
--- a/applications/reconstruct/src/ilw/ilw.cu
+++ b/applications/reconstruct/src/ilw/ilw.cu
@@ -44,8 +44,8 @@ __global__ void correspondence_energy_vector_kernel(
         Camera cam2, ftl::cuda::ILWParams params) {
 
     // Each warp picks point in p1
-    const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
-	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+    //const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x); // / WARP_SIZE;
     const int y = blockIdx.y*blockDim.y + threadIdx.y;
     
     const float3 world1 = make_float3(p1.tex2D(x, y));
@@ -63,8 +63,8 @@ __global__ void correspondence_energy_vector_kernel(
 
     // Project to p2 using cam2
     // Each thread takes a possible correspondence and calculates a weighting
-    const int lane = tid % WARP_SIZE;
-	for (int i=lane; i<COR_STEPS; i+=WARP_SIZE) {
+    //const int lane = tid % WARP_SIZE;
+	for (int i=0; i<COR_STEPS; ++i) {
         const float depth_adjust = (float)(i - (COR_STEPS / 2)) * step_interval + depth1;
 
         // Calculate adjusted depth 3D point in camera 2 space
@@ -111,14 +111,13 @@ __global__ void correspondence_energy_vector_kernel(
         //}
     }
 
-	count = warpSum(count);
-    const float mincost = warpMin(bestcost);
-	bool best = mincost == bestcost;
-	avgcost = warpSum(avgcost) / count;
+	//count = warpSum(count);
+    const float mincost = bestcost; //warpMin(bestcost);
+	//bool best = mincost == bestcost;
+	avgcost /= count;
     const float confidence = (params.flags & ftl::cuda::kILWFlag_ColourConfidenceOnly) ? avgcost : (avgcost - mincost);
 
-    // FIXME: Multiple threads in warp could match this.
-    if (best && mincost < 1.0f) {
+    if (mincost < 1.0f) {
         //float3 tvecA = pose1 * cam1.screenToCam(x, y, bestdepth);
         //float3 tvecB = pose1 * world1;
         //if (params.flags & ftl::cuda::kILWFlag_RestrictZ) {
@@ -143,9 +142,6 @@ __global__ void correspondence_energy_vector_kernel(
 		//eout(x,y) = max(eout(x, y), (1.0f - mincost) * confidence * (length(bestpoint-world1) / 0.04f) * 12.0f);
 		
 		//eout(x,y) = max(eout(x, y), confidence * 12.0f);
-    } else if (mincost >= 1.0f && lane == 0) {
-        //vout(x,y) = make_float4(0.0f);
-        //eout(x,y) = 0.0f;
     }
 }
 
@@ -163,8 +159,8 @@ void ftl::cuda::correspondence_energy_vector(
         const Camera &cam2, const ILWParams &params, int win,
         cudaStream_t stream) {
 
-    const dim3 gridSize((p1.width() + 2 - 1)/2, (p1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-    const dim3 blockSize(2*WARP_SIZE, T_PER_BLOCK);
+	const dim3 gridSize((p1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (p1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
     //printf("COR SIZE %d,%d\n", p1.width(), p1.height());
 
-- 
GitLab