From dc8c96375515b7aad2af739e55a565223ddd6c88 Mon Sep 17 00:00:00 2001
From: Nicolas Pope <nwpope@utu.fi>
Date: Thu, 22 Oct 2020 15:53:34 +0300
Subject: [PATCH] Use 5x5 standard census for SGM

---
 lib/libsgm/src/census_transform.cu | 82 ++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 10 deletions(-)

diff --git a/lib/libsgm/src/census_transform.cu b/lib/libsgm/src/census_transform.cu
index 70f67bcff..2687bc1bf 100644
--- a/lib/libsgm/src/census_transform.cu
+++ b/lib/libsgm/src/census_transform.cu
@@ -21,14 +21,15 @@ namespace sgm {
 
 namespace {
 
-static constexpr int WINDOW_WIDTH  = 9;
-static constexpr int WINDOW_HEIGHT = 7;
+static constexpr int WINDOW_WIDTH  = 5;
+static constexpr int WINDOW_HEIGHT = 5;
 
 static constexpr int BLOCK_SIZE = 128;
 static constexpr int LINES_PER_BLOCK = 16;
 
+/* Centre symmetric census */
 template <typename T>
-__global__ void census_transform_kernel(
+__global__ void cs_census_transform_kernel(
 	feature_type *dest,
 	const T *src,
 	int width,
@@ -103,6 +104,57 @@ __global__ void census_transform_kernel(
 	}
 }
 
+template <typename T>
+__global__ void census_transform_kernel(
+	feature_type* __restrict__ dest,
+	const T* __restrict__ src,
+	int width,
+	int height,
+	int pitch)
+{
+	static constexpr int RADIUS_X = WINDOW_WIDTH/2;
+	static constexpr int RADIUS_Y = WINDOW_HEIGHT/2;
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;	
+
+	dest[x+y*width] = 0;
+
+	if (x >= RADIUS_X && y >= RADIUS_Y && x < width-RADIUS_X && y < height-RADIUS_Y) {
+		short center = src[y*pitch+x];
+		uint8_t i = 0; // bit counter for *out
+		// possible BUG in operator(), gets called more than once per pixel;
+		// local variable for sub-bitstring to avoid data race (no read
+		// dependency to out; writes are identical)
+		feature_type res = 0;
+
+		for (int wy = -RADIUS_Y; wy <= RADIUS_Y; wy++) {
+			for (int wx = -RADIUS_X; wx <= RADIUS_X; wx++) {
+				const int y_ = y + wy;
+				const int x_ = x + wx;
+
+				if (y == 0 && x == 0) {
+					continue;
+				}
+
+				// zero if first value, otherwise shift to left
+				res = (res << 1);
+				res |= (center < (src[y_*pitch+x_]) ? 1 : 0);
+
+				// if all bits set, continue to next element
+				/*if (++i % 64 == 0) {
+					*out = res;
+					out++;
+				}*/
+			}
+		}
+		//if ((i - 1)%64 != 0) {
+			// write remaining bits
+			dest[x+y*width] = res;
+		//}
+	}
+}
+
 template <typename T>
 void enqueue_census_transform(
 	feature_type *dest,
@@ -112,13 +164,23 @@ void enqueue_census_transform(
 	int pitch,
 	cudaStream_t stream)
 {
-	const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
-	const int height_per_block = LINES_PER_BLOCK;
-	const dim3 gdim(
-		(width  + width_per_block  - 1) / width_per_block,
-		(height + height_per_block - 1) / height_per_block);
-	const dim3 bdim(BLOCK_SIZE);
-	census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	/* Disable the original center symmetric algorithm */
+	if (false) {
+		const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
+		const int height_per_block = LINES_PER_BLOCK;
+		const dim3 gdim(
+			(width  + width_per_block  - 1) / width_per_block,
+			(height + height_per_block - 1) / height_per_block);
+		const dim3 bdim(BLOCK_SIZE);
+		cs_census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	} else {
+		static constexpr int THREADS_X = 16;
+		static constexpr int THREADS_Y = 16;
+
+		const dim3 gdim((width + THREADS_X - 1)/THREADS_X, (height + THREADS_Y - 1)/THREADS_Y);
+		const dim3 bdim(THREADS_X, THREADS_Y);
+		census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	}
 }
 
 }
-- 
GitLab