Nicolas Pope
--- a/lib/libsgm/src/census_transform.cu

+ 72

− 10
+++ b/lib/libsgm/src/census_transform.cu

+ 72

− 10
 @@ -21,14 +21,15 @@ namespace sgm {

 namespace {

-static constexpr int WINDOW_WIDTH  = 9;
-static constexpr int WINDOW_HEIGHT = 7;
+static constexpr int WINDOW_WIDTH  = 5;
+static constexpr int WINDOW_HEIGHT = 5;

 static constexpr int BLOCK_SIZE = 128;
 static constexpr int LINES_PER_BLOCK = 16;

+/* Centre symmetric census */
 template <typename T>
-__global__ void census_transform_kernel(
+__global__ void cs_census_transform_kernel(
 	feature_type *dest,
 	const T *src,
 	int width,
 @@ -103,6 +104,57 @@ __global__ void census_transform_kernel(
 	}
 }

+template <typename T>
+__global__ void census_transform_kernel(
+	feature_type* __restrict__ dest,
+	const T* __restrict__ src,
+	int width,
+	int height,
+	int pitch)
+{
+	static constexpr int RADIUS_X = WINDOW_WIDTH/2;
+	static constexpr int RADIUS_Y = WINDOW_HEIGHT/2;
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;	
+
+	dest[x+y*width] = 0;
+
+	if (x >= RADIUS_X && y >= RADIUS_Y && x < width-RADIUS_X && y < height-RADIUS_Y) {
+		short center = src[y*pitch+x];
+		uint8_t i = 0; // bit counter for *out
+		// possible BUG in operator(), gets called more than once per pixel;
+		// local variable for sub-bitstring to avoid data race (no read
+		// dependency to out; writes are identical)
+		feature_type res = 0;
+
+		for (int wy = -RADIUS_Y; wy <= RADIUS_Y; wy++) {
+			for (int wx = -RADIUS_X; wx <= RADIUS_X; wx++) {
+				const int y_ = y + wy;
+				const int x_ = x + wx;
+
+				if (y == 0 && x == 0) {
+					continue;
+				}
+
+				// zero if first value, otherwise shift to left
+				res = (res << 1);
+				res |= (center < (src[y_*pitch+x_]) ? 1 : 0);
+
+				// if all bits set, continue to next element
+				/*if (++i % 64 == 0) {
+					*out = res;
+					out++;
+				}*/
+			}
+		}
+		//if ((i - 1)%64 != 0) {
+			// write remaining bits
+			dest[x+y*width] = res;
+		//}
+	}
+}
+
 template <typename T>
 void enqueue_census_transform(
 	feature_type *dest,
 @@ -112,13 +164,23 @@ void enqueue_census_transform(
 	int pitch,
 	cudaStream_t stream)
 {
-	const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
-	const int height_per_block = LINES_PER_BLOCK;
-	const dim3 gdim(
-		(width  + width_per_block  - 1) / width_per_block,
-		(height + height_per_block - 1) / height_per_block);
-	const dim3 bdim(BLOCK_SIZE);
-	census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	/* Disable the original center symmetric algorithm */
+	if (false) {
+		const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
+		const int height_per_block = LINES_PER_BLOCK;
+		const dim3 gdim(
+			(width  + width_per_block  - 1) / width_per_block,
+			(height + height_per_block - 1) / height_per_block);
+		const dim3 bdim(BLOCK_SIZE);
+		cs_census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	} else {
+		static constexpr int THREADS_X = 16;
+		static constexpr int THREADS_Y = 16;
+
+		const dim3 gdim((width + THREADS_X - 1)/THREADS_X, (height + THREADS_Y - 1)/THREADS_Y);
+		const dim3 bdim(THREADS_X, THREADS_Y);
+		census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+	}
 }

 }