Skip to content
Snippets Groups Projects

Use 5x5 proper census for SGM

Merged Nicolas Pope requested to merge exp/ct5 into master
1 file
+ 72
10
Compare changes
  • Side-by-side
  • Inline
@@ -21,14 +21,15 @@ namespace sgm {
namespace {
static constexpr int WINDOW_WIDTH = 9;
static constexpr int WINDOW_HEIGHT = 7;
static constexpr int WINDOW_WIDTH = 5;
static constexpr int WINDOW_HEIGHT = 5;
static constexpr int BLOCK_SIZE = 128;
static constexpr int LINES_PER_BLOCK = 16;
/* Centre symmetric census */
template <typename T>
__global__ void census_transform_kernel(
__global__ void cs_census_transform_kernel(
feature_type *dest,
const T *src,
int width,
@@ -103,6 +104,57 @@ __global__ void census_transform_kernel(
}
}
template <typename T>
__global__ void census_transform_kernel(
feature_type* __restrict__ dest,
const T* __restrict__ src,
int width,
int height,
int pitch)
{
static constexpr int RADIUS_X = WINDOW_WIDTH/2;
static constexpr int RADIUS_Y = WINDOW_HEIGHT/2;
const int x = (blockIdx.x*blockDim.x + threadIdx.x);
const int y = blockIdx.y*blockDim.y + threadIdx.y;
dest[x+y*width] = 0;
if (x >= RADIUS_X && y >= RADIUS_Y && x < width-RADIUS_X && y < height-RADIUS_Y) {
short center = src[y*pitch+x];
uint8_t i = 0; // bit counter for *out
// possible BUG in operator(), gets called more than once per pixel;
// local variable for sub-bitstring to avoid data race (no read
// dependency to out; writes are identical)
feature_type res = 0;
for (int wy = -RADIUS_Y; wy <= RADIUS_Y; wy++) {
for (int wx = -RADIUS_X; wx <= RADIUS_X; wx++) {
const int y_ = y + wy;
const int x_ = x + wx;
if (y == 0 && x == 0) {
continue;
}
// zero if first value, otherwise shift to left
res = (res << 1);
res |= (center < (src[y_*pitch+x_]) ? 1 : 0);
// if all bits set, continue to next element
/*if (++i % 64 == 0) {
*out = res;
out++;
}*/
}
}
//if ((i - 1)%64 != 0) {
// write remaining bits
dest[x+y*width] = res;
//}
}
}
template <typename T>
void enqueue_census_transform(
feature_type *dest,
@@ -112,13 +164,23 @@ void enqueue_census_transform(
int pitch,
cudaStream_t stream)
{
const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
const int height_per_block = LINES_PER_BLOCK;
const dim3 gdim(
(width + width_per_block - 1) / width_per_block,
(height + height_per_block - 1) / height_per_block);
const dim3 bdim(BLOCK_SIZE);
census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
/* Disable the original center symmetric algorithm */
if (false) {
const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
const int height_per_block = LINES_PER_BLOCK;
const dim3 gdim(
(width + width_per_block - 1) / width_per_block,
(height + height_per_block - 1) / height_per_block);
const dim3 bdim(BLOCK_SIZE);
cs_census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
} else {
static constexpr int THREADS_X = 16;
static constexpr int THREADS_Y = 16;
const dim3 gdim((width + THREADS_X - 1)/THREADS_X, (height + THREADS_Y - 1)/THREADS_Y);
const dim3 bdim(THREADS_X, THREADS_Y);
census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
}
}
}
Loading