#include "smoothing_cuda.hpp"
#include <ftl/cuda/weighting.hpp>
using ftl::cuda::TextureObject;
#define T_PER_BLOCK 8
#define WARP_SIZE 32
// ===== MLS Smooth ============================================================
/*
* Smooth depth map using Moving Least Squares
*/
template <int SEARCH_RADIUS>
__global__ void mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
float smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
float smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Colour MLS Smooth =====================================================
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <int SEARCH_RADIUS>
__global__ void colour_mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<uchar4> colour_in,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
normals_out(x,y) = normals_in(x,y);
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
const float cw = ftl::cuda::colourWeighting(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing*cw);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
if (contrib == 0.0f) return;
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::colour_mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: colour_mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 4: colour_mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 3: colour_mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 2: colour_mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 1: colour_mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Colour MLS using cross support region =================================
__device__ inline int segmentID(int u, int v) {
if (u < 0 && v < 0) return 0x001;
if (u > 0 && v < 0) return 0x002;
if (u > 0 && v > 0) return 0x004;
if (u < 0 && v > 0) return 0x008;
return 0;
}
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <bool FILLING, int RADIUS>
__global__ void colour_mls_smooth_csr_kernel(
TextureObject<uchar4> region,
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<uchar4> colour_in,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
normals_out(x,y) = normals_in(x,y);
if (d0 < camera.minDepth || d0 > camera.maxDepth) {
if(FILLING) d0 = 0.0f;
else return;
}
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
// Neighbourhood
uchar4 base = region.tex2D(x,y);
int segment_check = 0;
// TODO: Does using a fixed loop size with range test work better?
// Or with warp per pixel version, this would be less of a problem...
// TODO: Do a separate vote fill step?
for (int v=-RADIUS; v<=RADIUS; ++v) {
uchar4 baseY = region.tex2D(x,y+v);
#pragma unroll
for (int u=-RADIUS; u<=RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
//if (d > camera.minDepth && d < camera.maxDepth) {
float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -baseY.x || u > baseY.y || v < -base.z || v > base.z) ? 0.0f : 1.0f;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
// FIXME: Ensure bad normals are removed by setting depth invalid
//if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
// Allow missing point to borrow z value
// TODO: This is a bad choice of Z! Perhaps try histogram vote approach
//if (FILLING && d0 == 0.0f) X = camera.screenToCam((int)(x),(int)(y),Xi.z);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(X,Xi,smoothing*w);
aX += Xi*w;
nX += Ni*w;
contrib += w;
//if (FILLING && w > 0.0f && v > -base.z+1 && v < base.w-1 && u > -baseY.x+1 && u < baseY.y-1) segment_check |= segmentID(u,v);
//}
}
}
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
if (FILLING && d0 == 0.0f) {
if (__popc(segment_check) < 3) return;
X = camera.screenToCam((int)(x),(int)(y),aX.z);
}
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
}
void ftl::cuda::colour_mls_smooth_csr(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
bool filling,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
if (filling) {
colour_mls_smooth_csr_kernel<true,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
} else {
colour_mls_smooth_csr_kernel<false,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Cross Aggregate MLS ===================================================
__device__ inline float colourAsFloat(uchar4 c) {
union {
uchar4 col;
float f;
};
col = c;
return f;
}
__device__ inline uchar4 floatAsColour(float pf) {
union {
uchar4 col;
float f;
};
f = pf;
return col;
}
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It also uses
* cross support windows to prevent unwanted edge crossing. This function does
* the weighted aggregation of centroids and normals in the horizontal
* direction.
*/
template <int RADIUS>
__global__ void mls_aggr_horiz_kernel(
const uchar4* __restrict__ region,
size_t region_pitch,
const half4* __restrict__ normals_in,
size_t normals_in_pitch,
TextureObject<half4> normals_out,
const float* __restrict__ depth_in, // Virtual depth map
size_t depth_in_pitch,
TextureObject<float4> centroid_out, // Accumulated output
const uchar4* __restrict__ colour_in,
size_t colour_in_pitch,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
const float d0 = depth_in[y*depth_in_pitch+x];
const uchar4 c0 = colour_in[y*colour_in_pitch+x];
// Note: x and y flipped as output is rotated.
centroid_out(y,x) = make_float4(0.0f,0.0f,0.0f,colourAsFloat(c0));
normals_out(y,x) = normals_in[y*normals_in_pitch+x];
if (d0 > camera.minDepth && d0 < camera.maxDepth) {
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
// Cross-Support Neighbourhood
const uchar4 base = region[y*region_pitch+x];
const half4* __restrict__ nptr = normals_in + y*normals_in_pitch + x - RADIUS;
const float* __restrict__ dptr = depth_in + y*depth_in_pitch + x - RADIUS;
const uchar4* __restrict__ cptr = colour_in + y*colour_in_pitch + x - RADIUS;
#pragma unroll
for (int u=-RADIUS; u<=RADIUS; ++u) {
const float d = *(dptr++);
// If outside of cross support range, set weight to 0 to ignore
float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam(x+u,y,d);
const float3 Ni = make_float3(*(nptr++));
// Bad or missing normals should be ignored
if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
// Gauss approx colour weighting.
const uchar4 c = *(cptr++);
w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Note: x and y flipped since output is rotated 90 degrees.
centroid_out(y,x) = make_float4(aX, colourAsFloat(c0));
normals_out(y,x) = make_half4(nX / length(nX), 0.0f);
}
}
}
}
/**
* This function does a vertical weighted aggregation of the centroids and
* normals generated by the horizontal aggregation.
*/
template <int RADIUS>
__global__ void mls_aggr_vert_kernel(
const uchar4* __restrict__ region,
size_t region_pitch,
const half4* __restrict__ normals_in,
size_t normals_in_pitch,
TextureObject<half4> normals_out,
const float4* __restrict__ centroid_in, // Virtual depth map
size_t centroid_in_pitch,
TextureObject<float4> centroid_out, // Accumulated output
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
const float4 cin = centroid_in[x*centroid_in_pitch+y];
const float3 A = make_float3(cin);
const uchar4 c0 = floatAsColour(cin.w);
centroid_out(x,y) = make_float4(0.0f);
normals_out(x,y) = make_half4(0.0f);
if (A.z > camera.minDepth && A.z < camera.maxDepth) {
//float3 X = camera.screenToCam((int)(x),(int)(y),d0);
//const uchar4 c0 = colour_in[y*colour_in_pitch+x];
// Cross-Support Neighbourhood
const uchar4 base = region[y*region_pitch+x];
const half4* __restrict__ nptr = normals_in + x*normals_in_pitch + y - RADIUS;
const float4* __restrict__ ceptr = (centroid_in + x*centroid_in_pitch + y - RADIUS);
#pragma unroll
for (int v=-RADIUS; v<=RADIUS; ++v) {
// Note: x and y flipped, input image is rotated.
const float4 cin = *(ceptr++);
const float3 Ai = make_float3(cin);
const uchar4 c = floatAsColour(cin.w);
// If outside the cross support range, set weight to 0 to ignore
float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f;
// Note: x and y flipped, input image is rotated.
const float3 Ni = make_float3(*(nptr++));
// Bad normals should be ignored.
if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
// Gauss approx colour weighting.
//const uchar4 c = *(cptr+v*colour_in_pitch);
w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(A,Ai,A.z*smoothing*w);
aX += Ai*w;
nX += Ni*w;
contrib += w;
}
// Normalise the summed points and normals
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
centroid_out(x,y) = make_float4(aX, 0.0f);
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
}
}
}
/**
* Using the aggregated centroids and normals, calculate the signed-distance-
* field and move the depth value accordingly using the calculated normal.
*/
__global__ void mls_adjust_depth_kernel(
TextureObject<half4> normals_in,
TextureObject<float4> centroid_in, // Virtual depth map
TextureObject<float> depth_out,
TextureObject<float> depth_in,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) {
const float3 aX = make_float3(centroid_in(x,y));
const float3 nX = make_float3(normals_in(x,y));
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
if (aX.z > camera.minDepth && aX.z < camera.maxDepth) {
float3 X = camera.screenToCam((int)(x),(int)(y),aX.z);
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
depth_out(x,y) = X.z;
}
}
}
void ftl::cuda::mls_aggr_horiz(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float4> ¢roid_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 16;
static constexpr int THREADS_Y = 16;
const dim3 gridSize((normals_in.width() + THREADS_X - 1)/THREADS_X, (normals_in.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
switch(radius) {
case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
default: return;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
void ftl::cuda::mls_aggr_vert(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float4> ¢roid_in,
ftl::cuda::TextureObject<float4> ¢roid_out,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 4;
static constexpr int THREADS_Y = 32;
const dim3 gridSize((normals_out.width() + THREADS_X - 1)/THREADS_X, (normals_out.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
switch(radius) {
case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
default: return;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
void ftl::cuda::mls_adjust_depth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<float4> ¢roid_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<float> &depth_in,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 32;
static constexpr int THREADS_Y = 4;
const dim3 gridSize((depth_out.width() + THREADS_X - 1)/THREADS_X, (depth_out.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_out, depth_in, camera);
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Adaptive MLS Smooth =====================================================
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <int SEARCH_RADIUS>
__global__ void adaptive_mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<float> smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
normals_out(x,y) = normals_in(x,y);
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
//uchar4 c0 = colour_in.tex2D(x, y);
float smooth = smoothing(x,y);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smooth*0.5f);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
if (contrib <= 0.0f) return;
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::adaptive_mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<float> &smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: adaptive_mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 4: adaptive_mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 3: adaptive_mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 2: adaptive_mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 1: adaptive_mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}