Newer
Older
#include "smoothing_cuda.hpp"
#include <ftl/cuda/weighting.hpp>
using ftl::cuda::TextureObject;
#define T_PER_BLOCK 8
// ===== MLS Smooth ============================================================
/*
* Smooth depth map using Moving Least Squares
*/
template <int SEARCH_RADIUS>
__global__ void mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
float smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
float smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Colour MLS Smooth =====================================================
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <int SEARCH_RADIUS>
__global__ void colour_mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<uchar4> colour_in,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
const float cw = ftl::cuda::colourWeighting(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing*cw);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::colour_mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: colour_mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 4: colour_mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 3: colour_mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 2: colour_mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
case 1: colour_mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Colour MLS using cross support region =================================
__device__ inline int segmentID(int u, int v) {
if (u < 0 && v < 0) return 0x001;
if (u > 0 && v < 0) return 0x002;
if (u > 0 && v > 0) return 0x004;
if (u < 0 && v > 0) return 0x008;
return 0;
}
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <bool FILLING, int RADIUS>
__global__ void colour_mls_smooth_csr_kernel(
TextureObject<uchar4> region,
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<uchar4> colour_in,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
normals_out(x,y) = normals_in(x,y);
if (d0 < camera.minDepth || d0 > camera.maxDepth) {
if(FILLING) d0 = 0.0f;
else return;
}
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
float4 c0 = colour_in.tex2D((float)x+0.5f, (float)y+0.5f);
// Neighbourhood
uchar4 base = region.tex2D(x,y);
// TODO: Does using a fixed loop size with range test work better?
// Or with warp per pixel version, this would be less of a problem...
// TODO: Do a separate vote fill step?
for (int v=-RADIUS; v<=RADIUS; ++v) {
uchar4 baseY = region.tex2D(x,y+v);
#pragma unroll
for (int u=-RADIUS; u<=RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
//if (d > camera.minDepth && d < camera.maxDepth) {
float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -baseY.x || u > baseY.y || v < -base.z || v > base.z) ? 0.0f : 1.0f;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
// FIXME: Ensure bad normals are removed by setting depth invalid
//if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
const float4 c = colour_in.tex2D(float(x+u) + 0.5f, float(y+v) + 0.5f);
w *= ftl::cuda::colourWeighting(c0,c,colour_smoothing);
// Allow missing point to borrow z value
// TODO: This is a bad choice of Z! Perhaps try histogram vote approach
//if (FILLING && d0 == 0.0f) X = camera.screenToCam((int)(x),(int)(y),Xi.z);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(X,Xi,smoothing*w);
//if (FILLING && w > 0.0f && v > -base.z+1 && v < base.w-1 && u > -baseY.x+1 && u < baseY.y-1) segment_check |= segmentID(u,v);
//}
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
if (FILLING && d0 == 0.0f) {
if (__popc(segment_check) < 3) return;
X = camera.screenToCam((int)(x),(int)(y),aX.z);
}
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::colour_mls_smooth_csr(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
colour_mls_smooth_csr_kernel<true,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
colour_mls_smooth_csr_kernel<false,5><<<gridSize, blockSize, 0, stream>>>(region, normals_in, normals_out, depth_in, depth_out, colour_in, smoothing, colour_smoothing, camera);
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Cross Aggregate MLS ===================================================
__device__ inline float colourAsFloat(uchar4 c) {
union {
uchar4 col;
float f;
};
col = c;
return f;
}
__device__ inline uchar4 floatAsColour(float pf) {
union {
uchar4 col;
float f;
};
f = pf;
return col;
}
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It also uses
* cross support windows to prevent unwanted edge crossing. This function does
* the weighted aggregation of centroids and normals in the horizontal
* direction.
*/
template <int RADIUS>
__global__ void mls_aggr_horiz_kernel(
const uchar4* __restrict__ region,
size_t region_pitch,
const half4* __restrict__ normals_in,
size_t normals_in_pitch,
TextureObject<half4> normals_out,
const float* __restrict__ depth_in, // Virtual depth map
size_t depth_in_pitch,
TextureObject<float4> centroid_out, // Accumulated output
const uchar4* __restrict__ colour_in,
size_t colour_in_pitch,
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
const float d0 = depth_in[y*depth_in_pitch+x];
const uchar4 c0 = colour_in[y*colour_in_pitch+x];
// Note: x and y flipped as output is rotated.
centroid_out(y,x) = make_float4(0.0f,0.0f,0.0f,colourAsFloat(c0));
normals_out(y,x) = normals_in[y*normals_in_pitch+x];
if (d0 > camera.minDepth && d0 < camera.maxDepth) {
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
// Cross-Support Neighbourhood
const uchar4 base = region[y*region_pitch+x];
const half4* __restrict__ nptr = normals_in + y*normals_in_pitch + x - RADIUS;
const float* __restrict__ dptr = depth_in + y*depth_in_pitch + x - RADIUS;
const uchar4* __restrict__ cptr = colour_in + y*colour_in_pitch + x - RADIUS;
#pragma unroll
for (int u=-RADIUS; u<=RADIUS; ++u) {
const float d = *(dptr++);
// If outside of cross support range, set weight to 0 to ignore
float w = (d <= camera.minDepth || d >= camera.maxDepth || u < -base.x || u > base.y) ? 0.0f : 1.0f;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam(x+u,y,d);
const float3 Ni = make_float3(*(nptr++));
// Bad or missing normals should be ignored
if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
// Gauss approx colour weighting.
const uchar4 c = *(cptr++);
w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(X,Xi,d0*smoothing*w);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Note: x and y flipped since output is rotated 90 degrees.
centroid_out(y,x) = make_float4(aX, colourAsFloat(c0));
normals_out(y,x) = make_half4(nX / length(nX), 0.0f);
}
}
}
}
/**
* This function does a vertical weighted aggregation of the centroids and
* normals generated by the horizontal aggregation.
*/
template <int RADIUS>
__global__ void mls_aggr_vert_kernel(
const uchar4* __restrict__ region,
size_t region_pitch,
const half4* __restrict__ normals_in,
size_t normals_in_pitch,
TextureObject<half4> normals_out,
const float4* __restrict__ centroid_in, // Virtual depth map
size_t centroid_in_pitch,
TextureObject<float4> centroid_out, // Accumulated output
float smoothing,
float colour_smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= RADIUS && y >= RADIUS && x < camera.width-RADIUS && y < camera.height-RADIUS) {
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
const float4 cin = centroid_in[x*centroid_in_pitch+y];
const float3 A = make_float3(cin);
const uchar4 c0 = floatAsColour(cin.w);
centroid_out(x,y) = make_float4(0.0f);
normals_out(x,y) = make_half4(0.0f);
if (A.z > camera.minDepth && A.z < camera.maxDepth) {
//float3 X = camera.screenToCam((int)(x),(int)(y),d0);
//const uchar4 c0 = colour_in[y*colour_in_pitch+x];
// Cross-Support Neighbourhood
const uchar4 base = region[y*region_pitch+x];
const half4* __restrict__ nptr = normals_in + x*normals_in_pitch + y - RADIUS;
const float4* __restrict__ ceptr = (centroid_in + x*centroid_in_pitch + y - RADIUS);
#pragma unroll
for (int v=-RADIUS; v<=RADIUS; ++v) {
// Note: x and y flipped, input image is rotated.
const float4 cin = *(ceptr++);
const float3 Ai = make_float3(cin);
const uchar4 c = floatAsColour(cin.w);
// If outside the cross support range, set weight to 0 to ignore
float w = (Ai.z <= camera.minDepth || Ai.z >= camera.maxDepth || v < -base.z || v > base.w) ? 0.0f : 1.0f;
// Note: x and y flipped, input image is rotated.
const float3 Ni = make_float3(*(nptr++));
// Bad normals should be ignored.
if (Ni.x+Ni.y+Ni.z == 0.0f) w = 0.0f;
// Gauss approx colour weighting.
//const uchar4 c = *(cptr+v*colour_in_pitch);
w *= ftl::cuda::colourWeighting2(c0,c,colour_smoothing);
// Gauss approx weighting function using point distance
w = ftl::cuda::spatialWeighting(A,Ai,A.z*smoothing*w);
aX += Ai*w;
nX += Ni*w;
contrib += w;
}
// Normalise the summed points and normals
if (contrib > 0.0f) {
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
centroid_out(x,y) = make_float4(aX, 0.0f);
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
}
}
}
/**
* Using the aggregated centroids and normals, calculate the signed-distance-
* field and move the depth value accordingly using the calculated normal.
*/
__global__ void mls_adjust_depth_kernel(
TextureObject<float4> centroid_in, // Virtual depth map
TextureObject<float> depth_out,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) {
const float3 aX = make_float3(centroid_in(x,y));
const float3 nX = make_float3(normals_in(x,y));
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
if (aX.z > camera.minDepth && aX.z < camera.maxDepth) {
float3 X = camera.screenToCam((int)(x),(int)(y),aX.z);
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
depth_out(x,y) = X.z;
}
}
}
void ftl::cuda::mls_aggr_horiz(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float4> ¢roid_out,
ftl::cuda::TextureObject<uchar4> &colour_in,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 16;
static constexpr int THREADS_Y = 16;
const dim3 gridSize((normals_in.width() + THREADS_X - 1)/THREADS_X, (normals_in.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
case 1: mls_aggr_horiz_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 2: mls_aggr_horiz_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 3: mls_aggr_horiz_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 5: mls_aggr_horiz_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 10: mls_aggr_horiz_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 15: mls_aggr_horiz_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
case 20: mls_aggr_horiz_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, depth_in.devicePtr(), depth_in.pixelPitch(), centroid_out, colour_in.devicePtr(), colour_in.pixelPitch(), smoothing, colour_smoothing, camera); break;
default: return;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
void ftl::cuda::mls_aggr_vert(
ftl::cuda::TextureObject<uchar4> ®ion,
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float4> ¢roid_in,
ftl::cuda::TextureObject<float4> ¢roid_out,
float smoothing,
float colour_smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 4;
static constexpr int THREADS_Y = 32;
const dim3 gridSize((normals_out.width() + THREADS_X - 1)/THREADS_X, (normals_out.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
case 1: mls_aggr_vert_kernel<1><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 2: mls_aggr_vert_kernel<2><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 3: mls_aggr_vert_kernel<3><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 5: mls_aggr_vert_kernel<5><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 10: mls_aggr_vert_kernel<10><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 15: mls_aggr_vert_kernel<15><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
case 20: mls_aggr_vert_kernel<20><<<gridSize, blockSize, 0, stream>>>(region.devicePtr(), region.pixelPitch(), normals_in.devicePtr(), normals_in.pixelPitch(), normals_out, centroid_in.devicePtr(), centroid_in.pixelPitch(), centroid_out, smoothing, colour_smoothing, camera); break;
default: return;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
void ftl::cuda::mls_adjust_depth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<float4> ¢roid_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<float> &depth_in,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
static constexpr int THREADS_X = 32;
static constexpr int THREADS_Y = 4;
const dim3 gridSize((depth_out.width() + THREADS_X - 1)/THREADS_X, (depth_out.height() + THREADS_Y - 1)/THREADS_Y);
const dim3 blockSize(THREADS_X, THREADS_Y);
mls_adjust_depth_kernel<<<gridSize, blockSize, 0, stream>>>(normals_in, centroid_in, depth_out, depth_in, camera);
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
// ===== Adaptive MLS Smooth =====================================================
/*
* Smooth depth map using Moving Least Squares. This version uses colour
* similarity weights to adjust the spatial smoothing factor. It is naive in
* that similar colours may exist on both sides of an edge and colours at the
* level of single pixels can be subject to noise. Colour noise should first
* be removed from the image.
*/
template <int SEARCH_RADIUS>
__global__ void adaptive_mls_smooth_kernel(
TextureObject<half4> normals_in,
TextureObject<half4> normals_out,
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
TextureObject<float> depth_in, // Virtual depth map
TextureObject<float> depth_out, // Accumulated output
TextureObject<float> smoothing,
ftl::rgbd::Camera camera) {
const int x = blockIdx.x*blockDim.x + threadIdx.x;
const int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;
float3 aX = make_float3(0.0f,0.0f,0.0f);
float3 nX = make_float3(0.0f,0.0f,0.0f);
float contrib = 0.0f;
float d0 = depth_in.tex2D(x, y);
depth_out(x,y) = d0;
normals_out(x,y) = normals_in(x,y);
if (d0 < camera.minDepth || d0 > camera.maxDepth) return;
float3 X = camera.screenToCam((int)(x),(int)(y),d0);
//uchar4 c0 = colour_in.tex2D(x, y);
float smooth = smoothing(x,y);
// Neighbourhood
for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
const float d = depth_in.tex2D(x+u, y+v);
if (d < camera.minDepth || d > camera.maxDepth) continue;
// Point and normal of neighbour
const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d);
const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v));
if (Ni.x+Ni.y+Ni.z == 0.0f) continue;
// Gauss approx weighting function using point distance
const float w = ftl::cuda::spatialWeighting(X,Xi,smooth*0.5f);
aX += Xi*w;
nX += Ni*w;
contrib += w;
}
}
if (contrib <= 0.0f) return;
nX /= contrib; // Weighted average normal
aX /= contrib; // Weighted average point (centroid)
// Signed-Distance Field function
float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);
// Calculate new point using SDF function to adjust depth (and position)
X = X - nX * fX;
//uint2 screen = camera.camToScreen<uint2>(X);
//if (screen.x < depth_out.width() && screen.y < depth_out.height()) {
// depth_out(screen.x,screen.y) = X.z;
//}
depth_out(x,y) = X.z;
normals_out(x,y) = make_half4(nX / length(nX), 0.0f);
}
void ftl::cuda::adaptive_mls_smooth(
ftl::cuda::TextureObject<half4> &normals_in,
ftl::cuda::TextureObject<half4> &normals_out,
ftl::cuda::TextureObject<float> &depth_in,
ftl::cuda::TextureObject<float> &depth_out,
ftl::cuda::TextureObject<float> &smoothing,
int radius,
const ftl::rgbd::Camera &camera,
cudaStream_t stream) {
const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
switch (radius) {
case 5: adaptive_mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 4: adaptive_mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 3: adaptive_mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 2: adaptive_mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
case 1: adaptive_mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break;
}
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}