diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt
index ae06aab6d142ba8a2ad3a1e0970905e3a70468e9..1e55c671c87487b995b0dd772495ff8a612a8c78 100644
--- a/applications/reconstruct/CMakeLists.txt
+++ b/applications/reconstruct/CMakeLists.txt
@@ -10,7 +10,7 @@ set(REPSRC
 	src/garbage.cu
 	src/integrators.cu
 	#src/ray_cast_sdf.cu
-	src/splat_render.cu
+	src/voxel_render.cu
 	src/camera_util.cu
 	src/voxel_hash.cu
 	src/voxel_hash.cpp
@@ -19,6 +19,7 @@ set(REPSRC
 	#src/virtual_source.cpp
 	src/splat_render.cpp
 	src/dibr.cu
+	src/mls.cu
 	src/depth_camera.cu
 	src/depth_camera.cpp
 )
diff --git a/applications/reconstruct/include/ftl/cuda_operators.hpp b/applications/reconstruct/include/ftl/cuda_operators.hpp
index b7a3b44426bf6fcf490af67f38461ca849ade428..ae85cfbd785fb72f79225ae28278bf4862b680db 100644
--- a/applications/reconstruct/include/ftl/cuda_operators.hpp
+++ b/applications/reconstruct/include/ftl/cuda_operators.hpp
@@ -406,6 +406,12 @@ inline __host__ __device__ float length(float3 v)
 {
 	return sqrtf(dot(v, v));
 }
 
+// length squared
+inline __host__ __device__ float length2(const float3 &v)
+{
+	return dot(v, v);
+}
+
 // normalize
 inline __host__ __device__ float3 normalize(float3 v)
 {
diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu
index 9a36ea452225c1a174d0ba6ced0baf98471688eb..7a322eaf733230772e72c50722e849335bf7e891 100644
--- a/applications/reconstruct/src/depth_camera.cu
+++ b/applications/reconstruct/src/depth_camera.cu
@@ -47,6 +47,21 @@ void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStream_t stream) {
 	clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_to_zero_kernel(ftl::cuda::TextureObject<float> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = 0.0f; //PINF;
+	}
+}
+
+void ftl::cuda::clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_to_zero_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 __global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -79,6 +94,21 @@ void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream) {
 	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
 }
 
+__global__ void clear_colour_kernel(ftl::cuda::TextureObject<float4> depth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	if (x < depth.width() && y < depth.height()) {
+		depth(x,y) = make_float4(0.0f);
+	}
+}
+
+void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) {
+	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth);
+}
+
 // ===== Type convert =====
 
 template <typename A, typename B>
@@ -103,331 +133,6 @@ void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, float scale, cudaStream_t stream) {
 	convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale);
 }
 
-/// ===== MLS Smooth
-
-// TODO:(Nick) Put this in a common location (used in integrators.cu)
-extern __device__ float spatialWeighting(float r);
-extern __device__ float spatialWeighting(float r, float h);
-
-/*
- * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
- * Real-time foreground-background segmentation using codebook model.
- * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
- */
- __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
-	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
-	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
-	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
-	const float p_2 = xv_2 / v_2;
-	return sqrt(x_2 - p_2);
-}
-
-__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
-	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
-	return length(delta);
-}
-
-/*
- * Colour weighting as suggested in:
- * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
- * c = colour distance
- */
- __device__ float colourWeighting(float c) {
-	const float h = c_hashParams.m_colourSmoothing;
-	if (c >= h) return 0.0f;
-	float ch = c / h;
-	ch = 1.0f - ch*ch;
-	return ch*ch*ch*ch;
-}
-
-#define WINDOW_RADIUS 5
-
-__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > 0.0f) {
-						wpos += weight* (camera.pose * camPos);
-						weights += weight;
-					}
-				}
-			//}
-		}
-	}
-
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-
-				// TODO:(Nick) dot product of normals < 0 means the point
-				// should be ignored with a weight of 0 since it is facing the wrong direction
-				// May be good to simply weight using the dot product to give
-				// a stronger weight to those whose normals are closer
-
-				float weight = spatialWeighting(length(pf - camPos), h);
-
-				if (weight > 0.0f) {
-					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
-					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
-
-						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-
-						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
-							pos += weight*camPos; // (camera.pose * camPos);
-							weights += weight;
-						}
-					}
-				}
-			//}
-		}
-	}
-
-	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
-
-	return weights;
-}
-
-__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//#pragma unroll
-	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
-		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
-			//if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
-				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
-				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
-				float weight = spatialWeighting(length(pf - camPos));
-
-				if (weight > 0.0f) {
-					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
-					weight *= colourWeighting(colordiffFloat2(c1,c2));
-
-					if (weight > weights) {
-						pos = weight* (camera.pose * camPos);
-						weights = weight;
-					}
-				}
-			//}
-		}
-	}
-
-	wpos += pos;
-	//wpos += (camera.pose * pos);
-
-	return weights;
-}
-
-__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
-	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
-
-	const float3 pf = camera.poseInverse * mPos;
-	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
-	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
-	float weights = 0.0f;
-
-	//float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y);
-	const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y));
-	if (worldPos.z == MINF) return 0.0f;
-
-	float weight = spatialWeighting(length(mPos - worldPos));
-
-	if (weight > 0.0f) {
-		wpos += weight* (worldPos);
-		weights += weight;
-	}
-
-	return weights;
-}
-
-__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = output.width();
-	const int height = output.height();
-
-	const DepthCameraCUDA &mainCamera = c_cameras[cam];
-
-	if (x < width && y < height) {
-
-		const float depth = tex2D<float>(mainCamera.depth, x, y);
-		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
-		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
-		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
-
-		float3 wpos = make_float3(0.0f);
-		float3 wnorm = make_float3(0.0f);
-		float weights = 0.0f;
-
-		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
-			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
-
-			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
-					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
-					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
-
-				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
-					for (uint cam2=0; cam2<numcams; ++cam2) {
-						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
-						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
-
-						// Previous approach
-						//if (cam2 == cam) continue;
-						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
-					}
-					wpos /= weights;
-				} else {
-					weights = 1000.0f;
-					wpos = mPos;
-				}
-
-				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-
-				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
-
-				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
-				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
-				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
-				//}
-			}
-		}
-	}
-}
-
-void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
-	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
-#define RESAMPLE_RADIUS 7
-
-__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-
-	const int width = depthin.width();
-	const int height = depthin.height();
-
-	if (x < width && y < height) {
-
-		//const int depth = depthin.tex2D((int)x, (int)y);
-		//if (depth != 0x7FFFFFFF) {
-		//	depthout(x,y) = (float)depth / 1000.0f;
-		//	return;
-		//}
-
-		struct map_t {
-			int d;
-			int quad;
-		};
-
-		map_t mappings[5];
-		int mapidx = 0;
-
-		for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) {
-			for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) {
-
-				const int depth = depthin.tex2D((int)x+u, (int)y+v);
-				const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v);
-
-				if (depth != 0x7FFFFFFF) {
-					int i=0;
-					for (i=0; i<mapidx; ++i) {
-						if (abs(mappings[i].d - depth) < 100) {
-							if (u < 0 && v < 0) mappings[i].quad |= 0x1;
-							if (u > 0 && v < 0) mappings[i].quad |= 0x2;
-							if (u > 0 && v > 0) mappings[i].quad |= 0x4;
-							if (u < 0 && v > 0) mappings[i].quad |= 0x8;
-							break;
-						}
-					}
-					if (i == mapidx && i < 5) {
-						mappings[mapidx].d = depth;
-						mappings[mapidx].quad = 0;
-						if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1;
-						if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2;
-						if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4;
-						if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8;
-						++mapidx;
-					} else {
-						//printf("EXCEEDED\n");
-					}
-				}
-			}
-		}
-
-		int bestdepth = 1000000;
-		//int count = 0;
-		for (int i=0; i<mapidx; ++i) {
-			if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d;
-			//if (mappings[i].quad == 15) count ++;
-		}
-
-		//depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f;
-
-		if (bestdepth < 1000000) {
-			depthout(x,y) = (float)bestdepth / 1000.0f;
-		}
-	}
-}
-
-void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams &params, cudaStream_t stream) {
-	const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params);
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
-
 /// ===== Median Filter ======
 
 #define WINDOW_SIZE 3
diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp
index 552c2e59d44206c089fee68124ca814d3e752ad6..26bfcad73f5ae72f20cfe2dd29d0e91c62fca62b 100644
--- a/applications/reconstruct/src/depth_camera_cuda.hpp
+++ b/applications/reconstruct/src/depth_camera_cuda.hpp
@@ -10,8 +10,10 @@ namespace cuda {
 
 void clear_depth(const TextureObject<float> &depth, cudaStream_t stream);
 void clear_depth(const TextureObject<int> &depth, cudaStream_t stream);
+void clear_to_zero(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream);
 void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream);
+void clear_colour(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream);
 
 void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream);
 
@@ -21,7 +23,7 @@ void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureObject<int> &out, float scale, cudaStream_t stream);
 
 void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream);
 
-void mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+void mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const ftl::render::SplatParams &params, int numcams, cudaStream_t stream);
 
 void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams &params, cudaStream_t stream);
diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu
index 9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f..acb86216a7eecd6e10f3ebf3a3b999964b56cf09 100644
--- a/applications/reconstruct/src/dibr.cu
+++ b/applications/reconstruct/src/dibr.cu
@@ -1,12 +1,20 @@
 #include "splat_render_cuda.hpp"
+#include "depth_camera_cuda.hpp"
 #include <cuda_runtime.h>
 
 #include <ftl/cuda_matrix_util.hpp>
 
 #include "splat_params.hpp"
+#include "mls_cuda.hpp"
 #include <ftl/depth_camera.hpp>
 
 #define T_PER_BLOCK 8
+#define UPSAMPLE_FACTOR 1.8f
+#define WARP_SIZE 32
+#define DEPTH_THRESHOLD 0.05f
+#define UPSAMPLE_MAX 60
+#define MAX_ITERATIONS 10
+#define SPATIAL_SMOOTHING 0.01f
 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
@@ -23,55 +31,343 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) {
 	}
 }
 
+__device__ inline bool isStable(const float3 &previous, const float3 &estimate, const SplatParams &params, float d) {
+	const float psize = 2.0f * d / params.camera.fx;
+	//printf("PSIZE %f\n", psize);
+	return fabs(previous.x - estimate.x) <= psize &&
+		fabs(previous.y - estimate.y) <= psize &&
+		fabs(previous.z - estimate.z) <= psize;
+}
 
-__global__ void dibr_depthmap_kernel(
-    TextureObject<int> depth, int numcams, SplatParams params) {
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering so should
+ * at least be plane or sphere fitted if not MLS smoothed onto the actual surface.
+ */
+__global__ void dibr_visibility_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+	const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+	// Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((5.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+			screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+
+	// TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+		// Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
+		float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use the current point's z as an estimate
+		//float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,camPos.z);
+
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+		float ld = nearest.z;
+
+		// TODO: (Nick) Estimate depth using the point's plane, but needs better normals.
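+		// A reading of the refinement loop below (not a separate algorithm):
+		// each iteration projects the current estimate onto the MLS surface of
+		// the source points, then moves the depth by half the residual along
+		// the view ray,
+		//   d' = d + 0.5 * |mls(x) - x| * sign(mls(x).z - x.z),
+		// and accepts the pixel once the estimate lands within the acceptance
+		// radius (the 2.0f * fx / z test below, presumably a pixel-footprint
+		// style tolerance), writing the depth with atomicMin.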
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+		// Plane based estimate of surface at this pixel
+		//const float3 nearest = rayOrigin + rayDir * camPos.z;
+		float3 output;
+
+		// Use MLS of camera neighbor points to get more exact estimate
+		// Iterate until pixel is stable on the surface.
+		for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+			// TODO:(Nick) Should perhaps use points from all cameras?
+			// Instead of doing each camera separately...
+			// If the depth is already close then it has already been done and this point can be skipped
+			if (ftl::cuda::mls_point_surface<1>(camera.points, make_int2(x,y), params.m_viewMatrixInverse * nearest, output, SPATIAL_SMOOTHING) <= 0.0f) {
+				/*const unsigned int cx = screenPos.x;
+				const unsigned int cy = screenPos.y;
+				if (cx < depth.width() && cy < depth.height()) {
+					atomicMax(&depth(cx,cy), 10000.0f);
+				}*/
+				break;
+			}
+
+			//ftl::cuda::render_depth(depth, params, output);
+
+			output = params.m_viewMatrix * output;
+
+			// This is essentially the SDF function f(x), only the normal should be estimated also from the weights
+			//const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+			const float d = nearest.z + copysignf(0.5f*length(output - nearest), output.z - nearest.z);
+			nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d);
+
+			const float2 sp = params.camera.cameraToKinectScreenFloat(output);
+
+			//if (isStable(nearest, output, params, d)) {
+			//if (fabs(sp.x - float(screenPos.x+u)) < 2.0f && fabs(sp.y - float(screenPos.y+v)) < 2.0f) {
+			if (length(output - nearest) <= 2.0f * params.camera.fx / camPos.z) {
+				const unsigned int cx = screenPos.x+u;
+				const unsigned int cy = screenPos.y+v;
+
+				if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+					// Transform estimated point to virtual cam space and output z
+					atomicMin(&depth(cx,cy), d * 1000.0f);
+				}
+				break;
+			}
+
+			/*if (k >= MAX_ITERATIONS-1 && length(output - nearest) <= SPATIAL_SMOOTHING) {
+				const unsigned int cx = screenPos.x+u;
+				const unsigned int cy = screenPos.y+v;
+				if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+					//atomicMin(&depth(cx,cy), d * 1000.0f);
+					printf("ERR = %f, %f\n", fabs(sp.x - float(screenPos.x+u)), fabs(sp.y - float(screenPos.y+v)));
+				}
+			}*/
+
+			//nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d); // ld + (d - ld)*0.8f
+			ld = d;
+		}
+		//}
+	}
+}
+
+// ------ Alternative for pass 1: principal surfaces ---------------------------
+
+#define NEIGHBOR_RADIUS 1
+#define MAX_NEIGHBORS ((NEIGHBOR_RADIUS*2+1)*(NEIGHBOR_RADIUS*2+1))
+
+/*
+ * Pass 1: Determine depth buffer with enough accuracy for a visibility test in pass 2.
+ * These values are also used as the actual surface estimate during rendering so should
+ * at least be plane or sphere fitted if not MLS smoothed onto the actual surface.
+ */
+ __global__ void dibr_visibility_principal_kernel(TextureObject<int> depth, int cam, SplatParams params) {
+	__shared__ float3 neighborhood_cache[2*T_PER_BLOCK][MAX_NEIGHBORS];
+	__shared__ int minimum[2*T_PER_BLOCK];
+	__shared__ int maximum[2*T_PER_BLOCK];
+
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int warp = threadIdx.x / WARP_SIZE + threadIdx.y*2;
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+	const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
+
+	// Get virtual camera ray for splat centre and backface cull if possible
+	//const float3 rayOrigin = params.m_viewMatrixInverse * make_float3(0.0f,0.0f,0.0f);
+	//const float3 rayDir = normalize(params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x,y,1.0f) - rayOrigin);
+	//if (dot(rayDir, normal) > 0.0f) return;
+
+	// Find the virtual screen position of current point
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+
+	const int upsample = min(UPSAMPLE_MAX, int((4.0f*r) * params.camera.fx / camPos.z));
+
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+			screenPos.x - upsample >= depth.width() || screenPos.y - upsample >= depth.height()) return;
+
+	// TODO:(Nick) Check depth buffer and don't do anything if already hidden?
 
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-		//if (d >= params.camera.m_sensorDepthWorldMax) continue;
-
-		//const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
+	// TODO:(Nick) Preload point neighbors and transform to eye
+	const int lane = threadIdx.x % WARP_SIZE;
+	if (lane == 0) {
+		minimum[warp] = 100000000;
+		maximum[warp] = -100000000;
+	}
+
+	__syncwarp();
+
+	for (int i=lane; i<MAX_NEIGHBORS; i+=WARP_SIZE) {
+		const int u = (i % (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const int v = (i / (2*NEIGHBOR_RADIUS+1)) - NEIGHBOR_RADIUS;
+		const float3 point = params.m_viewMatrix * make_float3(tex2D<float4>(camera.points, x+u, y+v));
+		neighborhood_cache[warp][i] = point;
+
+		if (length(point - camPos) <= 0.04f) {
+			atomicMin(&minimum[warp], point.z*1000.0f);
+			atomicMax(&maximum[warp], point.z*1000.0f);
+		}
+	}
+
+	__syncwarp();
+
+	const float interval = (float(maximum[warp])/1000.0f - float(minimum[warp]) / 1000.0f) / float(MAX_ITERATIONS);
+	//if (y == 200) printf("interval: %f\n", interval);
+
+	// TODO:(Nick) Find min and max depths of neighbors to estimate z bounds
+
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+		// Make an initial estimate of the point's location
+		// Use centroid depth as estimate...?
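+		// A note on the search below: the [minimum, maximum] depth range of
+		// the cached neighbourhood (points within 0.04 m of this splat) is
+		// swept in MAX_ITERATIONS steps of `interval`, and the first depth
+		// whose MLS energy reaches the 0.1 threshold is written via atomicMin,
+		// i.e. the nearest principal surface along the ray. Both the 0.04 m
+		// radius and the 0.1 threshold are constants local to this kernel,
+		// not configuration parameters.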
+		//float3 nearest = ftl::cuda::screen_centroid<1>(camera.points, make_float2(screenPos.x+u, screenPos.y+v), make_int2(x,y), params, upsample);
+
+		// Use the current point's z as an estimate
+		// TODO: Use min point as estimate
+		float3 nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,float(minimum[warp])/1000.0f);
+
+		// Or calculate upper and lower bounds for depth and do gradient
+		// descent until the gradient change is too small or max iter is reached
+		// and depth remains within the bounds.
+		// How to find min and max depths?
+
+		// TODO: (Nick) Estimate depth using the point's plane, but needs better normals.
+		//float t;
+		//if (ftl::cuda::intersectPlane(normal, worldPos, rayOrigin, rayDir, t)) {
+		// Plane based estimate of surface at this pixel
+		//const float3 nearest = rayOrigin + rayDir * camPos.z;
+
+		// Use MLS of camera neighbor points to get more exact estimate
+		// Iterate until pixel is stable on the surface.
+		for (int k=0; k<MAX_ITERATIONS; ++k) {
+
+			// TODO:(Nick) Should perhaps use points from all cameras?
+			// Instead of doing each camera separately...
+			// If the depth is already close then it has already been done and this point can be skipped
+			const float energy = ftl::cuda::mls_point_energy<MAX_NEIGHBORS>(neighborhood_cache[warp], nearest, SPATIAL_SMOOTHING);
+
+			if (energy <= 0.0f) break;
+
+			//ftl::cuda::render_depth(depth, params, output);
+
+			// This is essentially the SDF function f(x), only the normal should be estimated also from the weights
+			//const float d = nearest.z + (normal.x*output.x + normal.y*output.y + normal.z*output.z);
+
+			const float d = nearest.z;
+			nearest = params.camera.kinectDepthToSkeleton(screenPos.x+u,screenPos.y+v,d+interval);
+
+			if (energy >= 0.1f) {
+				const unsigned int cx = screenPos.x+u;
+				const unsigned int cy = screenPos.y+v;
+				if (d > params.camera.m_sensorDepthWorldMin && d < params.camera.m_sensorDepthWorldMax && cx < depth.width() && cy < depth.height()) {
+					// Transform estimated point to virtual cam space and output z
+					atomicMin(&depth(cx,cy), d * 1000.0f);
+				}
+				break;
+			}
+		}
+		//}
+	}
+}
+
+__device__ inline float4 make_float4(const uchar4 &c) {
+	return make_float4(c.x,c.y,c.z,c.w);
+}
+
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+__global__ void dibr_attribute_contrib_kernel(
+	TextureObject<int> depth_in,
+	TextureObject<float4> colour_out,
+	TextureObject<float4> normal_out,
+	TextureObject<float> contrib_out, int cam,
+	SplatParams params) {
 
-		const float3 worldPos = make_float3(d);
-		const float3 camPos = params.m_viewMatrix * worldPos;
-		const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
+	const int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+	const float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+	//const float3 normal = make_float3(tex2D<float4>(camera.normal, x, y));
+	if (worldPos.x == MINF) return;
+	const float r = (camera.poseInverse * worldPos).z / camera.params.fx;
 
-		const unsigned int cx = screenPos.x;
-		const unsigned int cy = screenPos.y;
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) return;
+	if (camPos.z > params.camera.m_sensorDepthWorldMax) return;
+	const uint2 screenPos = params.camera.cameraToKinectScreen(camPos);
+	const int upsample = min(UPSAMPLE_MAX, int((10.0f*r) * params.camera.fx / camPos.z));
 
-		if (cx < depth.width() && cy < depth.height()) {
-			//float camd = depth_in.tex2D((int)cx,(int)cy);
-			//atomicMin(&depth(x,y), idepth);
-			//float camdiff = fabs(camPos.z-camd);
-			//if (camdiff < 0.1f) {
-				//colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-			//} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
+	// Not on screen so stop now...
+	if (screenPos.x + upsample < 0 || screenPos.y + upsample < 0 ||
+			screenPos.x - upsample >= depth_in.width() || screenPos.y - upsample >= depth_in.height()) return;
 
-			atomicMin(&depth(cx,cy), camPos.z * 1000.0f);
+	// Is this point near the actual surface and therefore a contributor?
+	const float d = ((float)depth_in.tex2D((int)screenPos.x, (int)screenPos.y)/1000.0f);
+	if (abs(d - camPos.z) > DEPTH_THRESHOLD) return;
+
+	// TODO:(Nick) Should just one thread load these to shared mem?
+	const float4 colour = make_float4(tex2D<uchar4>(camera.colour, x, y));
+	const float4 normal = tex2D<float4>(camera.normal, x, y);
+
+	// Each thread in warp takes an upsample point and updates corresponding depth buffer.
+	const int lane = threadIdx.x % WARP_SIZE;
+	for (int i=lane; i<upsample*upsample; i+=WARP_SIZE) {
+		const float u = (i % upsample) - (upsample / 2);
+		const float v = (i / upsample) - (upsample / 2);
+
+		// Use the depth buffer to determine this pixel's 3D position in camera space
+		const float d = ((float)depth_in.tex2D(screenPos.x+u, screenPos.y+v)/1000.0f);
+		const float3 nearest = params.camera.kinectDepthToSkeleton((int)(screenPos.x+u),(int)(screenPos.y+v),d);
+
+		// What is the contribution of our current point at this pixel?
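+		// Attribute accumulation is a weighted splat: each contributing point
+		// adds weight * attribute into the float4 buffers and the raw weight
+		// into contrib_out, so after all cameras have run a pixel holds
+		// sum(w_i * a_i) and sum(w_i); dibr_normalise_kernel (pass 3) divides
+		// one by the other.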
+		const float weight = ftl::cuda::spatialWeighting(length(nearest - camPos), SPATIAL_SMOOTHING);
+		if (screenPos.x+u < colour_out.width() && screenPos.y+v < colour_out.height() && weight > 0.0f) {  // TODO: Use confidence threshold here
+			const float4 wcolour = colour * weight;
+			const float4 wnormal = normal * weight;
+
+			// Add this point's contribution to the pixel buffer
+			atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v), wcolour.x);
+			atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+1, wcolour.y);
+			atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+2, wcolour.z);
+			atomicAdd((float*)&colour_out(screenPos.x+u, screenPos.y+v)+3, wcolour.w);
+			atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v), wnormal.x);
+			atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+1, wnormal.y);
+			atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+2, wnormal.z);
+			atomicAdd((float*)&normal_out(screenPos.x+u, screenPos.y+v)+3, wnormal.w);
+			atomicAdd(&contrib_out(screenPos.x+u, screenPos.y+v), weight);
 		}
-	}
+	}
 }
-
-__global__ void dibr_kernel(
+/*
+ * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
+ */
+/*__global__ void dibr_attribute_contrib_kernel(
 	TextureObject<int> depth_in,
-	TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
+	TextureObject<uchar4> colour_out,
+	TextureObject<float4> normal_out, int numcams, SplatParams params) {
 
 	const int i = threadIdx.y*blockDim.y + threadIdx.x;
 	const int bx = blockIdx.x*blockDim.x;
@@ -80,119 +376,89 @@ __global__ void dibr_kernel(
 	const int y = by + threadIdx.y;
 
 	for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
+		const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[j];
 
-		float4 d = tex2D<float4>(camera.points, x, y);
-		if (d.z < 0.0f) continue;
-		//if (d >= params.camera.m_sensorDepthWorldMax) continue;
+		float3 worldPos = make_float3(tex2D<float4>(camera.points, x, y));
+		float r = (camera.poseInverse * worldPos).z;
+		//if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), worldPos, 0.02f) < 0.001f) continue;
+		if (worldPos.x == MINF) continue;
 
-		//const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d);
-
-		const float3 worldPos = make_float3(d);
 		const float3 camPos = params.m_viewMatrix * worldPos;
-		const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
-
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
 
-		const unsigned int cx = screenPos.x;
-		const unsigned int cy = screenPos.y;
+		// Estimate upsample factor using ratio of source depth and output depth
+		const int upsample = min(15, (int)(UPSAMPLE_FACTOR * (r / camPos.z))+1);
+		const float upfactor = 2.0f / (float)(upsample);
 
-		if (cx < colour_out.width() && cy < colour_out.height()) {
-			//float camd = depth_in.tex2D((int)cx,(int)cy);
-			//atomicMin(&depth(x,y), idepth);
-			//float camdiff = fabs(camPos.z-camd);
-			//if (camdiff < 0.1f) {
-
-			if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) {
-				colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y);
-				//colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255);
+		for (int v=0; v<upsample; ++v) {
+			for (int u=0; u<upsample; ++u) {
+				float3 point;
+				const ftl::cuda::fragment nearest = ftl::cuda::upsampled_point(camera.points, camera.normal, camera.colour,
+					make_float2((float)x-1.0f+u*upfactor,(float)y-1.0f+v*upfactor));
+				//if (ftl::cuda::mls_point_surface<3>(camera.points, make_int2(x,y), nearest, point, 0.02f) < 0.001f) continue;
+				ftl::cuda::render_fragment(depth_in, normal_out, colour_out, params, nearest);
 			}
-
-
-			//} else {
-				//colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
 		}
 	}
-}
-
-__device__ inline float4 make_float4(const uchar4 &c) {
-	return make_float4(c.x,c.y,c.z,c.w);
-}
-
-__global__ void dibr_kernel_rev(
-	TextureObject<float> depth_in,
-	TextureObject<uchar4> colour_out, int numcams, SplatParams params) {
-
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	float camd = depth_in.tex2D((int)x,(int)y);
-	if (camd < 0.01f) return;
-	if (camd >= params.camera.m_sensorDepthWorldMax) return;
-
-	const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd);
-	float mindiff = 1000.0f;
-	float4 col = make_float4(0.0f,0.0f,0.0f,0.0f);
-	int count = 0;
+}*/
 
-	for (int j=0; j<numcams; ++j) {
-		const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j];
-
-		const float3 camPos = camera.poseInverse * worldPos;
-		const float2 screenPosf = camera.params.cameraToKinectScreenFloat(camPos);
-		const uint2 screenPos = make_uint2(make_int2(screenPosf));
-
-		if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
-		const unsigned int cx = screenPos.x;
-		const unsigned int cy = screenPos.y;
-		if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) {
-			float d = tex2D<float>(camera.depth, (int)cx, (int)cy);
-			float camdiff = fabs(camPos.z-d);
-
-			if (camdiff < mindiff) {
-				mindiff = camdiff;
-				col += make_float4(tex2D<uchar4>(camera.colour,cx,cy));
-				++count;
-			}
+__global__ void dibr_normalise_kernel(
+		TextureObject<float4> colour_in,
+		TextureObject<uchar4> colour_out,
+		TextureObject<float4> normals,
+		TextureObject<float> contribs) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
 
-			//if (camdiff < 0.1f) {
-			//	colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy);
-			//} else {
-				//colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255);
-			//}
-		}
-	}
+	if (x < colour_in.width() && y < colour_in.height()) {
+		const float4 colour = colour_in.tex2D((int)x,(int)y);
+		const float4 normal = normals.tex2D((int)x,(int)y);
+		const float contrib = contribs.tex2D((int)x,(int)y);
 
-	if (count > 0) {
-		col = col / (float)count;
-		colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255);
-	} else {
-		colour_out(x,y) = make_uchar4(76,76,76,255);
-	}
+		if (contrib > 0.0f) {
+			colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
+			normals(x,y) = normal / contrib;
+		}
+	}
 }
 
 void ftl::cuda::dibr(const TextureObject<int> &depth_out,
-		const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
-
+		const TextureObject<uchar4> &colour_out,
+		const TextureObject<float4> &normal_out,
+		const TextureObject<float> &confidence_out,
+		const TextureObject<float4> &tmp_colour,
+		int numcams,
+		const SplatParams &params,
+		cudaStream_t stream) {
+
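+	// Note: depth_out is not cleared here. The caller is assumed to have
+	// reset it to a large value (e.g. with ftl::cuda::clear_depth) before
+	// this call, since pass 1 merges depth samples with atomicMin. The
+	// colour, normal and confidence buffers are cleared below.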
+	const dim3 sgridSize((depth_out.width() + 2 - 1)/2, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 sblockSize(2*WARP_SIZE, T_PER_BLOCK);
 	const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
 
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
+	ftl::cuda::clear_to_zero(confidence_out, stream);
+	ftl::cuda::clear_colour(tmp_colour, stream);
+	ftl::cuda::clear_colour(normal_out, stream);
 
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-	dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
-	dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+	// Pass 1, merge a depth map from each camera.
+	for (int i=0; i<numcams; ++i)
+		dibr_visibility_principal_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, i, params);
+
+	// Pass 2, accumulate all point contributions to pixels
+	for (int i=0; i<numcams; ++i)
+		dibr_attribute_contrib_kernel<<<sgridSize, sblockSize, 0, stream>>>(depth_out, tmp_colour, normal_out, confidence_out, i, params);
+
+	// Pass 3, normalise contributions
+	dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(tmp_colour, colour_out, normal_out, confidence_out);
+
 	cudaSafeCall( cudaGetLastError() );
 
 #ifdef _DEBUG
@@ -200,22 +466,21 @@ void ftl::cuda::dibr(const TextureObject<int> &depth_out,
 #endif
 }
 
-void ftl::cuda::dibr(const TextureObject<float> &depth_out,
-		const TextureObject<uchar4> &colour_out, int numcams, const SplatParams &params, cudaStream_t stream) {
+void ftl::cuda::dibr_raw(const TextureObject<int> &depth_out,
+		int numcams, const SplatParams &params, cudaStream_t stream) {
 	const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
 	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out);
 
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 
-	dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params);
+	//dibr_depthmap_direct_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params);
 
 	cudaSafeCall( cudaGetLastError() );
 
 #ifdef _DEBUG
 	cudaSafeCall(cudaDeviceSynchronize());
 #endif
 }
+
diff --git a/applications/reconstruct/src/mls.cu b/applications/reconstruct/src/mls.cu
new file mode 100644
index 0000000000000000000000000000000000000000..71201a017cf1ebab6e06257e9425403df50427d3
--- /dev/null
+++ b/applications/reconstruct/src/mls.cu
@@ -0,0 +1,273 @@
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_util.hpp>
+#include <ftl/depth_camera.hpp>
+#include "depth_camera_cuda.hpp"
+
+#include "mls_cuda.hpp"
+
+#define T_PER_BLOCK 16
+#define MINF __int_as_float(0xff800000)
+
+using ftl::voxhash::DepthCameraCUDA;
+using ftl::voxhash::HashData;
+using ftl::voxhash::HashParams;
+using ftl::cuda::TextureObject;
+using ftl::render::SplatParams;
+
+extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS];
+extern __constant__ HashParams c_hashParams;
+
+/// ===== MLS Smooth
+
+/*
+ * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005).
+ * Real-time foreground-background segmentation using codebook model.
+ * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004
+ */
+ __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) {
+	const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z;
+	const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z;
+	const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2);
+	const float p_2 = xv_2 / v_2;
+	return sqrt(x_2 - p_2);
+}
+
+__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) {
+	float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z);
+	return length(delta);
+}
+
+/*
+ * Colour weighting as suggested in:
+ * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014.
+ * c = colour distance
+ */
+ __device__ float colourWeighting(float c) {
+	const float h = c_hashParams.m_colourSmoothing;
+	if (c >= h) return 0.0f;
+	float ch = c / h;
+	ch = 1.0f - ch*ch;
+	return ch*ch*ch*ch;
+}
+
+#define WINDOW_RADIUS 5
+
+__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), c_hashParams.m_spatialSmoothing);
+
+				if (weight > 0.0f) {
+					uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+					weight *= colourWeighting(colordiffFloat2(c1,c2));
+
+					if (weight > 0.0f) {
+						wpos += weight* (camera.pose * camPos);
+						weights += weight;
+					}
+				}
+			//}
+		}
+	}
+
+	//wpos += (camera.pose * pos);
+
+	return weights;
+}
+
+__device__ float mlsCameraNoColour(int cam, const float3 &mPos, uchar4 c1, const float4 &norm, float3 &wpos, float h) {
+	const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];
+
+	const float3 pf = camera.poseInverse * mPos;
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf));
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) {
+		for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) {
+			//if (screenPos.x+u < width && screenPos.y+v < height) { //on screen
+				float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v);
+				const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth);
+
+				// TODO:(Nick) dot product of normals < 0 means the point
+				// should be ignored with a weight of 0 since it is facing the wrong direction
+				// May be good to simply weight using the dot product to give
+				// a stronger weight to those whose normals are closer
+
+				float weight = ftl::cuda::spatialWeighting(length(pf - camPos), h);
+
+				if (weight > 0.0f) {
+					float4 n2 = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v);
+					if (dot(make_float3(norm), make_float3(n2)) > 0.0f) {
+
+						uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v);
+
+						if (colourWeighting(colordiffFloat2(c1,c2)) > 0.0f) {
+							pos += weight*camPos; // (camera.pose * camPos);
+							weights += weight;
+						}
+					}
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) wpos += (camera.pose * (pos / weights)) * weights;
+
+	return weights;
+}
+
+
+__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	const DepthCameraCUDA &mainCamera = c_cameras[cam];
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				}
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				//const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				//if (screenPos.x < output.width() && screenPos.y < output.height()) {
+				//	output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				//}
+			}
+		}
+	}
+}
+
+void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
+
+
+// ===== Render Depth using MLS ================================================
+
+#define MAX_UPSAMPLE 5
+#define SAMPLE_BUFFER ((2*MAX_UPSAMPLE+1)*(2*MAX_UPSAMPLE+1))
+#define WARP_SIZE 32
+#define BLOCK_WIDTH 4
+#define MLS_RADIUS 5
+#define MLS_WIDTH (2*MLS_RADIUS+1)
+#define MLS_SAMPLES (MLS_WIDTH*MLS_WIDTH)
+
+__global__ void mls_render_depth_kernel(const TextureObject<int> input, TextureObject<int> output, SplatParams params, int numcams) {
+	/*const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+	const int width = output.width();
+	const int height = output.height();
+
+	if (x < width && y < height) {
+
+		const float depth = tex2D<float>(mainCamera.depth, x, y);
+		const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y);
+		const float4 norm = tex2D<float4>(mainCamera.normal, x, y);
+		//if (x == 400 && y == 200) printf("NORMX: %f\n", norm.x);
+
+		float3 wpos = make_float3(0.0f);
+		float3 wnorm = make_float3(0.0f);
+		float weights = 0.0f;
+
+		if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) {
+			float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth);
+
+			if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x &&
+					mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y &&
+					mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) {
+
+				if (hashParams.m_flags & ftl::voxhash::kFlagMLS) {
+					for (uint cam2=0; cam2<numcams; ++cam2) {
+						//if (cam2 == cam) weights += mlsCameraNoColour(cam2, mPos, c1, wpos, c_hashParams.m_spatialSmoothing*0.1f); //weights += 0.5*mlsCamera(cam2, mPos, c1, wpos);
+						weights += mlsCameraNoColour(cam2, mPos, c1, norm, wpos, c_hashParams.m_spatialSmoothing); //*((cam == cam2)? 0.1f : 5.0f));
+
+						// Previous approach
+						//if (cam2 == cam) continue;
+						//weights += mlsCameraBest(cam2, mPos, c1, wpos);
+					}
+					wpos /= weights;
+				} else {
+					weights = 1000.0f;
+					wpos = mPos;
+				}
+
+				//output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+
+				//if (weights >= hashParams.m_confidenceThresh) output(x,y) = make_float4(wpos, 0.0f);
+
+				const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos));
+				if (screenPos.x < output.width() && screenPos.y < output.height()) {
+					output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF);
+				}
+			}
+		}
+	}*/
+}
+
+
+void ftl::cuda::mls_render_depth(const TextureObject<int> &input, TextureObject<int> &output, const SplatParams &params, int numcams, cudaStream_t stream) {
+	const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+
+	mls_render_depth_kernel<<<gridSize, blockSize, 0, stream>>>(input, output, params, numcams);
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
+}
diff --git a/applications/reconstruct/src/mls_cuda.hpp b/applications/reconstruct/src/mls_cuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..420b3beb0a5603abbc1a34e5cfeba5b99912408d
--- /dev/null
+++ b/applications/reconstruct/src/mls_cuda.hpp
@@ -0,0 +1,377 @@
+#ifndef _FTL_MLS_CUDA_HPP_
+#define _FTL_MLS_CUDA_HPP_
+
+#include <ftl/cuda_util.hpp>
+#include <ftl/cuda_common.hpp>
+#include <ftl/cuda_matrix_util.hpp>
+#include "splat_params.hpp"
+
+__device__ inline float3 make_float3(const uchar4 &c) {
+	return make_float3((float)c.x,(float)c.y,(float)c.z);
+}
+
+__device__ inline uchar4 make_uchar4(const float3 &c) {
+	return make_uchar4(c.x,c.y,c.z,255);
+}
+
+namespace ftl {
+namespace cuda {
+
+/*
+ * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACM Transactions on Graphics Vol. 26, No. 3, Article No. 23, 2007.
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras
+ * r = distance between points
+ * h = smoothing parameter in meters (default 4cm)
+ */
+__device__ inline float spatialWeighting(float r, float h) {
+	if (r >= h) return 0.0f;
+	float rh = r / h;
+	rh = 1.0f - rh*rh;
+	return rh*rh*rh*rh;
+}
+
+__device__ float colourWeighting(float c);
+
+struct fragment {
+	float3 point;
+	float3 normal;
+	uchar4 colour;
+};
+
+__device__ inline float3 upsampled_point(cudaTextureObject_t pointset, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+
+	// R is the centroid of surrounding points.
+	R /= (D1+D2+D3+D4);
+
+	// FIXME: Should not use centroid but instead sample the surface at this point
+	// Use plane estimate at point to get "centroid" and then do the spatial weighted sample?
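+	// Note: D1..D4 above are 1 - euclidean distance to the four surrounding
+	// integer pixels, a cheap radial approximation of bilinear interpolation
+	// (exact bilinear would multiply per-axis fractions); the division by
+	// their sum above makes the weights sum to one.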
+	return R;
+}
+
+__device__ inline fragment upsampled_point(cudaTextureObject_t pointset,
+		cudaTextureObject_t normals, cudaTextureObject_t colours, const float2 &uv) {
+	float3 R = make_float3(0.0f, 0.0f, 0.0f);
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+	float3 C = make_float3(0.0f, 0.0f, 0.0f);
+
+	// TODO:(Nick) Don't upsample points if distance is too great
+
+	const float3 P1 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y)));
+	const float D1 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y)));
+	R += D1 * P1;
+	N += D1 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y)));
+	C += D1 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y)));
+
+	const float3 P2 = make_float3(tex2D<float4>(pointset, int(uv.x), int(uv.y+1.0f)));
+	const float D2 = 1.0f - length(uv - make_float2(int(uv.x), int(uv.y+1.0f)));
+	R += D2 * P2;
+	N += D2 * make_float3(tex2D<float4>(normals, int(uv.x), int(uv.y+1.0f)));
+	C += D2 * make_float3(tex2D<uchar4>(colours, int(uv.x), int(uv.y+1.0f)));
+
+	const float3 P3 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y)));
+	const float D3 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y)));
+	R += D3 * P3;
+	N += D3 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y)));
+	C += D3 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y)));
+
+	const float3 P4 = make_float3(tex2D<float4>(pointset, int(uv.x+1.0f), int(uv.y+1.0f)));
+	const float D4 = 1.0f - length(uv - make_float2(int(uv.x+1.0f), int(uv.y+1.0f)));
+	R += D4 * P4;
+	N += D4 * make_float3(tex2D<float4>(normals, int(uv.x+1.0f), int(uv.y+1.0f)));
+	C += D4 * make_float3(tex2D<uchar4>(colours, int(uv.x+1.0f), int(uv.y+1.0f)));
+
+	return {R / (D1+D2+D3+D4), N / (D1+D2+D3+D4), make_uchar4(C / (D1+D2+D3+D4))};
+}
+
+__device__ inline void render_depth(ftl::cuda::TextureObject<int> &depth, ftl::render::SplatParams &params, const float3 &worldPos) {
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth.width() && cy < depth.height()) {
+		atomicMin(&depth(cx,cy), d * 1000.0f);
+	}
+}
+
+__device__ inline void render_fragment(
+		ftl::cuda::TextureObject<int> &depth_in,
+		ftl::cuda::TextureObject<float4> &normal_out,
+		ftl::cuda::TextureObject<uchar4> &colour_out,
+		ftl::render::SplatParams &params, const fragment &frag) {
+	const float3 camPos = params.m_viewMatrix * frag.point;
+	const float d = camPos.z;
+	if (d < params.camera.m_sensorDepthWorldMin) return;
+
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf));
+
+	const unsigned int cx = screenPos.x;
+	const unsigned int cy = screenPos.y;
+
+	if (cx < depth_in.width() && cy < depth_in.height()) {
+		if (depth_in(cx,cy) == (int)(d * 1000.0f)) {
+			colour_out(cx,cy) = frag.colour;
+			normal_out(cx,cy) = make_float4(frag.normal, 0.0f);
+		}
+	}
+}
+
+/**
+ * Estimate the point set surface location near a given point.
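+ *
+ * Parameter note (from the call site in dibr.cu): `suv` is the sub-pixel
+ * screen position being estimated, `uv` is the integer source pixel whose
+ * R-radius neighbourhood is sampled, and `smoothing` is the support of the
+ * weight applied to the screen-space distance (the dibr.cu call passes the
+ * upsample size in pixels).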
+
+/**
+ * Compute a spatially weighted centroid, in camera space, of the points
+ * around a given screen position (weights fall off with screen-space distance).
+ */
+template <int R>
+__device__ float3 screen_centroid(
+		cudaTextureObject_t pointset,
+		const float2 &suv,
+		const int2 &uv,
+		const ftl::render::SplatParams &params,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = params.m_viewMatrix * make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(suv - params.camera.cameraToKinectScreenFloat(samplePoint)), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) pos = pos / weights;
+	return pos;
+}
+
+/**
+ * Estimate a point set surface point from an existing point in the set.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
+
+/**
+ * Estimate the point set surface location near to a given point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				pos += weight*samplePoint;
+				weights += weight;
+			//}
+		}
+	}
+
+	if (weights > 0.0f) estPoint = pos / weights;
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int R>
+__device__ float mls_point_energy(
+		cudaTextureObject_t pointset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+				weights += weight;
+			//}
+		}
+	}
+
+	return weights;
+}
+
+/**
+ * Calculate the point sample energy.
+ */
+template <int M>
+__device__ float mls_point_energy(
+		const float3 (&pointset)[M],
+		const float3 &nearPoint,
+		float smoothing) {
+
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int i=0; i<M; ++i) {
+		const float3 samplePoint = pointset[i];
+		const float weight = ftl::cuda::spatialWeighting(length(nearPoint - samplePoint), smoothing);
+		weights += weight;
+	}
+
+	return weights;
+}
+
+/**
+ * Estimate a point set surface location near an existing point and also return
+ * an estimate of the normal and colour of that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+	const float3 nearPoint = make_float3(tex2D<float4>(pointset, uv.x, uv.y));
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
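All of these overloads share one shape: accumulate `spatialWeighting()`-weighted sums over a (2R+1)x(2R+1) window and normalise by the total weight, which doubles as a confidence (energy) value. As a hypothetical usage sketch (not in the patch), a kernel applying one MLS projection step to a point image via the simplest overload:

```cuda
// Hypothetical kernel assuming mls_cuda.hpp is included. 'R = 2' gives a
// 5x5 window; 'smoothing' is the h parameter of spatialWeighting() in metres.
__global__ void mls_smooth_kernel(cudaTextureObject_t points,
		float4 *out, int width, int height, float smoothing) {
	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;
	if (x >= width || y >= height) return;

	float3 estPoint;
	const float energy = ftl::cuda::mls_point_surface<2>(
			points, make_int2(x, y), estPoint, smoothing);

	// 'energy' is the total weight; near zero means there was no nearby
	// support, so the input point is kept rather than moved.
	out[y * width + x] = (energy > 0.0f)
			? make_float4(estPoint, 1.0f)
			: tex2D<float4>(points, x, y);
}
```

Thresholding `energy` above some minimum, rather than just above zero, would also reject isolated outlier points.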
+
+/**
+ * Estimate a point set surface location near a given point and also return
+ * an estimate of the normal and colour of that point.
+ */
+template <int R>
+__device__ float mls_point_surface(
+		cudaTextureObject_t pointset,
+		cudaTextureObject_t normalset,
+		cudaTextureObject_t colourset,
+		const int2 &uv,
+		const float3 &nearPoint,
+		float3 &estPoint,
+		float3 &estNormal,
+		uchar4 &estColour,
+		float smoothing) {
+
+	float3 pos = make_float3(0.0f, 0.0f, 0.0f);
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 colour = make_float3(0.0f, 0.0f, 0.0f);
+	float weights = 0.0f;
+
+	//#pragma unroll
+	for (int v=-R; v<=R; ++v) {
+		for (int u=-R; u<=R; ++u) {
+			//if (uv.x+u >= 0 && uv.x+u < pointset.width() && uv.y+v >= 0 && uv.y+v < pointset.height()) {
+				const float3 samplePoint = make_float3(tex2D<float4>(pointset, uv.x+u, uv.y+v));
+				const float weight = spatialWeighting(length(nearPoint - samplePoint), smoothing);
+
+				if (weight > 0.0f) {
+					pos += weight*samplePoint;
+					weights += weight;
+
+					normal += weight * make_float3(tex2D<float4>(normalset, uv.x+u, uv.y+v));
+					const uchar4 c = tex2D<uchar4>(colourset, uv.x+u, uv.y+v);
+					colour += weight * make_float3(c.x, c.y, c.z);
+				}
+			//}
+		}
+	}
+
+	if (weights > 0.0f) {
+		estPoint = pos / weights;
+		estNormal = normal / weights;
+		estColour = make_uchar4(colour.x / weights, colour.y / weights, colour.z / weights, 255);
+	}
+	return weights;
+}
+
+}
+}
+
+#endif  // _FTL_MLS_CUDA_HPP_
diff --git a/applications/reconstruct/src/splat_params.hpp b/applications/reconstruct/src/splat_params.hpp
index e38e4447286bf6c70f6c753deed35154b6aafd8a..cb2d3febda3364a3407264711620a25f9dc116a1 100644
--- a/applications/reconstruct/src/splat_params.hpp
+++ b/applications/reconstruct/src/splat_params.hpp
@@ -9,6 +9,7 @@ namespace ftl {
 namespace render {
 
 static const uint kShowBlockBorders = 0x0001;
+static const uint kNoSplatting = 0x0002;
 
 struct __align__(16) SplatParams {
 	float4x4 m_viewMatrix;
@@ -16,6 +17,7 @@ struct __align__(16) SplatParams {
 
 	uint m_flags;
 	float voxelSize;
+	float depthThreshold;
 
 	DepthCameraParams camera;
 };
diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp
index 92d70a1c50d06e985fcb0faa1ef8f0bd5b961f86..1f7a18f66af2a0e6cd30fac3e4c139355028f6db 100644
--- a/applications/reconstruct/src/splat_render.cpp
+++ b/applications/reconstruct/src/splat_render.cpp
@@ -24,9 +24,18 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) {
 		depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
 	}
+	if ((unsigned int)depth3_.width() != camera.width || (unsigned int)depth3_.height() != camera.height) {
+		depth3_ = ftl::cuda::TextureObject<int>(camera.width, camera.height);
+	}
 	if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) {
 		colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height);
 	}
+	if ((unsigned int)colour_tmp_.width() != camera.width || (unsigned int)colour_tmp_.height() != camera.height) {
+		colour_tmp_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
+	if ((unsigned int)normal1_.width() != camera.width || (unsigned int)normal1_.height() != camera.height) {
+		normal1_ = ftl::cuda::TextureObject<float4>(camera.width, camera.height);
+	}
 	if ((unsigned int)depth2_.width() != camera.width || (unsigned int)depth2_.height() != camera.height) {
 		depth2_ = ftl::cuda::TextureObject<float>(camera.width, camera.height);
 	}
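The resize guards added here all repeat one pattern: reallocate a `TextureObject` when the camera resolution changes. A small helper along these lines (a sketch, not part of the patch) would keep the five buffers in sync:

```cpp
// Hypothetical helper for Splatter::render(); relies only on the same
// width()/height() accessors and assignment already used in the patch.
template <typename T>
void ensureSize(ftl::cuda::TextureObject<T> &tex, unsigned int w, unsigned int h) {
	if ((unsigned int)tex.width() != w || (unsigned int)tex.height() != h) {
		tex = ftl::cuda::TextureObject<T>(w, h);  // reallocate on resolution change
	}
}

// e.g. ensureSize(depth3_, camera.width, camera.height);
//      ensureSize(normal1_, camera.width, camera.height);
```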
@@ -56,30 +65,31 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 	if (scene_->value("voxels", false)) {
 		// TODO:(Nick) Stereo for voxel version
 		ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream);
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
-		ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		//ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
 		src->writeFrames(colour1_, depth2_, stream);
 	} else {
-		//ftl::cuda::clear_colour(colour1_, stream);
 		ftl::cuda::clear_depth(depth1_, stream);
+		ftl::cuda::clear_depth(depth3_, stream);
 		ftl::cuda::clear_depth(depth2_, stream);
-		ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream);
-		//ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream);
+		ftl::cuda::clear_colour(colour2_, stream);
+		ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 
-		// Splat the depth from first DIBR to expand the points
-		ftl::cuda::splat_points(depth1_, depth2_, params, stream);
+		// Step 1: Put all points into virtual view to gather them
+		//ftl::cuda::dibr_raw(depth1_, scene_->cameraCount(), params, stream);
 
-		// Alternative to above...
-		//ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream);
-
-
-		// Use reverse sampling to obtain more colour details
-		// Should use nearest neighbor point depths to verify which camera to use
-		//ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream);
+		// Step 2: For each point, use a warp to do MLS and upsample
+		//ftl::cuda::mls_render_depth(depth1_, depth3_, params, scene_->cameraCount(), stream);
 
 		if (src->getChannel() == ftl::rgbd::kChanDepth) {
-			ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
-			src->writeFrames(colour1_, depth2_, stream);
+			//ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+			if (src->value("splatting", false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		} else if (src->getChannel() == ftl::rgbd::kChanRight) {
 			// Adjust pose to right eye position
 			Eigen::Affine3f transform(Eigen::Translation3f(camera.baseline,0.0f,0.0f));
@@ -88,10 +98,16 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) {
 			params.m_viewMatrixInverse = MatrixConversion::toCUDA(matrix);
 
 			ftl::cuda::clear_depth(depth1_, stream);
-			ftl::cuda::dibr(depth1_, colour2_, scene_->cameraCount(), params, stream);
+			ftl::cuda::dibr(depth1_, colour1_, normal1_, depth2_, colour_tmp_, scene_->cameraCount(), params, stream);
 			src->writeFrames(colour1_, colour2_, stream);
 		} else {
-			src->writeFrames(colour1_, depth2_, stream);
+			if (src->value("splatting", false)) {
+				//ftl::cuda::splat_points(depth1_, colour1_, normal1_, depth2_, colour2_, params, stream);
+				src->writeFrames(colour2_, depth2_, stream);
+			} else {
+				ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream);
+				src->writeFrames(colour1_, depth2_, stream);
+			}
 		}
 	}
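Both non-splatting paths above convert the integer depth buffer back to metres before `writeFrames()`. The conversion is trivial; an illustrative kernel (not the library's actual `int_to_float()` implementation) showing what the `1.0f / 1000.0f` scale does:

```cuda
// Illustrative only. The depth buffer filled via atomicMin() holds
// integer millimetres; a scale of 1/1000 recovers metres.
__global__ void int_to_float_sketch(const int *in, float *out, float scale, int n) {
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) out[i] = (float)in[i] * scale;  // e.g. 1532 -> 1.532 m
}
```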
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index 7737bb5d903d725a55b81d75812b99b52e7ace66..7511e859ff6471dcba18d5aee5518f55045eef9b 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -33,9 +33,12 @@ class Splatter {
 	private:
 	int device_;
 	ftl::cuda::TextureObject<int> depth1_;
+	ftl::cuda::TextureObject<int> depth3_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
+	ftl::cuda::TextureObject<float4> colour_tmp_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
+	ftl::cuda::TextureObject<float4> normal1_;
 	SplatParams params_;
 	ftl::voxhash::SceneRep *scene_;
 };
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index d8fd9cf55649280c20805c8d4f8b9f7b758b2223..f8cbdbb6bd87f7ea9e2275e000ad75de87e9dd11 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -10,6 +10,80 @@ namespace ftl {
 namespace cuda {
 
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l, float &t) {
+	// assuming vectors are all normalized
+	float denom = dot(n, l);
+	if (denom > 1e-6) {
+		t = dot(p0 - l0, n) / denom;
+		return (t >= 0);
+	}
+
+	return false;
+}
+
+__device__ inline bool intersectPlane(const float3 &n, const float3 &p0, const float3 &l, float &t) {
+	// assuming vectors are all normalized
+	float denom = dot(n, l);
+	if (denom > 1e-6) {
+		t = dot(p0, n) / denom;
+		return (t >= 0);
+	}
+	return false;
+}
+
+__device__ inline bool intersectDisk(const float3 &n, const float3 &p0, float radius, const float3 &l0, const float3 &l) {
+	float t = 0;
+	if (intersectPlane(n, p0, l0, l, t)) {
+		float3 p = l0 + l * t;
+		float3 v = p - p0;
+		float d2 = dot(v, v);
+		return (sqrt(d2) <= radius);
+		// or you can use the following optimisation (and precompute radius^2)
+		// return d2 <= radius2; // where radius2 = radius * radius
+	}
+	return false;
+}
+
+/**
+ * Get the radius of a ray intersection with a disk.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l0 Origin of ray in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred, or PINF on miss.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l0, const float3 &l) {
+	float t = 0;
+	if (intersectPlane(n, p0, l0, l, t)) {
+		const float3 p = l0 + l * t;
+		const float3 v = p - p0;
+		const float d2 = dot(v, v);
+		return sqrt(d2);
+		// Note: unlike intersectDisk(), the radius comparison is left to the caller.
+	}
+	return PINF;
+}
+
+/**
+ * Get the radius of a ray intersection with a disk, for a ray starting at
+ * the camera origin.
+ * @param n Normalised normal of disk.
+ * @param p0 Centre of disk in camera space
+ * @param l Normalised ray direction in camera space
+ * @return Radius from centre of disk where intersection occurred, or PINF on miss.
+ */
+__device__ inline float intersectDistance(const float3 &n, const float3 &p0, const float3 &l) {
+	float t = 0;
+	if (intersectPlane(n, p0, l, t)) {
+		const float3 p = l * t;
+		const float3 v = p - p0;
+		const float d2 = dot(v, v);
+		return sqrt(d2);
+	}
+	return PINF;
+}
+
 /**
  * NOTE: Not strictly isosurface currently since it includes the internals
  * of objects up to at most truncation depth.
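`intersectDistance()` is what makes per-pixel splat tests possible: cast the view ray through a pixel against a surfel's supporting plane and accept the hit if it lands within the splat radius. A hypothetical helper (not in the patch) showing that use:

```cuda
// Does the view ray with unit direction 'rayDir' from the camera origin
// hit a surfel of radius 'splatRadius' centred at 'p0' with unit normal
// 'n'? All quantities in camera space.
__device__ inline bool hitsSplat(const float3 &n, const float3 &p0,
		const float3 &rayDir, float splatRadius) {
	// intersectDistance() returns PINF on a plane miss, so a single
	// comparison rejects both misses and hits outside the splat.
	return ftl::cuda::intersectDistance(n, p0, rayDir) <= splatRadius;
}
```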
@@ -26,11 +100,22 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 // TODO: isosurface_point_cloud
 
 void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
+		const ftl::cuda::TextureObject<uchar4> &colour_in,
+		const ftl::cuda::TextureObject<float4> &normal_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
-		const ftl::render::SplatParams &params, cudaStream_t stream);
+		const ftl::cuda::TextureObject<uchar4> &colour_out, const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<int> &depth_out,
-		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams,
+		const ftl::cuda::TextureObject<uchar4> &colour_out,
+		const ftl::cuda::TextureObject<float4> &normal_out,
+		const ftl::cuda::TextureObject<float> &confidence_out,
+		const ftl::cuda::TextureObject<float4> &tmp_colour, int numcams,
+		const ftl::render::SplatParams &params, cudaStream_t stream);
+
+/**
+ * Directly render input depth maps to virtual view with clipping.
+ */
+void dibr_raw(const ftl::cuda::TextureObject<int> &depth_out, int numcams,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 void dibr(const ftl::cuda::TextureObject<float> &depth_out,
diff --git a/applications/reconstruct/src/voxel_hash.cpp b/applications/reconstruct/src/voxel_hash.cpp
index 14cb75078652001275404e87c09f77f0b9d19385..6f929c746d66cae5c382bf15b2d25e26f6b46702 100644
--- a/applications/reconstruct/src/voxel_hash.cpp
+++ b/applications/reconstruct/src/voxel_hash.cpp
@@ -1,4 +1,5 @@
 #include <ftl/voxel_hash.hpp>
+#include <loguru.hpp>
 
 using ftl::voxhash::HashData;
 using ftl::voxhash::HashParams;
diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/voxel_render.cu
similarity index 65%
rename from applications/reconstruct/src/splat_render.cu
rename to applications/reconstruct/src/voxel_render.cu
index 3addf108dcf89ea8f3069e3ba0daea59e67ae918..46eb6f57c031b0541add1dcd982a9c6d1e690140 100644
--- a/applications/reconstruct/src/splat_render.cu
+++ b/applications/reconstruct/src/voxel_render.cu
@@ -258,143 +258,4 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #endif
 }
 
-// ---- Pass 2: Expand the point splats ----------------------------------------
-
-#define SPLAT_RADIUS 7
-#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
-#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 100
-
-__device__ float distance2(float3 a, float3 b) {
-	const float x = a.x-b.x;
-	const float y = a.y-b.y;
-	const float z = a.z-b.z;
-	return x*x+y*y+z*z;
-}
-
-__global__ void splatting_kernel(
-		TextureObject<int> depth_in,
-		TextureObject<float> depth_out, SplatParams params) {
-	// Read an NxN region and
-	// - interpolate a depth value for this pixel
-	// - interpolate an rgb value for this pixel
-	// Must respect depth discontinuities.
-	// How much influence a given neighbour has depends on its depth value
-
-	__shared__ float3 positions[SPLAT_BUFFER_SIZE];
-
-	const int i = threadIdx.y*blockDim.y + threadIdx.x;
-	const int bx = blockIdx.x*blockDim.x;
-	const int by = blockIdx.y*blockDim.y;
-	const int x = bx + threadIdx.x;
-	const int y = by + threadIdx.y;
-
-	// const float camMinDepth = params.camera.m_sensorDepthWorldMin;
-	// const float camMaxDepth = params.camera.m_sensorDepthWorldMax;
-
-	for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) {
-		const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-		const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-		if (sx >= depth_in.width() || sy >= depth_in.height()) {
-			positions[j] = make_float3(1000.0f,1000.0f, 1000.0f);
-		} else {
-			positions[j] = params.camera.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f);
-		}
-	}
-	__syncthreads();
-
-	if (x >= depth_in.width() || y >= depth_in.height()) return;
-
-	const float voxelSquared = params.voxelSize*params.voxelSize;
-	float mindepth = 1000.0f;
-	int minidx = -1;
-	// float3 minpos;
-
-	//float3 validPos[MAX_VALID];
-	int validIndices[MAX_VALID];
-	int validix = 0;
-
-	for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) {
-		for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) {
-			//const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u;
-			const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS;
-
-			float3 posp = positions[idx];
-			const float d = posp.z;
-			//if (d < camMinDepth || d > camMaxDepth) continue;
-
-			float3 pos = params.camera.kinectDepthToSkeleton(x, y, d);
-			float dist = distance2(pos, posp);
-
-			if (dist < voxelSquared) {
-				// Valid so check for minimum
-				//validPos[validix] = pos;
-				//validIndices[validix++] = idx;
-				if (d < mindepth) {
-					mindepth = d;
-					minidx = idx;
-					// minpos = pos;
-				}
-			}
-		}
-	}
-
-	if (minidx == -1) {
-		depth_out(x,y) = 0.0f;
-		//colour_out(x,y) = make_uchar4(76,76,82,255);
-		return;
-	}
-
-	//float3 colour = make_float3(0.0f, 0.0f, 0.0f);
-	float depth = 0.0f;
-	float contrib = 0.0f;
-	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth);	// TODO:(Nick) Mindepth assumption is poor choice.
-
-	//for (int j=0; j<validix; ++j) {
-		const int idx = minidx; //validIndices[j];
-		float3 posp = positions[idx];
-		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
-		float3 delta = (posp - pos) / 2*params.voxelSize;
-		float dist = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-
-		// It contributes to pixel
-		if (dist < 1.0f && fabs(posp.z - mindepth) < 2*params.voxelSize) {
-			const unsigned int sx = (idx % SPLAT_BOUNDS)+bx-SPLAT_RADIUS;
-			const unsigned int sy = (idx / SPLAT_BOUNDS)+by-SPLAT_RADIUS;
-
-			// Fast and simple trilinear interpolation
-			float c = fabs((1.0f - delta.x) * (1.0f - delta.y) * (1.0f - delta.z));
-			//uchar4 col = colour_in.tex2D((int)sx, (int)sy);
-			//colour.x += col.x*c;
-			//colour.y += col.y*c;
-			//colour.z += col.z*c;
-			contrib += c;
-			depth += posp.z * c;
-		}
-	//}
-
-	// Normalise
-	//colour.x /= contrib;
-	//colour.y /= contrib;
-	//colour.z /= contrib;
-	depth /= contrib;
-
-	depth_out(x,y) = depth;
-	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
-}
-
-void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
-		const TextureObject<float> &depth_out, const SplatParams &params, cudaStream_t stream)
-{
-
-	const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
-	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
-
-	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
-	cudaSafeCall( cudaGetLastError() );
-
-#ifdef _DEBUG
-	cudaSafeCall(cudaDeviceSynchronize());
-#endif
-}
diff --git a/components/common/cpp/include/ftl/configurable.hpp b/components/common/cpp/include/ftl/configurable.hpp
index 5bae67b4a9972d5f69d0947871d5be259df36be3..94a080eaed480fa2e57b113e9c8c3d7bc04388bd 100644
--- a/components/common/cpp/include/ftl/configurable.hpp
+++ b/components/common/cpp/include/ftl/configurable.hpp
@@ -68,7 +68,9 @@ class Configurable {
 	template <typename T>
 	T value(const std::string &name, T def) {
 		auto r = get<T>(name);
-		return (r) ? *r : def;
+		if (r) return *r;
+		(*config_)[name] = def;
+		return def;
 	}
 
 /**
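The `Configurable::value()` change makes defaults sticky: on a miss, the supplied default is now written into the backing JSON before being returned, so every later lookup sees the same value. A short illustration, with `cfg` standing in for any Configurable:

```cpp
// Illustrative effect of the change (hypothetical usage):
bool splat = cfg->value("splatting", false);
// First call with "splatting" absent: stores false in the backing JSON
// and returns it. Subsequent get<bool>("splatting") calls now find a
// stored value, and (assuming the config is serialised somewhere) the
// entry appears explicitly instead of silently defaulting at each site.
```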