splatter.cu

#include <ftl/render/splat_params.hpp>
#include "splatter_cuda.hpp"
#include <ftl/rgbd/camera.hpp>
#include <ftl/cuda_common.hpp>

#include <ftl/cuda/weighting.hpp>
#include <ftl/cuda/intersections.hpp>
#include <ftl/cuda/warp.hpp>

#define T_PER_BLOCK 8
#define UPSAMPLE_FACTOR 1.8f
#define WARP_SIZE 32
#define DEPTH_THRESHOLD 0.05f
#define UPSAMPLE_MAX 60
#define MAX_ITERATIONS 32  // Note: Must be multiple of 32
#define SPATIAL_SMOOTHING 0.005f

#define ENERGY_THRESHOLD 0.1f
#define SMOOTHING_MULTIPLIER_A 10.0f	// For surface search
#define SMOOTHING_MULTIPLIER_B 4.0f		// For z contribution
#define SMOOTHING_MULTIPLIER_C 2.0f		// For colour contribution

#define ACCUM_DIAMETER 8

using ftl::cuda::TextureObject;
using ftl::render::SplatParams;
using ftl::cuda::warpMin;
using ftl::cuda::warpSum;

/*
 * Pass 1: Directly render each camera into virtual view but with no upsampling
 * for sparse points.
 */
 template <bool CULLING>
 __global__ void dibr_merge_kernel(TextureObject<float4> points,
		TextureObject<float4> normals,
		TextureObject<int> depth, SplatParams params) {
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	const float4 worldPos = points.tex2D(x, y);
	if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return;

	// Compile time enable/disable of culling back facing points
	if (CULLING) {
		float3 ray = params.m_viewMatrixInverse.getFloat3x3() * params.camera.screenToCam(x,y,1.0f);
		ray = ray / length(ray);
		float3 n = make_float3(normals.tex2D((int)x,(int)y));
		float l = length(n);
		if (l == 0) {
			return;
		}
		n /= l;

		const float facing = dot(ray, n);
		if (facing <= 0.0f) return;
	}

    // Find the virtual screen position of current point
	const float3 camPos = params.m_viewMatrix * make_float3(worldPos);
	if (camPos.z < params.camera.minDepth) return;
	if (camPos.z > params.camera.maxDepth) return;

	const float d = camPos.z;

	const uint2 screenPos = params.camera.camToScreen<uint2>(camPos);
	const unsigned int cx = screenPos.x;
	const unsigned int cy = screenPos.y;
	if (d > params.camera.minDepth && d < params.camera.maxDepth && cx < depth.width() && cy < depth.height()) {
		// Transform estimated point to virtual cam space and output z
		atomicMin(&depth(cx,cy), d * 1000.0f);
	}
}

void ftl::cuda::dibr_merge(TextureObject<float4> &points, TextureObject<float4> &normals, TextureObject<int> &depth, SplatParams params, bool culling, cudaStream_t stream) {
    const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

	if (culling) dibr_merge_kernel<true><<<gridSize, blockSize, 0, stream>>>(points, normals, depth, params);
	else dibr_merge_kernel<false><<<gridSize, blockSize, 0, stream>>>(points, normals, depth, params);
    cudaSafeCall( cudaGetLastError() );
}

//==============================================================================

__device__ inline float4 make_float4(const uchar4 &c) {
    return make_float4(c.x,c.y,c.z,c.w);
}

/*
 * Pass 1b: Expand splats to full size and merge
 */
 template <int SEARCH_DIAMETER>
 __global__ void splat_kernel(
        //TextureObject<float4> points,       // Original 3D points
        TextureObject<float4> normals,
        TextureObject<uchar4> colour_in,
        TextureObject<int> depth_in,        // Virtual depth map
        TextureObject<float> depth_out,   // Accumulated output
        TextureObject<uchar4> colour_out,
        //ftl::rgbd::Camera camera,
        //float4x4 pose_inv,
        SplatParams params) {

    //const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];

    const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
    //const int warp = tid / WARP_SIZE;
    const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;

    const float3 origin = params.m_viewMatrixInverse * make_float3(0.0f);
    float3 ray = params.camera.screenToCam(x,y,1.0f);
    ray = ray / length(ray);
    const float scale = ray.z;
    ray = params.m_viewMatrixInverse.getFloat3x3() * ray;

    //float depth = 0.0f;
    //float contrib = 0.0f;
    float depth = 1000.0f;

    struct Result {
        float weight;
        float depth;
        uchar4 colour;
    };

    Result results[(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE];

    // Each thread in warp takes an upsample point and updates corresponding depth buffer.
    const int lane = tid % WARP_SIZE;
    for (int i=lane; i<SEARCH_DIAMETER*SEARCH_DIAMETER; i+=WARP_SIZE) {
        const float u = (i % SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
        const float v = (i / SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);

        results[i/WARP_SIZE] = {0.0f, 0.0f, make_uchar4(0,0,0,0)};

        // Use the depth buffer to determine this pixels 3D position in camera space
        const float d = ((float)depth_in.tex2D(x+u, y+v)/1000.0f);

        if (d < params.camera.minDepth || d > params.camera.maxDepth) continue;

        const float3 camPos = params.camera.screenToCam((int)(x+u),(int)(y+v),d);
        const float3 camPos2 = params.camera.screenToCam((int)(x),(int)(y),d);
        //if (length(camPos - camPos2) > 2.0f*(camPos.z/params.camera.fx)) continue;
        const float3 worldPos = params.m_viewMatrixInverse * camPos;
        //const float3 camPos2 = pose_inv * worldPos;
        //const uint2 screenPos = camera.camToScreen<uint2>(camPos2);

        //if (screenPos.x < points.width() && screenPos.y < points.height()) {
            // Can now read points, normals and colours from source cam

            // What is contribution of our current point at this pixel?
            //const float3 p = make_float3(points.tex2D((int)screenPos.x, (int)screenPos.y));
            //const float weight = ftl::cuda::spatialWeighting(worldPos, p, (camPos.z/params.camera.fx)); //*(camPos2.z/camera.fx));
            //if (weight <= 0.0f) continue;

            float3 n = make_float3(normals.tex2D((int)(x+u), (int)(y+v)));
            const float l = length(n);
            if (l == 0.0f) continue;
            n /= l;

            // Does the ray intersect plane of splat?
            float t = 1000.0f;
            if (ftl::cuda::intersectPlane(n, worldPos, origin, ray, t)) { //} && fabs(t-camPos.z) < 0.01f) {
                //t *= (params.m_viewMatrix.getFloat3x3() * ray).z;
                t *= scale;
                const float3 camPos3 = params.camera.screenToCam((int)(x),(int)(y),t);
                const float weight = ftl::cuda::spatialWeighting(camPos, camPos3, 2.0f*(camPos3.z/params.camera.fx)); //*(camPos2.z/camera.fx));
                if (weight == 0.0f) continue;
                //depth += t * weight;
                //contrib += weight;
                depth = min(depth, t);
                results[i/WARP_SIZE] = {weight, t, colour_in.tex2D((int)x+u, (int)y+v)}; //make_float2(t, weight);
                //atomicMin(&depth_out(x,y), (int)(depth * 1000.0f));
            }
        //}
    }

    depth = warpMin(depth);

    float adepth = 0.0f;
    float contrib = 0.0f;
    float4 colour = make_float4(0.0f);

    // Loop over results array
    for (int i=0; i<(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE; ++i) {
        if (results[i].depth - depth < 0.04f) {
            adepth += results[i].depth * results[i].weight;
            colour += make_float4(results[i].colour) * results[i].weight;
            contrib += results[i].weight;
        }
    }

    // Sum all attributes and contributions
    adepth = warpSum(adepth);
    colour.x = warpSum(colour.x);
    colour.y = warpSum(colour.y);
    colour.z = warpSum(colour.z);
    contrib = warpSum(contrib);

    if (lane == 0 && contrib > 0.0f) {
        depth_out(x,y) = adepth / contrib;
        colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 255.0f);
    }
}

void ftl::cuda::splat(
        TextureObject<float4> &normals,
        TextureObject<uchar4> &colour_in,
        TextureObject<int> &depth_in,        // Virtual depth map
        TextureObject<float> &depth_out,
        TextureObject<uchar4> &colour_out,
        const SplatParams &params, cudaStream_t stream) {
    const dim3 gridSize((depth_in.width() + 2 - 1)/2, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(2*WARP_SIZE, T_PER_BLOCK);

    splat_kernel<8><<<gridSize, blockSize, 0, stream>>>(
        normals,
        colour_in,
        depth_in,
        depth_out,
        colour_out,
        params
    );
    cudaSafeCall( cudaGetLastError() );
}

//==============================================================================

template <typename T>
__device__ inline T generateInput(const T &in, const SplatParams &params, const float4 &worldPos) {
	return in;
}

template <>
__device__ inline uchar4 generateInput(const uchar4 &in, const SplatParams &params, const float4 &worldPos) {
	return (params.m_flags & ftl::render::kShowDisconMask && worldPos.w < 0.0f) ?
		make_uchar4(0,0,255,255) :  // Show discontinuity mask in red
		in;
}

/*
 * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
 */
 template <typename T>
__global__ void dibr_attribute_contrib_kernel(
        TextureObject<T> in,				// Attribute input
        TextureObject<float4> points,       // Original 3D points
        TextureObject<int> depth_in,        // Virtual depth map
        TextureObject<T> out,			// Accumulated output
        SplatParams params) {

	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	const float4 worldPos = points.tex2D(x, y);
	if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return;

	const float3 camPos = params.m_viewMatrix * make_float3(worldPos);
	if (camPos.z < params.camera.minDepth) return;
	if (camPos.z > params.camera.maxDepth) return;
	const uint2 screenPos = params.camera.camToScreen<uint2>(camPos);

	// Not on screen so stop now...
	if (screenPos.x >= depth_in.width() || screenPos.y >= depth_in.height()) return;

    // Is this point near the actual surface and therefore a contributor?
    const int d = depth_in.tex2D((int)screenPos.x, (int)screenPos.y);

	const T input = generateInput(in.tex2D(x, y), params, worldPos);

	//const float3 nearest = params.camera.screenToCam((int)(screenPos.x),(int)(screenPos.y),d);

	//const float l = length(nearest - camPos);
	if (d == (int)(camPos.z*1000.0f)) {
		out(screenPos.x, screenPos.y) = input;
	}
}


template <typename T>
void ftl::cuda::dibr_attribute(
        TextureObject<T> &in,
        TextureObject<float4> &points,       // Original 3D points
        TextureObject<int> &depth_in,        // Virtual depth map
        TextureObject<T> &out,   // Accumulated output
        SplatParams &params, cudaStream_t stream) {
	const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_attribute_contrib_kernel<<<gridSize, blockSize, 0, stream>>>(
        in,
        points,
        depth_in,
        out,
        params
    );
    cudaSafeCall( cudaGetLastError() );
}

template void ftl::cuda::dibr_attribute<uchar4>(
	ftl::cuda::TextureObject<uchar4> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<uchar4> &out,	// Accumulated output
	ftl::render::SplatParams &params, cudaStream_t stream);

template void ftl::cuda::dibr_attribute<float>(
	ftl::cuda::TextureObject<float> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<float> &out,	// Accumulated output
	ftl::render::SplatParams &params, cudaStream_t stream);

template void ftl::cuda::dibr_attribute<float4>(
	ftl::cuda::TextureObject<float4> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<float4> &out,	// Accumulated output
	ftl::render::SplatParams &params, cudaStream_t stream);

//==============================================================================

/*__global__ void dibr_normalise_kernel(
        TextureObject<float4> colour_in,
        TextureObject<uchar4> colour_out,
        //TextureObject<float4> normals,
        TextureObject<float> contribs) {
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < colour_in.width() && y < colour_in.height()) {
        const float4 colour = colour_in.tex2D((int)x,(int)y);
        //const float4 normal = normals.tex2D((int)x,(int)y);
        const float contrib = contribs.tex2D((int)x,(int)y);

        if (contrib > 0.0f) {
            colour_out(x,y) = make_uchar4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
            //normals(x,y) = normal / contrib;
        }
    }
}

__global__ void dibr_normalise_kernel(
        TextureObject<float4> colour_in,
        TextureObject<float> colour_out,
        //TextureObject<float4> normals,
        TextureObject<float> contribs) {
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < colour_in.width() && y < colour_in.height()) {
        const float4 colour = colour_in.tex2D((int)x,(int)y);
        //const float4 normal = normals.tex2D((int)x,(int)y);
        const float contrib = contribs.tex2D((int)x,(int)y);

        if (contrib > 0.0f) {
            colour_out(x,y) = colour.x / contrib;
            //normals(x,y) = normal / contrib;
        }
    }
}

__global__ void dibr_normalise_kernel(
        TextureObject<float4> colour_in,
        TextureObject<float4> colour_out,
        //TextureObject<float4> normals,
        TextureObject<float> contribs) {
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < colour_in.width() && y < colour_in.height()) {
        const float4 colour = colour_in.tex2D((int)x,(int)y);
        //const float4 normal = normals.tex2D((int)x,(int)y);
        const float contrib = contribs.tex2D((int)x,(int)y);

        if (contrib > 0.0f) {
            colour_out(x,y) = make_float4(colour.x / contrib, colour.y / contrib, colour.z / contrib, 0);
            //normals(x,y) = normal / contrib;
        }
    }
}

void ftl::cuda::dibr_normalise(TextureObject<float4> &colour_in, TextureObject<uchar4> &colour_out, TextureObject<float> &contribs, cudaStream_t stream) {
    const dim3 gridSize((colour_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(colour_in, colour_out, contribs);
    cudaSafeCall( cudaGetLastError() );
}

void ftl::cuda::dibr_normalise(TextureObject<float4> &colour_in, TextureObject<float> &colour_out, TextureObject<float> &contribs, cudaStream_t stream) {
    const dim3 gridSize((colour_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(colour_in, colour_out, contribs);
    cudaSafeCall( cudaGetLastError() );
}

void ftl::cuda::dibr_normalise(TextureObject<float4> &colour_in, TextureObject<float4> &colour_out, TextureObject<float> &contribs, cudaStream_t stream) {
    const dim3 gridSize((colour_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(colour_in, colour_out, contribs);
    cudaSafeCall( cudaGetLastError() );
}*/