#include <ftl/render/splat_params.hpp>
#include "splatter_cuda.hpp"
#include <ftl/rgbd/camera.hpp>
#include <ftl/cuda_common.hpp>

#include <ftl/cuda/weighting.hpp>
#include <ftl/cuda/intersections.hpp>
#include <ftl/cuda/warp.hpp>

#define T_PER_BLOCK 8
#define UPSAMPLE_FACTOR 1.8f
#define WARP_SIZE 32
#define DEPTH_THRESHOLD 0.05f
#define UPSAMPLE_MAX 60
#define MAX_ITERATIONS 32  // Note: Must be multiple of 32
#define SPATIAL_SMOOTHING 0.005f

#define ENERGY_THRESHOLD 0.1f
#define SMOOTHING_MULTIPLIER_A 10.0f	// For surface search
#define SMOOTHING_MULTIPLIER_B 4.0f		// For z contribution
#define SMOOTHING_MULTIPLIER_C 2.0f		// For colour contribution

#define ACCUM_DIAMETER 8

using ftl::cuda::TextureObject;
using ftl::render::SplatParams;
using ftl::cuda::warpMin;
using ftl::cuda::warpSum;

/*
 * Pass 1: Directly render each camera into virtual view but with no upsampling
 * for sparse points.
 */
 template <bool CULLING>
 __global__ void dibr_merge_kernel(TextureObject<float4> points,
		TextureObject<float4> normals,
		TextureObject<int> depth, SplatParams params) {
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	const float4 worldPos = points.tex2D(x, y);
	if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return;

	// Compile time enable/disable of culling back facing points
	if (CULLING) {
		float3 ray = params.m_viewMatrixInverse.getFloat3x3() * params.camera.screenToCam(x,y,1.0f);
		ray = ray / length(ray);
		float3 n = make_float3(normals.tex2D((int)x,(int)y));
		float l = length(n);
		if (l == 0) {
			return;
		}
		n /= l;

		const float facing = dot(ray, n);
		if (facing <= 0.0f) return;
	}

    // Find the virtual screen position of current point
	const float3 camPos = params.m_viewMatrix * make_float3(worldPos);
	if (camPos.z < params.camera.minDepth) return;
	if (camPos.z > params.camera.maxDepth) return;

	const float d = camPos.z;

	const uint2 screenPos = params.camera.camToScreen<uint2>(camPos);
	const unsigned int cx = screenPos.x;
	const unsigned int cy = screenPos.y;
	if (d > params.camera.minDepth && d < params.camera.maxDepth && cx < depth.width() && cy < depth.height()) {
		// Transform estimated point to virtual cam space and output z
		atomicMin(&depth(cx,cy), d * 1000.0f);
	}
}

void ftl::cuda::dibr_merge(TextureObject<float4> &points, TextureObject<float4> &normals, TextureObject<int> &depth, SplatParams params, bool culling, cudaStream_t stream) {
    const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

	if (culling) dibr_merge_kernel<true><<<gridSize, blockSize, 0, stream>>>(points, normals, depth, params);
	else dibr_merge_kernel<false><<<gridSize, blockSize, 0, stream>>>(points, normals, depth, params);
    cudaSafeCall( cudaGetLastError() );
}

//==============================================================================

__device__ inline float4 make_float4(const uchar4 &c) {
    return make_float4(c.x,c.y,c.z,c.w);
}

__device__ inline float4 make_float4(const float4 &v) {
	return v;
}

template <typename T>
__device__ inline T make();

template <>
__device__ inline uchar4 make() {
	return make_uchar4(0,0,0,0);
}

template <>
__device__ inline float4 make() {
	return make_float4(0.0f,0.0f,0.0f,0.0f);
}

template <>
__device__ inline float make() {
	return 0.0f;
}

template <typename T>
__device__ inline T make(const float4 &);

template <>
__device__ inline uchar4 make(const float4 &v) {
	return make_uchar4((int)v.x, (int)v.y, (int)v.z, (int)v.w);
}

template <>
__device__ inline float4 make(const float4 &v) {
	return v;
}

template <>
__device__ inline float make(const float4 &v) {
	return v.x;
}

template <typename T>
__device__ inline T make(const uchar4 &v);

template <>
__device__ inline float4 make(const uchar4 &v) {
	return make_float4((float)v.x, (float)v.y, (float)v.z, (float)v.w);
}

template <typename T>
__device__ inline T make(float v);

template <>
__device__ inline float make(float v) {
	return v;
}

/*
 * Pass 1b: Expand splats to full size and merge
 */
 template <int SEARCH_DIAMETER, typename T>
 __global__ void splat_kernel(
        //TextureObject<float4> points,       // Original 3D points
        TextureObject<float4> normals,
        TextureObject<float> density,
        TextureObject<T> in,
        TextureObject<int> depth_in,        // Virtual depth map
        TextureObject<float> depth_out,   // Accumulated output
        TextureObject<T> out,
        //ftl::rgbd::Camera camera,
        //float4x4 pose_inv,
        SplatParams params) {
        
    //const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam];

    const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
    //const int warp = tid / WARP_SIZE;
    const int x = (blockIdx.x*blockDim.x + threadIdx.x) / WARP_SIZE;
    const int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return;

    const float3 origin = params.m_viewMatrixInverse * make_float3(0.0f);
    float3 ray = params.camera.screenToCam(x,y,1.0f);
    ray = ray / length(ray);
    const float scale = ray.z;
    ray = params.m_viewMatrixInverse.getFloat3x3() * ray;

    //float depth = 0.0f;
    //float contrib = 0.0f;
    float depth = 1000.0f;
    float pdepth = 1000.0f;

    struct Result {
        float weight;
        float depth;
        T attr;
    };

    Result results[(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE];

    // Each thread in warp takes an upsample point and updates corresponding depth buffer.
    const int lane = tid % WARP_SIZE;
    for (int i=lane; i<SEARCH_DIAMETER*SEARCH_DIAMETER; i+=WARP_SIZE) {
        const float u = (i % SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);
        const float v = (i / SEARCH_DIAMETER) - (SEARCH_DIAMETER / 2);

        results[i/WARP_SIZE] = {0.0f, 0.0f, make<T>()};

        // Use the depth buffer to determine this pixels 3D position in camera space
        const float d = ((float)depth_in.tex2D(x+u, y+v)/1000.0f);

        if (d < params.camera.minDepth || d > params.camera.maxDepth) continue;

        const float3 camPos = params.camera.screenToCam((int)(x)+u,(int)(y)+v,d);
        const float3 camPos2 = params.camera.screenToCam((int)(x),(int)(y),d);
        const float3 worldPos = params.m_viewMatrixInverse * camPos;


        // Assumed to be normalised
        float4 n = normals.tex2D((int)(x)+u, (int)(y)+v);
        n /= length(n);
		//if (length(make_float3(n)) == 0.0f) printf("BAD NORMAL\n");

        // Does the ray intersect plane of splat?
        float t = 1000.0f;
        const float r = ftl::cuda::intersectDistance(make_float3(n), worldPos, origin, ray, t);
        if (r != PINF) { //} && fabs(t-camPos.z) < 0.01f) {
            // Adjust from normalised ray back to original meters units
            t *= scale;
            const float dens = density.tex2D((int)(x)+u, (int)(y)+v);
            float weight = ftl::cuda::weighting(r, dens/params.camera.fx); // (1.0f/params.camera.fx) / (t/params.camera.fx)

            /* Buehler C. et al. 2001. Unstructured Lumigraph Rendering. */
            /* Orts-Escolano S. et al. 2016. Holoportation: Virtual 3D teleportation in real-time. */
            // This is the simple naive colour weighting. It might be good
            // enough for our purposes if the alignment step prevents ghosting
            // TODO: Use depth and perhaps the neighbourhood consistency in:
            //     Kuster C. et al. 2011. FreeCam: A hybrid camera system for interactive free-viewpoint video
            if (params.m_flags & ftl::render::kNormalWeightColours) weight *= n.w * n.w;
            //if (params.m_flags & ftl::render::kDepthWeightColours) weight *= ???

            if (weight <= 0.0f) continue;

            //depth = min(depth, t);
            if (t < depth) {
                pdepth = depth;
                depth = t;
            }
            results[i/WARP_SIZE] = {weight, t, in.tex2D((int)x+u, (int)y+v)};
        }
    }

    depth = warpMin(depth);
    pdepth = warpMin(pdepth);

    float adepth = 0.0f;
    float contrib = 0.0f;
    float4 attr = make_float4(0.0f);

    // Loop over results array
    for (int i=0; i<(SEARCH_DIAMETER*SEARCH_DIAMETER) / WARP_SIZE; ++i) {
        if (results[i].depth - depth < 0.04f) {
            adepth += results[i].depth * results[i].weight;
            attr += make_float4(results[i].attr) * results[i].weight;
            contrib += results[i].weight;
        }
    }

    // Sum all attributes and contributions
    adepth = warpSum(adepth);
    attr.x = warpSum(attr.x);
    attr.y = warpSum(attr.y);
    attr.z = warpSum(attr.z);
    contrib = warpSum(contrib);

    if (lane == 0 && contrib > 0.0f) {
        depth_out(x,y) = adepth / contrib;
        out(x,y) = make<T>(attr / contrib);
    }
}

template <typename T>
void ftl::cuda::splat(
        TextureObject<float4> &normals,
        TextureObject<float> &density,
        TextureObject<T> &colour_in,
        TextureObject<int> &depth_in,        // Virtual depth map
        TextureObject<float> &depth_out,
        TextureObject<T> &colour_out,
        const SplatParams &params, cudaStream_t stream) {
    const dim3 gridSize((depth_in.width() + 2 - 1)/2, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(2*WARP_SIZE, T_PER_BLOCK);

    splat_kernel<8,T><<<gridSize, blockSize, 0, stream>>>(
        normals,
        density,
        colour_in,
        depth_in,
        depth_out,
        colour_out,
        params
    );
    cudaSafeCall( cudaGetLastError() );
}

template void ftl::cuda::splat<uchar4>(
        TextureObject<float4> &normals,
        TextureObject<float> &density,
        TextureObject<uchar4> &colour_in,
        TextureObject<int> &depth_in,        // Virtual depth map
        TextureObject<float> &depth_out,
        TextureObject<uchar4> &colour_out,
		const SplatParams &params, cudaStream_t stream);
		
template void ftl::cuda::splat<float4>(
    TextureObject<float4> &normals,
    TextureObject<float> &density,
	TextureObject<float4> &colour_in,
	TextureObject<int> &depth_in,        // Virtual depth map
	TextureObject<float> &depth_out,
	TextureObject<float4> &colour_out,
	const SplatParams &params, cudaStream_t stream);

template void ftl::cuda::splat<float>(
    TextureObject<float4> &normals,
    TextureObject<float> &density,
	TextureObject<float> &colour_in,
	TextureObject<int> &depth_in,        // Virtual depth map
	TextureObject<float> &depth_out,
	TextureObject<float> &colour_out,
	const SplatParams &params, cudaStream_t stream);

//==============================================================================

template <typename T>
__device__ inline T generateInput(const T &in, const SplatParams &params, const float4 &worldPos) {
	return in;
}

template <>
__device__ inline uchar4 generateInput(const uchar4 &in, const SplatParams &params, const float4 &worldPos) {
	return (params.m_flags & ftl::render::kShowDisconMask && worldPos.w < 0.0f) ?
		make_uchar4(0,0,255,255) :  // Show discontinuity mask in red
		in;
}

template <typename A, typename B>
__device__ inline B weightInput(const A &in, float weight) {
	return in * weight;
}

template <>
__device__ inline float4 weightInput(const uchar4 &in, float weight) {
	return make_float4(
		(float)in.x * weight,
		(float)in.y * weight,
		(float)in.z * weight,
		(float)in.w * weight);
}

template <typename T>
__device__ inline void accumulateOutput(TextureObject<T> &out, TextureObject<float> &contrib, const uint2 &pos, const T &in, float w) {
	atomicAdd(&out(pos.x, pos.y), in);
	atomicAdd(&contrib(pos.x, pos.y), w);
} 

template <>
__device__ inline void accumulateOutput(TextureObject<float4> &out, TextureObject<float> &contrib, const uint2 &pos, const float4 &in, float w) {
	atomicAdd((float*)&out(pos.x, pos.y), in.x);
	atomicAdd(((float*)&out(pos.x, pos.y))+1, in.y);
	atomicAdd(((float*)&out(pos.x, pos.y))+2, in.z);
	atomicAdd(((float*)&out(pos.x, pos.y))+3, in.w);
	atomicAdd(&contrib(pos.x, pos.y), w);
} 

/*
 * Pass 2: Accumulate attribute contributions if the points pass a visibility test.
 */
 template <typename A, typename B>
__global__ void dibr_attribute_contrib_kernel(
        TextureObject<A> in,				// Attribute input
        TextureObject<float4> points,       // Original 3D points
        TextureObject<int> depth_in,        // Virtual depth map
		TextureObject<B> out,			// Accumulated output
		TextureObject<float> contrib,
        SplatParams params) {
        
	const int x = (blockIdx.x*blockDim.x + threadIdx.x);
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	const float4 worldPos = points.tex2D(x, y);
	if (worldPos.x == MINF || (!(params.m_flags & ftl::render::kShowDisconMask) && worldPos.w < 0.0f)) return;

	const float3 camPos = params.m_viewMatrix * make_float3(worldPos);
	if (camPos.z < params.camera.minDepth) return;
	if (camPos.z > params.camera.maxDepth) return;
	const uint2 screenPos = params.camera.camToScreen<uint2>(camPos);

	// Not on screen so stop now...
	if (screenPos.x >= depth_in.width() || screenPos.y >= depth_in.height()) return;
            
    // Is this point near the actual surface and therefore a contributor?
    const float d = (float)depth_in.tex2D((int)screenPos.x, (int)screenPos.y) / 1000.0f;

	const A input = generateInput(in.tex2D(x, y), params, worldPos);
	const float weight = ftl::cuda::weighting(fabs(camPos.z - d), 0.02f);
	const B weighted = make<B>(input) * weight; //weightInput(input, weight);

	if (weight > 0.0f) {
		accumulateOutput(out, contrib, screenPos, weighted, weight);
		//out(screenPos.x, screenPos.y) = input;
	}
}


template <typename A, typename B>
void ftl::cuda::dibr_attribute(
        TextureObject<A> &in,
        TextureObject<float4> &points,       // Original 3D points
        TextureObject<int> &depth_in,        // Virtual depth map
		TextureObject<B> &out,   // Accumulated output
		TextureObject<float> &contrib,
        SplatParams &params, cudaStream_t stream) {
	const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_attribute_contrib_kernel<<<gridSize, blockSize, 0, stream>>>(
        in,
        points,
        depth_in,
		out,
		contrib,
        params
    );
    cudaSafeCall( cudaGetLastError() );
}

template void ftl::cuda::dibr_attribute(
	ftl::cuda::TextureObject<uchar4> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<float4> &out,	// Accumulated output
	ftl::cuda::TextureObject<float> &contrib,
	ftl::render::SplatParams &params, cudaStream_t stream);

template void ftl::cuda::dibr_attribute(
	ftl::cuda::TextureObject<float> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<float> &out,	// Accumulated output
	ftl::cuda::TextureObject<float> &contrib,
	ftl::render::SplatParams &params, cudaStream_t stream);

template void ftl::cuda::dibr_attribute(
	ftl::cuda::TextureObject<float4> &in,	// Original colour image
	ftl::cuda::TextureObject<float4> &points,		// Original 3D points
	ftl::cuda::TextureObject<int> &depth_in,		// Virtual depth map
	ftl::cuda::TextureObject<float4> &out,	// Accumulated output
	ftl::cuda::TextureObject<float> &contrib,
	ftl::render::SplatParams &params, cudaStream_t stream);

//==============================================================================

template <typename A, typename B>
__global__ void dibr_normalise_kernel(
        TextureObject<A> in,
        TextureObject<B> out,
        TextureObject<float> contribs) {
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    if (x < in.width() && y < in.height()) {
        const A a = in.tex2D((int)x,(int)y);
        //const float4 normal = normals.tex2D((int)x,(int)y);
        const float contrib = contribs.tex2D((int)x,(int)y);

        if (contrib > 0.0f) {
            out(x,y) = make<B>(a / contrib);
            //normals(x,y) = normal / contrib;
        }
    }
}

template <typename A, typename B>
void ftl::cuda::dibr_normalise(TextureObject<A> &in, TextureObject<B> &out, TextureObject<float> &contribs, cudaStream_t stream) {
    const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);

    dibr_normalise_kernel<<<gridSize, blockSize, 0, stream>>>(in, out, contribs);
    cudaSafeCall( cudaGetLastError() );
}

template void ftl::cuda::dibr_normalise<float4,uchar4>(TextureObject<float4> &in, TextureObject<uchar4> &out, TextureObject<float> &contribs, cudaStream_t stream);
template void ftl::cuda::dibr_normalise<float,float>(TextureObject<float> &in, TextureObject<float> &out, TextureObject<float> &contribs, cudaStream_t stream);
template void ftl::cuda::dibr_normalise<float4,float4>(TextureObject<float4> &in, TextureObject<float4> &out, TextureObject<float> &contribs, cudaStream_t stream);