diff --git a/components/common/cpp/include/ftl/cuda_matrix_util.hpp b/components/common/cpp/include/ftl/cuda_matrix_util.hpp index 4dd1db7fa8a58c84a40f9d93abd8cc0a492b2792..cbb82052da87e2e5ddca271afe79233639a379bb 100644 --- a/components/common/cpp/include/ftl/cuda_matrix_util.hpp +++ b/components/common/cpp/include/ftl/cuda_matrix_util.hpp @@ -750,7 +750,7 @@ public: } //! returns the 3x3 part of the matrix - inline __device__ __host__ float3x3 getFloat3x3() { + inline __device__ __host__ float3x3 getFloat3x3() const { float3x3 ret; ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; @@ -1086,7 +1086,7 @@ public: //! returns the 3x3 part of the matrix - inline __device__ __host__ float3x3 getFloat3x3() { + inline __device__ __host__ float3x3 getFloat3x3() const { float3x3 ret; ret.m11 = m11; ret.m12 = m12; ret.m13 = m13; ret.m21 = m21; ret.m22 = m22; ret.m23 = m23; diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt index 5eb5cb71c41692fb95bbc6c9300396f36766d588..23a80cb7c0fd20bfd9d63842a467051d737f415c 100644 --- a/components/operators/CMakeLists.txt +++ b/components/operators/CMakeLists.txt @@ -1,39 +1,43 @@ set(OPERSRC - src/smoothing.cpp - src/smoothing.cu - src/mls.cu - src/smoothchan.cu + src/surface/smoothing.cpp + src/surface/smoothing.cu + src/surface/mls/image_basic.cu + src/surface/mls.cu + src/analysis/local/smoothchan.cu src/operator.cpp - src/colours.cpp - src/normals.cpp - src/filling.cpp - src/filling.cu + src/misc/colours.cpp + src/analysis/local/normals.cpp + src/surface/filling.cpp + src/surface/filling.cu src/disparity/libstereo.cpp src/disparity/disp2depth.cu src/disparity/disparity_to_depth.cpp src/disparity/bilateral_filter.cpp src/disparity/opencv/disparity_bilateral_filter.cpp src/disparity/opencv/disparity_bilateral_filter.cu - src/segmentation.cu - src/segmentation.cpp - src/mask.cu - src/mask.cpp - src/antialiasing.cpp - src/antialiasing.cu + src/analysis/segmentation/segmentation.cu + src/analysis/segmentation/segmentation.cpp + src/analysis/local/mask.cu + src/analysis/local/mask.cpp + src/misc/antialiasing.cpp + src/misc/antialiasing.cu src/fusion/mvmls.cpp src/fusion/correspondence.cu src/fusion/correspondence_depth.cu src/fusion/correspondence_util.cu src/fusion/mls_aggr.cu - src/clipping.cpp - src/depth.cpp - src/detectandtrack.cpp - src/aruco.cpp - src/weighting.cpp - src/weighting.cu - src/poser.cpp - src/gt_analysis.cpp - src/gt_analysis.cu + src/fusion/smoothing/mls_multi_weighted.cu + src/fusion/carving/carver.cu + src/fusion/fusion.cpp + src/misc/clipping.cpp + src/disparity/depth.cpp + src/analysis/tracking/detectandtrack.cpp + src/analysis/tracking/aruco.cpp + src/analysis/local/weighting.cpp + src/analysis/local/weighting.cu + src/misc/poser.cpp + src/analysis/evaluation/gt_analysis.cpp + src/analysis/evaluation/gt_analysis.cu ) if (HAVE_LIBSGM) @@ -42,8 +46,8 @@ endif (HAVE_LIBSGM) if (HAVE_OPTFLOW) list(APPEND OPERSRC - src/nvopticalflow.cpp - src/opticalflow.cu + src/analysis/tracking/nvopticalflow.cpp + src/analysis/tracking/opticalflow.cu src/disparity/optflow_smoothing.cu src/disparity/optflow_smoothing.cpp) endif() diff --git a/components/operators/include/ftl/cuda/fixed.hpp b/components/operators/include/ftl/cuda/fixed.hpp index efb0c09caabd5e41f024679e8e80a2c6ea2b6df3..16e1e6ae0769fbfa16bad8931a58ac23ff43cb7b 100644 --- a/components/operators/include/ftl/cuda/fixed.hpp +++ b/components/operators/include/ftl/cuda/fixed.hpp @@ -3,7 +3,7 @@ template <int FRAC> __device__ inline float fixed2float(short v) { - return v / (1 << FRAC); + return float(v) / float(1 << FRAC); } template <int FRAC> diff --git a/components/operators/include/ftl/operators/cuda/carver.hpp b/components/operators/include/ftl/operators/cuda/carver.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1dd0c04a884610211ca7e6ffa8c19702ff1f25cb --- /dev/null +++ b/components/operators/include/ftl/operators/cuda/carver.hpp @@ -0,0 +1,34 @@ +#ifndef _FTL_CUDA_CARVER_HPP_ +#define _FTL_CUDA_CARVER_HPP_ + +#include <ftl/cuda_common.hpp> +#include <ftl/cuda_matrix_util.hpp> +#include <ftl/rgbd/camera.hpp> + +namespace ftl { +namespace cuda { + +/** + * Carve `in` using `ref` as visibility reference. + */ +void depth_carve( + cv::cuda::GpuMat &in, + const cv::cuda::GpuMat &ref, + //const cv::cuda::GpuMat &in_colour, + //const cv::cuda::GpuMat &ref_colour, + //cv::cuda::GpuMat &colour_scale, + const float4x4 &transform, + const ftl::rgbd::Camera &incam, + const ftl::rgbd::Camera &refcam, + cudaStream_t stream); + +void apply_colour_scaling( + const cv::cuda::GpuMat &scale, + cv::cuda::GpuMat &colour, + int radius, + cudaStream_t stream); + +} +} + +#endif \ No newline at end of file diff --git a/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp new file mode 100644 index 0000000000000000000000000000000000000000..884fd0da46db2d61e68aba175603d7a04717f407 --- /dev/null +++ b/components/operators/include/ftl/operators/cuda/mls/multi_intensity.hpp @@ -0,0 +1,76 @@ +#ifndef _FTL_CUDA_MLS_MULTIINTENSITY_HPP_ +#define _FTL_CUDA_MLS_MULTIINTENSITY_HPP_ + +#include <ftl/cuda_common.hpp> +#include <ftl/rgbd/camera.hpp> +#include <ftl/cuda_matrix_util.hpp> + +namespace ftl { +namespace cuda { + +// TODO: 4th channel of normals could be used as curvature estimate? Then can +// adaptively adjust the smoothing amount by the degree of curvature. This +// curvature value must be low for noise data, is only high if a consistent +// high curvature is detected over a radius. Other option is to also or instead +// use the curvature value as a feature, to only smooth against points with +// similar curvature? + +/** + * For a particular viewpoint, use a set of other viewpoints to estimate a + * smooth surface for the focus viewpoint. This version of MLS uses absolute + * difference of some 8-bit value as a weight, this can be raw intensity or + * a local contrast measure. This `prime`, `gather`, `adjust` cycle should + * be iterated 2-3 times, but perhaps better to do one iteration of each view + * first before repeating a view. + * + * Note: Have one instance per frameset because the internal buffers are + * allocated based upon frame image size. + */ +class MLSMultiIntensity +{ +public: + MLSMultiIntensity(int radius); + ~MLSMultiIntensity(); + + void prime( + const cv::cuda::GpuMat &depth_prime, + const cv::cuda::GpuMat &intensity_prime, + const ftl::rgbd::Camera &cam_prime, + const float4x4 &pose_prime, + cudaStream_t stream + ); + + void gatherPrime(float smoothing, cudaStream_t stream); + + void gather( + const cv::cuda::GpuMat &depth_src, + const cv::cuda::GpuMat &normals_src, + const cv::cuda::GpuMat &intensity_src, + const ftl::rgbd::Camera &cam_src, + const float4x4 &pose_src, + float smoothing, + cudaStream_t stream + ); + + void adjust( + cv::cuda::GpuMat &depth_out, + cv::cuda::GpuMat &normals_out, + cudaStream_t stream + ); + +private: + cv::cuda::GpuMat depth_prime_; + cv::cuda::GpuMat intensity_prime_; + ftl::rgbd::Camera cam_prime_; + float4x4 pose_prime_; + int radius_; + + cv::cuda::GpuMat normal_accum_; + cv::cuda::GpuMat centroid_accum_; + cv::cuda::GpuMat weight_accum_; +}; + +} +} + +#endif \ No newline at end of file diff --git a/components/operators/include/ftl/operators/cuda/mls_cuda.hpp b/components/operators/include/ftl/operators/cuda/mls_cuda.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9790595d6aa6350369f4c4cb88adb8a1728531ca --- /dev/null +++ b/components/operators/include/ftl/operators/cuda/mls_cuda.hpp @@ -0,0 +1,60 @@ +#ifndef _FTL_CUDA_MLS_HPP_ +#define _FTL_CUDA_MLS_HPP_ + +#include <ftl/rgbd/camera.hpp> +#include <ftl/cuda_common.hpp> + +namespace ftl { +namespace cuda { + +/** + * Basic Moving Least Squares smoothing on a single depth map. Outputs boths + * normals and depth values. This is a single iteration of an algorithm that + * should iterate 2-3 times. Normals can use some basic estimation as initial + * input since they are smoothed by MLS. + * + * @param normals_in 4 channel 16-bit float (half). + * @param normals_out 4 channel 16-bit float (half). + * @param depth_in 1 channel 32-bit float + * @param depth_out 1 channel 32-bit float + * @param smoothing Gaussian radius in depth units + * @param radius Window radius for smoothing + * @param camera Camera intrinsics + * @param stream Optional CUDA stream + */ +void mls_smooth( + const cv::cuda::GpuMat &normals_in, + cv::cuda::GpuMat &normals_out, + const cv::cuda::GpuMat &depth_in, + cv::cuda::GpuMat &depth_out, + float smoothing, + int radius, + const ftl::rgbd::Camera &camera, + cudaStream_t stream); + +/** + * Basic Moving Least Squares smoothing on a single depth map. Outputs just the + * smoothed normals. This is a single iteration of an algorithm that should + * iterate 2-3 times. + * + * @param normals_in 4 channel 16-bit float (half). + * @param normals_out 4 channel 16-bit float (half). + * @param depth_in 1 channel 32-bit float + * @param smoothing Gaussian radius in depth units + * @param radius Window radius for smoothing + * @param camera Camera intrinsics + * @param stream Optional CUDA stream + */ +void mls_smooth( + const cv::cuda::GpuMat &normals_in, + cv::cuda::GpuMat &normals_out, + const cv::cuda::GpuMat &depth_in, + float smoothing, + int radius, + const ftl::rgbd::Camera &camera, + cudaStream_t stream); + +} +} + +#endif // _FTL_CUDA_MLS_HPP_ diff --git a/components/operators/src/smoothing_cuda.hpp b/components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp similarity index 91% rename from components/operators/src/smoothing_cuda.hpp rename to components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp index c44c7787b9fe8dd18d6e148d96c8d71803ba368c..f1934cdae28f4a3a90cf007c3572f3da017b9357 100644 --- a/components/operators/src/smoothing_cuda.hpp +++ b/components/operators/include/ftl/operators/cuda/smoothing_cuda.hpp @@ -7,16 +7,6 @@ namespace ftl { namespace cuda { -void mls_smooth( - ftl::cuda::TextureObject<half4> &normals_in, - ftl::cuda::TextureObject<half4> &normals_out, - ftl::cuda::TextureObject<float> &depth_in, - ftl::cuda::TextureObject<float> &depth_out, - float smoothing, - int radius, - const ftl::rgbd::Camera &camera, - cudaStream_t stream); - void colour_mls_smooth( ftl::cuda::TextureObject<half4> &normals_in, ftl::cuda::TextureObject<half4> &normals_out, diff --git a/components/operators/include/ftl/operators/fusion.hpp b/components/operators/include/ftl/operators/fusion.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ed0e5f9cd324f1e57979d9f20fc9d9a98f796ce0 --- /dev/null +++ b/components/operators/include/ftl/operators/fusion.hpp @@ -0,0 +1,29 @@ +#ifndef _FTL_OPERATORS_FUSION_HPP_ +#define _FTL_OPERATORS_FUSION_HPP_ + +#include <ftl/operators/operator.hpp> +#include <ftl/operators/cuda/mls/multi_intensity.hpp> +#include <vector> + +namespace ftl { +namespace operators { + +class Fusion : public ftl::operators::Operator { + public: + Fusion(ftl::operators::Graph *g, ftl::Configurable*); + ~Fusion(); + + inline Operator::Type type() const override { return Operator::Type::ManyToMany; } + + bool apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) override; + + private: + ftl::cuda::MLSMultiIntensity mls_; + std::vector<cv::cuda::GpuMat> weights_; + cv::cuda::GpuMat temp_; +}; + +} +} + +#endif diff --git a/components/operators/src/gt_analysis.cpp b/components/operators/src/analysis/evaluation/gt_analysis.cpp similarity index 100% rename from components/operators/src/gt_analysis.cpp rename to components/operators/src/analysis/evaluation/gt_analysis.cpp diff --git a/components/operators/src/gt_analysis.cu b/components/operators/src/analysis/evaluation/gt_analysis.cu similarity index 100% rename from components/operators/src/gt_analysis.cu rename to components/operators/src/analysis/evaluation/gt_analysis.cu diff --git a/components/operators/src/mask.cpp b/components/operators/src/analysis/local/mask.cpp similarity index 100% rename from components/operators/src/mask.cpp rename to components/operators/src/analysis/local/mask.cpp diff --git a/components/operators/src/mask.cu b/components/operators/src/analysis/local/mask.cu similarity index 100% rename from components/operators/src/mask.cu rename to components/operators/src/analysis/local/mask.cu diff --git a/components/operators/src/normals.cpp b/components/operators/src/analysis/local/normals.cpp similarity index 100% rename from components/operators/src/normals.cpp rename to components/operators/src/analysis/local/normals.cpp diff --git a/components/operators/src/smoothchan.cu b/components/operators/src/analysis/local/smoothchan.cu similarity index 98% rename from components/operators/src/smoothchan.cu rename to components/operators/src/analysis/local/smoothchan.cu index ea40854fccd7579fc3973a6acb41543ea7bc75d5..2766e6d23ab37f92e3a56bc0938a91df9ac45bd0 100644 --- a/components/operators/src/smoothchan.cu +++ b/components/operators/src/analysis/local/smoothchan.cu @@ -1,4 +1,4 @@ -#include "smoothing_cuda.hpp" +#include <ftl/operators/cuda/smoothing_cuda.hpp> #include <ftl/cuda/weighting.hpp> diff --git a/components/operators/src/weighting.cpp b/components/operators/src/analysis/local/weighting.cpp similarity index 100% rename from components/operators/src/weighting.cpp rename to components/operators/src/analysis/local/weighting.cpp diff --git a/components/operators/src/weighting.cu b/components/operators/src/analysis/local/weighting.cu similarity index 100% rename from components/operators/src/weighting.cu rename to components/operators/src/analysis/local/weighting.cu diff --git a/components/operators/src/weighting_cuda.hpp b/components/operators/src/analysis/local/weighting_cuda.hpp similarity index 100% rename from components/operators/src/weighting_cuda.hpp rename to components/operators/src/analysis/local/weighting_cuda.hpp diff --git a/components/operators/src/segmentation.cpp b/components/operators/src/analysis/segmentation/segmentation.cpp similarity index 100% rename from components/operators/src/segmentation.cpp rename to components/operators/src/analysis/segmentation/segmentation.cpp diff --git a/components/operators/src/segmentation.cu b/components/operators/src/analysis/segmentation/segmentation.cu similarity index 100% rename from components/operators/src/segmentation.cu rename to components/operators/src/analysis/segmentation/segmentation.cu diff --git a/components/operators/src/segmentation_cuda.hpp b/components/operators/src/analysis/segmentation/segmentation_cuda.hpp similarity index 100% rename from components/operators/src/segmentation_cuda.hpp rename to components/operators/src/analysis/segmentation/segmentation_cuda.hpp diff --git a/components/operators/src/aruco.cpp b/components/operators/src/analysis/tracking/aruco.cpp similarity index 100% rename from components/operators/src/aruco.cpp rename to components/operators/src/analysis/tracking/aruco.cpp diff --git a/components/operators/src/detectandtrack.cpp b/components/operators/src/analysis/tracking/detectandtrack.cpp similarity index 100% rename from components/operators/src/detectandtrack.cpp rename to components/operators/src/analysis/tracking/detectandtrack.cpp diff --git a/components/operators/src/nvopticalflow.cpp b/components/operators/src/analysis/tracking/nvopticalflow.cpp similarity index 100% rename from components/operators/src/nvopticalflow.cpp rename to components/operators/src/analysis/tracking/nvopticalflow.cpp diff --git a/components/operators/src/opticalflow.cu b/components/operators/src/analysis/tracking/opticalflow.cu similarity index 100% rename from components/operators/src/opticalflow.cu rename to components/operators/src/analysis/tracking/opticalflow.cu diff --git a/components/operators/src/opticalflow_cuda.hpp b/components/operators/src/analysis/tracking/opticalflow_cuda.hpp similarity index 100% rename from components/operators/src/opticalflow_cuda.hpp rename to components/operators/src/analysis/tracking/opticalflow_cuda.hpp diff --git a/components/operators/src/depth.cpp b/components/operators/src/disparity/depth.cpp similarity index 100% rename from components/operators/src/depth.cpp rename to components/operators/src/disparity/depth.cpp diff --git a/components/operators/src/fusion/carving/carver.cu b/components/operators/src/fusion/carving/carver.cu new file mode 100644 index 0000000000000000000000000000000000000000..d1dd480ef0cbbf2ea4a75595fd6af4fee712297d --- /dev/null +++ b/components/operators/src/fusion/carving/carver.cu @@ -0,0 +1,245 @@ +#include <ftl/operators/cuda/carver.hpp> +#include <cudatl/fixed.hpp> +#include <ftl/cuda/weighting.hpp> + +__device__ inline float depthErrorCoef(const ftl::rgbd::Camera &cam, float disps=1.0f) { + return disps / (cam.baseline*cam.fx); +} + +// ==== Reverse Verify Result ================================================== + +// No colour scale calculations +/*__global__ void reverse_check_kernel( + float* __restrict__ depth_in, + const float* __restrict__ depth_original, + int pitch4, + int opitch4, + float4x4 transformR, + ftl::rgbd::Camera vintrin, + ftl::rgbd::Camera ointrin +) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < 0 || x >= vintrin.width || y < 0 || y >= vintrin.height) return; + + float d = depth_in[y*pitch4+x]; + + const float err_coef = 0.001f; //depthErrorCoef(ointrin); + + int count = 10; // Allow max 2cm of carving. + while (--count >= 0) { + float3 campos = transformR * vintrin.screenToCam(x,y,d); + int2 spos = ointrin.camToScreen<int2>(campos); + int ox = spos.x; + int oy = spos.y; + + if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) { + float d2 = depth_original[oy*opitch4+ox]; + + // TODO: Threshold comes from depth error characteristics + // If the value is significantly further then carve. Depth error + // is not always easy to calculate, depends on source. + if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*d2*err_coef)) break; + + d += 0.002f; + } else break; + } + + // Too much carving means just outright remove the point. + depth_in[y*pitch4+x] = (count < 0) ? 0.0f : d; +}*/ + +__global__ void reverse_check_kernel( + float* __restrict__ depth_in, + const float* __restrict__ depth_original, + //const uchar4* __restrict__ in_colour, + //const uchar4* __restrict__ ref_colour, + //int8_t* __restrict__ colour_scale, + int pitch4, + //int pitch, + int opitch4, + //int in_col_pitch4, + //int o_col_pitch4, + //int cwidth, + //int cheight, + float4x4 transformR, + ftl::rgbd::Camera vintrin, + ftl::rgbd::Camera ointrin +) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < 0 || x >= vintrin.width || y < 0 || y >= vintrin.height) return; + + float d = depth_in[y*pitch4+x]; + + // TODO: Externally provide the error coefficient + const float err_coef = 0.0005f; //depthErrorCoef(ointrin); + + int ox = 0; + int oy = 0; + + bool match = false; + + int count = 10; // Allow max 2cm of carving. + while (--count >= 0) { + float3 campos = transformR * vintrin.screenToCam(x,y,d); + int2 spos = ointrin.camToScreen<int2>(campos); + ox = spos.x; + oy = spos.y; + + if (campos.z > 0.0f && ox >= 0 && ox < ointrin.width && oy >= 0 && oy < ointrin.height) { + float d2 = depth_original[oy*opitch4+ox]; + + // TODO: Threshold comes from depth error characteristics + // If the value is significantly further then carve. Depth error + // is not always easy to calculate, depends on source. + if (!(d2 < ointrin.maxDepth && d2 - campos.z > d2*d2*err_coef)) { + match = fabsf(campos.z - d2) < d2*d2*err_coef; break; + } + + d += 0.002f; // TODO: Should this be += error or what? + } else break; + } + + // We found a match, so do a colour check + //float idiff = 127.0f; + //if (match) { + /* // Generate colour scaling + const float ximgscale = float(cwidth) / float(ointrin.width); + ox = float(ox) * ximgscale; + const float yimgscale = float(cheight) / float(ointrin.height); + oy = float(oy) * yimgscale; + + int cy = float(y) * yimgscale; + int cx = float(x) * ximgscale; + + const uchar4 vcol = in_colour[cy*in_col_pitch4+cx]; + const uchar4 ocol = (match) ? ref_colour[oy*o_col_pitch4+ox] : vcol; + + float i1 = (0.2126f*float(vcol.z) + 0.7152f*float(vcol.y) + 0.0722f*float(vcol.x)); + float i2 = (0.2126f*float(ocol.z) + 0.7152f*float(ocol.y) + 0.0722f*float(ocol.x)); + idiff = i2-i1; + + //const float scaleX = (vcol.x == 0) ? 1.0f : float(ocol.x) / float(vcol.x); + //const float scaleY = (vcol.y == 0) ? 1.0f : float(ocol.y) / float(vcol.y); + //const float scaleZ = (vcol.z == 0) ? 1.0f : float(ocol.z) / float(vcol.z); + //scale = (0.2126f*scaleZ + 0.7152f*scaleY + 0.0722f*scaleX); + //} + colour_scale[x+pitch*y] = int8_t(max(-127.0f,min(127.0f,idiff)));*/ + + // Too much carving means just outright remove the point. + depth_in[y*pitch4+x] = (count < 0) ? 0.0f : d; +} + +void ftl::cuda::depth_carve( + cv::cuda::GpuMat &depth_in, + const cv::cuda::GpuMat &depth_original, + //const cv::cuda::GpuMat &in_colour, + //const cv::cuda::GpuMat &ref_colour, + //cv::cuda::GpuMat &colour_scale, + const float4x4 &transformR, + const ftl::rgbd::Camera &vintrin, + const ftl::rgbd::Camera &ointrin, + cudaStream_t stream) +{ + static constexpr int THREADS_X = 16; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + //colour_scale.create(depth_in.size(), CV_8U); + + reverse_check_kernel<<<gridSize, blockSize, 0, stream>>>( + depth_in.ptr<float>(), + depth_original.ptr<float>(), + //in_colour.ptr<uchar4>(), + //ref_colour.ptr<uchar4>(), + //colour_scale.ptr<int8_t>(), + depth_in.step1(), + //colour_scale.step1(), + depth_original.step1(), + //in_colour.step1()/4, + //ref_colour.step1()/4, + //in_colour.cols, + //in_colour.rows, + transformR, + vintrin, ointrin); + + cudaSafeCall( cudaGetLastError() ); +} + +// ==== Apply colour scale ===================================================== + +template <int RADIUS> +__global__ void apply_colour_scaling_kernel( + const int8_t* __restrict__ scale, + uchar4* __restrict__ colour, + int spitch, + int cpitch, + int swidth, + int sheight, + int cwidth, + int cheight +) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && x < cwidth && y >= 0 && y < cheight) { + int sx = (float(swidth) / float(cwidth)) * float(x); + int sy = (float(sheight) / float(cheight)) * float(y); + + float s = 0.0f; + int count = 0; + //float mindiff = 100.0f; + + for (int v=-RADIUS; v<=RADIUS; ++v) { + #pragma unroll + for (int u=-RADIUS; u<=RADIUS; ++u) { + float ns = (sx >= RADIUS && sy >= RADIUS && sx < swidth-RADIUS && sy < sheight-RADIUS) ? scale[sx+u+(sy+v)*spitch] : 0.0f; + if (fabsf(ns) < 30) { + s += ns; + ++count; + } + } + } + + if (count > 0) s /= float(count); + + uchar4 c = colour[x+y*cpitch]; + colour[x+y*cpitch] = make_uchar4( + max(0.0f, min(255.0f, float(c.x) + s)), + max(0.0f, min(255.0f, float(c.y) + s)), + max(0.0f, min(255.0f, float(c.z) + s)), + 255.0f + ); + } +} + +void ftl::cuda::apply_colour_scaling( + const cv::cuda::GpuMat &scale, + cv::cuda::GpuMat &colour, + int radius, + cudaStream_t stream) +{ + static constexpr int THREADS_X = 16; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((colour.cols + THREADS_X - 1)/THREADS_X, (colour.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + apply_colour_scaling_kernel<2><<<gridSize, blockSize, 0, stream>>>( + scale.ptr<int8_t>(), + colour.ptr<uchar4>(), + scale.step1(), + colour.step1()/4, + scale.cols, + scale.rows, + colour.cols, + colour.rows + ); + + cudaSafeCall( cudaGetLastError() ); +} diff --git a/components/operators/src/fusion/fusion.cpp b/components/operators/src/fusion/fusion.cpp new file mode 100644 index 0000000000000000000000000000000000000000..334820a3a12d590bbfcdee51390b2fef1f6b41a7 --- /dev/null +++ b/components/operators/src/fusion/fusion.cpp @@ -0,0 +1,124 @@ +#include <ftl/operators/fusion.hpp> +#include <ftl/operators/cuda/carver.hpp> +#include <ftl/utility/matrix_conversion.hpp> +#include <opencv2/core/cuda_stream_accessor.hpp> + +#include <opencv2/cudaimgproc.hpp> +#include <opencv2/cudawarping.hpp> + +using ftl::operators::Fusion; +using ftl::codecs::Channel; +using cv::cuda::GpuMat; + +Fusion::Fusion(ftl::operators::Graph *g, ftl::Configurable *cfg) : ftl::operators::Operator(g, cfg), mls_(3) { + +} + +Fusion::~Fusion() { + +} + +bool Fusion::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cudaStream_t stream) { + float mls_smoothing = config()->value("mls_smoothing", 0.01f); + int mls_iters = config()->value("mls_iterations", 2); + + if (weights_.size() != in.frames.size()) weights_.resize(in.frames.size()); + + cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream); + + for (size_t i=0; i<in.frames.size(); ++i) { + if (!in.hasFrame(i)) continue; + const GpuMat &col = in.frames[i].get<GpuMat>(Channel::Colour); + const GpuMat &d = in.frames[i].get<GpuMat>(Channel::Depth); + + cv::cuda::cvtColor(col, temp_, cv::COLOR_BGRA2GRAY, 0, cvstream); + cv::cuda::resize(temp_, weights_[i], d.size(), 0, 0, cv::INTER_LINEAR, cvstream); + } + + if (config()->value("visibility_carving", true)) { + for (size_t i=0; i < in.frames.size(); ++i) { + if (!in.hasFrame(i)) continue; + + auto &f = in.frames[i].cast<ftl::rgbd::Frame>(); + + for (size_t j=0; j < in.frames.size(); ++j) { + if (i == j) continue; + if (!in.hasFrame(j)) continue; + + auto &ref = in.frames[j].cast<ftl::rgbd::Frame>(); + + auto transformR = MatrixConversion::toCUDA(ref.getPose().cast<float>().inverse() * f.getPose().cast<float>()); + + ftl::cuda::depth_carve( + f.create<cv::cuda::GpuMat>(Channel::Depth), + ref.get<cv::cuda::GpuMat>(Channel::Depth), + //f.get<cv::cuda::GpuMat>(Channel::Colour), + //ref.get<cv::cuda::GpuMat>(Channel::Colour), + //colour_scale_, + transformR, + f.getLeft(), + ref.getLeft(), + stream + ); + } + + //ftl::cuda::apply_colour_scaling(colour_scale_, f.create<cv::cuda::GpuMat>(Channel::Colour), 3, stream_); + } + } + + for (int iters=0; iters < mls_iters; ++iters) { + for (size_t i=0; i<in.frames.size(); ++i) { + if (!in.hasFrame(i)) continue; + + auto &f1 = in.frames[i].cast<ftl::rgbd::Frame>(); + + Eigen::Vector4d d1(0.0, 0.0, 1.0, 0.0); + d1 = f1.getPose() * d1; + + auto pose1 = MatrixConversion::toCUDA(f1.getPose().cast<float>()); + + mls_.prime( + f1.get<GpuMat>(Channel::Depth), + weights_[i], + f1.getLeft(), + pose1, + stream + ); + + for (size_t j=0; j<in.frames.size(); ++j) { + if (!in.hasFrame(j)) continue; + //if (i == j) continue; + + //LOG(INFO) << "Running phase1"; + + auto &f2 = in.frames[j].cast<ftl::rgbd::Frame>(); + + // Are cameras facing similar enough direction? + Eigen::Vector4d d2(0.0, 0.0, 1.0, 0.0); + d2 = f2.getPose() * d2; + // No, so skip this combination + if (d1.dot(d2) <= 0.0) continue; + + auto pose2 = MatrixConversion::toCUDA(f2.getPose().cast<float>()); + + mls_.gather( + f2.get<GpuMat>(Channel::Depth), + f2.get<GpuMat>(Channel::Normals), + weights_[j], + f2.getLeft(), + pose2, + mls_smoothing, + stream + ); + } + + mls_.adjust( + f1.create<GpuMat>(Channel::Depth), + f1.create<GpuMat>(Channel::Normals), + stream + ); + } + } + + return true; +} diff --git a/components/operators/src/fusion/mvmls.cpp b/components/operators/src/fusion/mvmls.cpp index 38328f33ea724a1b687163bc961c2ccec1d6b4fa..15742b05544fea94846db4927ee0434a22c20967 100644 --- a/components/operators/src/fusion/mvmls.cpp +++ b/components/operators/src/fusion/mvmls.cpp @@ -1,5 +1,5 @@ #include <ftl/operators/mvmls.hpp> -#include "smoothing_cuda.hpp" +#include <ftl/operators/cuda/smoothing_cuda.hpp> #include <ftl/utility/matrix_conversion.hpp> #include "mvmls_cuda.hpp" #include <ftl/cuda/normals.hpp> diff --git a/components/operators/src/fusion/smoothing/mls_multi_weighted.cu b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5e5ded31f193bda1c5b3d24b58994d5717ead32 --- /dev/null +++ b/components/operators/src/fusion/smoothing/mls_multi_weighted.cu @@ -0,0 +1,256 @@ +#include <ftl/operators/cuda/mls/multi_intensity.hpp> +#include <opencv2/core/cuda_stream_accessor.hpp> +#include <ftl/cuda/weighting.hpp> + +using ftl::cuda::MLSMultiIntensity; +using cv::cuda::GpuMat; + +// ==== Multi image MLS ======================================================== + +__device__ inline float featureWeight(int f1, int f2) { + const float w = (1.0f-(float(abs(f1 - f2)) / 255.0f)); + return w*w; +} + +/* + * Gather points for Moving Least Squares, from each source image + */ + template <int SEARCH_RADIUS, typename T> + __global__ void mls_gather_intensity_kernel( + const half4* __restrict__ normals_in, + half4* __restrict__ normals_out, + const float* __restrict__ depth_origin, + const float* __restrict__ depth_in, + const T* __restrict__ feature_origin, + const T* __restrict__ feature_in, + float4* __restrict__ centroid_out, + float* __restrict__ contrib_out, + float smoothing, + float4x4 o_2_in, + float4x4 in_2_o, + float3x3 in_2_o33, + ftl::rgbd::Camera camera_origin, + ftl::rgbd::Camera camera_in, + int npitch_out, + int cpitch_out, + int wpitch_out, + int dpitch_o, + int dpitch_i, + int npitch_in, + int fpitch_o, + int fpitch_i +) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < 0 || y < 0 || x >= camera_origin.width || y >= camera_origin.height) return; + + float3 nX = make_float3(normals_out[y*npitch_out+x]); + float3 aX = make_float3(centroid_out[y*cpitch_out+x]); + float contrib = contrib_out[y*wpitch_out+x]; + + float d0 = depth_origin[x+y*dpitch_o]; + if (d0 <= camera_origin.minDepth || d0 >= camera_origin.maxDepth) return; + + const int feature1 = feature_origin[x+y*fpitch_o]; + + float3 X = camera_origin.screenToCam((int)(x),(int)(y),d0); + + int2 s = camera_in.camToScreen<int2>(o_2_in * X); + + // Neighbourhood + for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) { + for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) { + const float d = (s.x+u >= 0 && s.x+u < camera_in.width && s.y+v >= 0 && s.y+v < camera_in.height) ? depth_in[s.x+u+(s.y+v)*dpitch_i] : 0.0f; + if (d <= camera_in.minDepth || d >= camera_in.maxDepth) continue; + + // Point and normal of neighbour + const float3 Xi = in_2_o * camera_in.screenToCam(s.x+u, s.y+v, d); + const float3 Ni = make_float3(normals_in[s.x+u+(s.y+v)*npitch_in]); + + const int feature2 = feature_in[s.x+y+(s.y+v)*fpitch_i]; + + // Gauss approx weighting function using point distance + const float w = (length(Ni) > 0.0f) + ? ftl::cuda::spatialWeighting(X,Xi,smoothing) * featureWeight(feature1, feature2) + : 0.0f; + + aX += Xi*w; + nX += (in_2_o33 * Ni)*w; + contrib += w; + } + } + + normals_out[y*npitch_out+x] = make_half4(nX, 0.0f); + centroid_out[y*cpitch_out+x] = make_float4(aX, 0.0f); + contrib_out[y*wpitch_out+x] = contrib; +} + +/** + * Convert accumulated values into estimate of depth and normals at pixel. + */ + __global__ void mls_reduce_kernel_2( + const float4* __restrict__ centroid, + const half4* __restrict__ normals, + const float* __restrict__ contrib_out, + half4* __restrict__ normals_out, + float* __restrict__ depth, + ftl::rgbd::Camera camera, + int npitch_in, + int cpitch_in, + int wpitch, + int npitch, + int dpitch +) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) { + float3 nX = make_float3(normals[y*npitch_in+x]); + float3 aX = make_float3(centroid[y*cpitch_in+x]); + float contrib = contrib_out[y*wpitch+x]; + + //depth[x+y*dpitch] = X.z; + normals_out[x+y*npitch] = make_half4(0.0f, 0.0f, 0.0f, 0.0f); + + float d0 = depth[x+y*dpitch]; + //depth[x+y*dpitch] = 0.0f; + if (d0 <= camera.minDepth || d0 >= camera.maxDepth || contrib == 0.0f) return; + float3 X = camera.screenToCam((int)(x),(int)(y),d0); + + nX /= contrib; // Weighted average normal + aX /= contrib; // Weighted average point (centroid) + + // Signed-Distance Field function + float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z); + + // Calculate new point using SDF function to adjust depth (and position) + X = X - nX * fX; + + depth[x+y*dpitch] = X.z; + normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f); + } +} + + +MLSMultiIntensity::MLSMultiIntensity(int radius) + : radius_(radius) +{ + +} + +MLSMultiIntensity::~MLSMultiIntensity() +{ + +} + +void MLSMultiIntensity::prime( + const GpuMat &depth_prime, + const GpuMat &intensity_prime, + const ftl::rgbd::Camera &cam_prime, + const float4x4 &pose_prime, + cudaStream_t stream) +{ + depth_prime_ = depth_prime; + intensity_prime_ = intensity_prime; + cam_prime_ = cam_prime; + pose_prime_ = pose_prime; + + centroid_accum_.create(depth_prime.size(), CV_32FC4); + normal_accum_.create(depth_prime.size(), CV_16FC4); + weight_accum_.create(depth_prime.size(), CV_32F); + + cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream); + + // Reset buffers + centroid_accum_.setTo(cv::Scalar(0,0,0,0), cvstream); + weight_accum_.setTo(cv::Scalar(0), cvstream); + normal_accum_.setTo(cv::Scalar(0,0,0,0), cvstream); +} + +void MLSMultiIntensity::gatherPrime(float smoothing, cudaStream_t stream) +{ + // Can use a simpler kernel without pose transformations +} + +void MLSMultiIntensity::gather( + const GpuMat &depth_src, + const GpuMat &normals_src, + const GpuMat &intensity_src, + const ftl::rgbd::Camera &cam_src, + const float4x4 &pose_src, + float smoothing, + cudaStream_t stream) +{ + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + float4x4 inv_pose_src = pose_src; + inv_pose_src.invert(); + float4x4 o_2_in = inv_pose_src * pose_prime_; + float4x4 inv_pose_prime = pose_prime_; + inv_pose_prime.invert(); + float4x4 in_2_o = inv_pose_prime * pose_src; + float3x3 in_2_o33 = inv_pose_prime.getFloat3x3() * pose_src.getFloat3x3(); + + mls_gather_intensity_kernel<3><<<gridSize, blockSize, 0, stream>>>( + normals_src.ptr<half4>(), + normal_accum_.ptr<half4>(), + depth_prime_.ptr<float>(), + depth_src.ptr<float>(), + intensity_prime_.ptr<uchar>(), + intensity_src.ptr<uchar>(), + centroid_accum_.ptr<float4>(), + weight_accum_.ptr<float>(), + smoothing, + o_2_in, + in_2_o, + in_2_o33, + cam_prime_, + cam_src, + normal_accum_.step1()/4, + centroid_accum_.step1()/4, + weight_accum_.step1(), + depth_prime_.step1(), + depth_src.step1(), + normals_src.step1()/4, + intensity_prime_.step1(), + intensity_src.step1() + ); + cudaSafeCall( cudaGetLastError() ); +} + +void MLSMultiIntensity::adjust( + GpuMat &depth_out, + GpuMat &normals_out, + cudaStream_t stream) +{ + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + normals_out.create(depth_prime_.size(), CV_16FC4); + depth_out.create(depth_prime_.size(), CV_32F); + + // FIXME: Depth prime assumed to be same as depth out + + mls_reduce_kernel_2<<<gridSize, blockSize, 0, stream>>>( + centroid_accum_.ptr<float4>(), + normal_accum_.ptr<half4>(), + weight_accum_.ptr<float>(), + normals_out.ptr<half4>(), + depth_prime_.ptr<float>(), + cam_prime_, + normal_accum_.step1()/4, + centroid_accum_.step1()/4, + weight_accum_.step1(), + normals_out.step1()/4, + depth_prime_.step1() + ); + cudaSafeCall( cudaGetLastError() ); +} diff --git a/components/operators/src/antialiasing.cpp b/components/operators/src/misc/antialiasing.cpp similarity index 100% rename from components/operators/src/antialiasing.cpp rename to components/operators/src/misc/antialiasing.cpp diff --git a/components/operators/src/antialiasing.cu b/components/operators/src/misc/antialiasing.cu similarity index 100% rename from components/operators/src/antialiasing.cu rename to components/operators/src/misc/antialiasing.cu diff --git a/components/operators/src/antialiasing_cuda.hpp b/components/operators/src/misc/antialiasing_cuda.hpp similarity index 100% rename from components/operators/src/antialiasing_cuda.hpp rename to components/operators/src/misc/antialiasing_cuda.hpp diff --git a/components/operators/src/clipping.cpp b/components/operators/src/misc/clipping.cpp similarity index 100% rename from components/operators/src/clipping.cpp rename to components/operators/src/misc/clipping.cpp diff --git a/components/operators/src/colours.cpp b/components/operators/src/misc/colours.cpp similarity index 100% rename from components/operators/src/colours.cpp rename to components/operators/src/misc/colours.cpp diff --git a/components/operators/src/poser.cpp b/components/operators/src/misc/poser.cpp similarity index 100% rename from components/operators/src/poser.cpp rename to components/operators/src/misc/poser.cpp diff --git a/components/operators/src/filling.cpp b/components/operators/src/surface/filling.cpp similarity index 100% rename from components/operators/src/filling.cpp rename to components/operators/src/surface/filling.cpp diff --git a/components/operators/src/filling.cu b/components/operators/src/surface/filling.cu similarity index 100% rename from components/operators/src/filling.cu rename to components/operators/src/surface/filling.cu diff --git a/components/operators/src/filling_cuda.hpp b/components/operators/src/surface/filling_cuda.hpp similarity index 100% rename from components/operators/src/filling_cuda.hpp rename to components/operators/src/surface/filling_cuda.hpp diff --git a/components/operators/src/mls.cu b/components/operators/src/surface/mls.cu similarity index 89% rename from components/operators/src/mls.cu rename to components/operators/src/surface/mls.cu index 55813446ab7a62d0913b96d107187b2bb62f0543..0a6373719c5d4655db9d9a2f7bacbadcea63e72c 100644 --- a/components/operators/src/mls.cu +++ b/components/operators/src/surface/mls.cu @@ -1,4 +1,4 @@ -#include "smoothing_cuda.hpp" +#include <ftl/operators/cuda/smoothing_cuda.hpp> #include <ftl/cuda/weighting.hpp> @@ -7,98 +7,6 @@ using ftl::cuda::TextureObject; #define T_PER_BLOCK 8 #define WARP_SIZE 32 -// ===== MLS Smooth ============================================================ - -/* - * Smooth depth map using Moving Least Squares - */ - template <int SEARCH_RADIUS> - __global__ void mls_smooth_kernel( - TextureObject<half4> normals_in, - TextureObject<half4> normals_out, - TextureObject<float> depth_in, // Virtual depth map - TextureObject<float> depth_out, // Accumulated output - float smoothing, - ftl::rgbd::Camera camera) { - - const int x = blockIdx.x*blockDim.x + threadIdx.x; - const int y = blockIdx.y*blockDim.y + threadIdx.y; - - if (x < 0 || y < 0 || x >= depth_in.width() || y >= depth_in.height()) return; - - float3 aX = make_float3(0.0f,0.0f,0.0f); - float3 nX = make_float3(0.0f,0.0f,0.0f); - float contrib = 0.0f; - - float d0 = depth_in.tex2D(x, y); - depth_out(x,y) = d0; - if (d0 < camera.minDepth || d0 > camera.maxDepth) return; - float3 X = camera.screenToCam((int)(x),(int)(y),d0); - - // Neighbourhood - for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) { - for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) { - const float d = depth_in.tex2D(x+u, y+v); - if (d < camera.minDepth || d > camera.maxDepth) continue; - - // Point and normal of neighbour - const float3 Xi = camera.screenToCam((int)(x)+u,(int)(y)+v,d); - const float3 Ni = make_float3(normals_in.tex2D((int)(x)+u, (int)(y)+v)); - - // Gauss approx weighting function using point distance - const float w = ftl::cuda::spatialWeighting(X,Xi,smoothing); - - aX += Xi*w; - nX += Ni*w; - contrib += w; - } - } - - nX /= contrib; // Weighted average normal - aX /= contrib; // Weighted average point (centroid) - - // Signed-Distance Field function - float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z); - - // Calculate new point using SDF function to adjust depth (and position) - X = X - nX * fX; - - //uint2 screen = camera.camToScreen<uint2>(X); - - //if (screen.x < depth_out.width() && screen.y < depth_out.height()) { - // depth_out(screen.x,screen.y) = X.z; - //} - depth_out(x,y) = X.z; - normals_out(x,y) = make_half4(nX / length(nX), 0.0f); -} - -void ftl::cuda::mls_smooth( - ftl::cuda::TextureObject<half4> &normals_in, - ftl::cuda::TextureObject<half4> &normals_out, - ftl::cuda::TextureObject<float> &depth_in, - ftl::cuda::TextureObject<float> &depth_out, - float smoothing, - int radius, - const ftl::rgbd::Camera &camera, - cudaStream_t stream) { - - const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - - switch (radius) { - case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break; - case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break; - case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break; - case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break; - case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in, normals_out, depth_in, depth_out, smoothing, camera); break; - } - cudaSafeCall( cudaGetLastError() ); - - - #ifdef _DEBUG - cudaSafeCall(cudaDeviceSynchronize()); - #endif -} // ===== Colour MLS Smooth ===================================================== diff --git a/components/operators/src/surface/mls/image_basic.cu b/components/operators/src/surface/mls/image_basic.cu new file mode 100644 index 0000000000000000000000000000000000000000..17d528391f1ca69ea6d92a24346be5c2590f1e98 --- /dev/null +++ b/components/operators/src/surface/mls/image_basic.cu @@ -0,0 +1,128 @@ +#include <ftl/operators/cuda/mls_cuda.hpp> +#include <ftl/cuda/weighting.hpp> + +// ===== MLS Smooth ============================================================ + +/* + * Smooth depth map using Moving Least Squares. This version is for a single + * depth image and does not use colour. + */ + template <int RADIUS> + __global__ void mls_smooth_kernel( + const half4* __restrict__ normals_in, + half4* __restrict__ normals_out, // Can be nullptr + const float* __restrict__ depth_in, + float* __restrict__ depth_out, // Can be nullptr + int npitch, + int dpitch, + float smoothing, // Radius of Gaussian in cm + ftl::rgbd::Camera camera +) { + + const int x = blockIdx.x*blockDim.x + threadIdx.x; + const int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < RADIUS || y < RADIUS || x >= camera.width-RADIUS || y >= camera.height-RADIUS) return; + + float3 aX = make_float3(0.0f,0.0f,0.0f); + float3 nX = make_float3(0.0f,0.0f,0.0f); + float contrib = 0.0f; + + const float d0 = depth_in[x+y*dpitch]; + if (depth_out) depth_out[x+y*dpitch] = d0; + if (normals_out) normals_out[x+y*npitch] = normals_in[x+y*npitch]; + if (d0 < camera.minDepth || d0 > camera.maxDepth) return; + + float3 X = camera.screenToCam((int)(x),(int)(y),d0); + + // Neighbourhood + for (int v=-RADIUS; v<=RADIUS; ++v) { + for (int u=-RADIUS; u<=RADIUS; ++u) { + const float d = depth_in[x+u+(y+v)*dpitch]; + if (d < camera.minDepth || d > camera.maxDepth) continue; + + // Point and normal of neighbour + const float3 Xi = camera.screenToCam(x+u, y+v, d); + const float3 Ni = make_float3(normals_in[x+u+(y+v)*npitch]); + + // Gauss approx weighting function using point distance + const float w = (Ni.x+Ni.y+Ni.z == 0.0f) ? 0.0f : ftl::cuda::spatialWeighting(X,Xi,smoothing); + + aX += Xi*w; + nX += Ni*w; + contrib += w; + } + } + + if (contrib > 0.0f) { + nX /= contrib; // Weighted average normal + aX /= contrib; // Weighted average point (centroid) + + // Signed-Distance Field function + float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z); + + // Calculate new point using SDF function to adjust depth (and position) + X = X - nX * fX; + + if (depth_out) depth_out[x+y*dpitch] = X.z; + if (normals_out) normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f); + } +} + +/* One iteration of MLS Smoothing, simple, single image and output depth also. */ +void ftl::cuda::mls_smooth( + const cv::cuda::GpuMat &normals_in, + cv::cuda::GpuMat &normals_out, + const cv::cuda::GpuMat &depth_in, + cv::cuda::GpuMat &depth_out, + float smoothing, + int radius, + const ftl::rgbd::Camera &camera, + cudaStream_t stream +) { + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + normals_out.create(normals_in.size(), CV_16FC4); + depth_out.create(depth_in.size(), CV_32F); + + switch (radius) { + case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), depth_out.ptr<float>(), normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + } + cudaSafeCall( cudaGetLastError() ); +} + +/* One iteration of MLS Smoothing, simple, single image, normals only */ +void ftl::cuda::mls_smooth( + const cv::cuda::GpuMat &normals_in, + cv::cuda::GpuMat &normals_out, + const cv::cuda::GpuMat &depth_in, + float smoothing, + int radius, + const ftl::rgbd::Camera &camera, + cudaStream_t stream +) { + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + normals_out.create(normals_in.size(), CV_16FC4); + + switch (radius) { + case 5: mls_smooth_kernel<5><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 4: mls_smooth_kernel<4><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 3: mls_smooth_kernel<3><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 2: mls_smooth_kernel<2><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + case 1: mls_smooth_kernel<1><<<gridSize, blockSize, 0, stream>>>(normals_in.ptr<half4>(), normals_out.ptr<half4>(), depth_in.ptr<float>(), nullptr, normals_in.step1()/4, depth_in.step1(), smoothing, camera); break; + } + cudaSafeCall( cudaGetLastError() ); +} diff --git a/components/operators/src/smoothing.cpp b/components/operators/src/surface/smoothing.cpp similarity index 97% rename from components/operators/src/smoothing.cpp rename to components/operators/src/surface/smoothing.cpp index 347b6b7754b49363be98578a94e1d049b6ffec1a..dd96fe6fcfaafac8c48b3d17af649c07257ca87a 100644 --- a/components/operators/src/smoothing.cpp +++ b/components/operators/src/surface/smoothing.cpp @@ -1,5 +1,6 @@ #include <ftl/operators/smoothing.hpp> -#include "smoothing_cuda.hpp" +#include <ftl/operators/cuda/smoothing_cuda.hpp> +#include <ftl/operators/cuda/mls_cuda.hpp> #define LOGURU_REPLACE_GLOG 1 #include <loguru.hpp> @@ -163,10 +164,10 @@ bool SimpleMLS::apply(ftl::rgbd::Frame &in, ftl::rgbd::Frame &out, cudaStream_t // FIXME: Assume in and out are the same frame. for (int i=0; i<iters; ++i) { ftl::cuda::mls_smooth( - in.createTexture<half4>(Channel::Normals), - temp.createTexture<half4>(Channel::Normals, ftl::rgbd::Format<half4>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), - in.createTexture<float>(Channel::Depth), - temp.createTexture<float>(Channel::Depth, ftl::rgbd::Format<float>(in.get<cv::cuda::GpuMat>(Channel::Depth).size())), + in.get<GpuMat>(Channel::Normals), + temp.create<GpuMat>(Channel::Normals), + in.get<GpuMat>(Channel::Depth), + temp.create<GpuMat>(Channel::Depth), thresh, radius, in.getLeftCamera(), diff --git a/components/operators/src/smoothing.cu b/components/operators/src/surface/smoothing.cu similarity index 99% rename from components/operators/src/smoothing.cu rename to components/operators/src/surface/smoothing.cu index a2b7377fba011243c08cf5347f3dea4800b8ab51..a1184475317a174e65c30ea087a4922f20e3ccb0 100644 --- a/components/operators/src/smoothing.cu +++ b/components/operators/src/surface/smoothing.cu @@ -1,4 +1,4 @@ -#include "smoothing_cuda.hpp" +#include <ftl/operators/cuda/smoothing_cuda.hpp> #include <ftl/cuda/weighting.hpp> diff --git a/components/renderers/cpp/CMakeLists.txt b/components/renderers/cpp/CMakeLists.txt index 5346849e22b23a31471ca6b3a1b52dc8eb37209d..fe93b78bb84000a5a499eda8dd136a5f7c153fae 100644 --- a/components/renderers/cpp/CMakeLists.txt +++ b/components/renderers/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ target_include_directories(ftlrender PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/ext/nanogui/include> $<INSTALL_INTERFACE:include> PRIVATE src) -target_link_libraries(ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) +target_link_libraries(ftlrender ftlrgbd ftlcommon cudatl Eigen3::Eigen Threads::Threads nanogui ${NANOGUI_EXTRA_LIBS} ${OpenCV_LIBS}) target_precompile_headers(ftlrender REUSE_FROM ftldata) diff --git a/components/renderers/cpp/include/ftl/render/CUDARender.hpp b/components/renderers/cpp/include/ftl/render/CUDARender.hpp index a4e65bce6ba1aa40dbce3c4a8ba24747f258300a..59eb1377685af76a5eed1d78314dd5c9c76b698d 100644 --- a/components/renderers/cpp/include/ftl/render/CUDARender.hpp +++ b/components/renderers/cpp/include/ftl/render/CUDARender.hpp @@ -58,9 +58,13 @@ class CUDARender : public ftl::render::FSRenderer { ftl::cuda::TextureObject<float4> accum_; // 2 is number of channels can render together ftl::cuda::TextureObject<int> contrib_; //ftl::cuda::TextureObject<half4> normals_; + cv::cuda::GpuMat colour_scale_; + cv::cuda::GpuMat mls_contrib_; + cv::cuda::GpuMat mls_centroid_; + cv::cuda::GpuMat mls_normals_; - std::list<ftl::cuda::TextureObject<short2>*> screen_buffers_; - std::list<ftl::cuda::TextureObject<float>*> depth_buffers_; + std::list<cv::cuda::GpuMat*> screen_buffers_; + std::list<cv::cuda::GpuMat*> depth_buffers_; ftl::cuda::TextureObject<float> depth_out_; ftl::cuda::Collision *collisions_; @@ -118,8 +122,8 @@ class CUDARender : public ftl::render::FSRenderer { bool _alreadySeen() const { return last_frame_ == scene_->timestamp(); } void _adjustDepthThresholds(const ftl::rgbd::Camera &fcam); - ftl::cuda::TextureObject<float> &_getDepthBuffer(const cv::Size &); - ftl::cuda::TextureObject<short2> &_getScreenBuffer(const cv::Size &); + cv::cuda::GpuMat &_getDepthBuffer(const cv::Size &); + cv::cuda::GpuMat &_getScreenBuffer(const cv::Size &); inline ftl::codecs::Channel _getDepthChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Depth : ftl::codecs::Channel::Depth2; } inline ftl::codecs::Channel _getNormalsChannel() const { return (out_chan_ == ftl::codecs::Channel::Colour) ? ftl::codecs::Channel::Normals : ftl::codecs::Channel::Normals2; } diff --git a/components/renderers/cpp/src/CUDARender.cpp b/components/renderers/cpp/src/CUDARender.cpp index dab897d5696f78ebffbec097485778bebd57c03b..0f888dec2b9b75b1de77c791f9295d1563c113b5 100644 --- a/components/renderers/cpp/src/CUDARender.cpp +++ b/components/renderers/cpp/src/CUDARender.cpp @@ -6,6 +6,8 @@ #include <ftl/operators/cuda/mask.hpp> #include <ftl/render/colouriser.hpp> #include <ftl/cuda/transform.hpp> +#include <ftl/operators/cuda/mls_cuda.hpp> +#include <ftl/utility/image_debug.hpp> #include <ftl/cuda/colour_cuda.hpp> @@ -218,20 +220,20 @@ void CUDARender::_adjustDepthThresholds(const ftl::rgbd::Camera &fcam) { params_.depthCoef = fcam.baseline*fcam.fx; } -ftl::cuda::TextureObject<float> &CUDARender::_getDepthBuffer(const cv::Size &size) { +cv::cuda::GpuMat &CUDARender::_getDepthBuffer(const cv::Size &size) { for (auto *b : depth_buffers_) { - if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b; + if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b; } - auto *nb = new ftl::cuda::TextureObject<float>(size.width, size.height); + auto *nb = new cv::cuda::GpuMat(size, CV_16S); depth_buffers_.push_back(nb); return *nb; } -ftl::cuda::TextureObject<short2> &CUDARender::_getScreenBuffer(const cv::Size &size) { +cv::cuda::GpuMat &CUDARender::_getScreenBuffer(const cv::Size &size) { for (auto *b : screen_buffers_) { - if (b->width() == static_cast<size_t>(size.width) && b->height() == static_cast<size_t>(size.height)) return *b; + if (b->cols == static_cast<size_t>(size.width) && b->rows == static_cast<size_t>(size.height)) return *b; } - auto *nb = new ftl::cuda::TextureObject<short2>(size.width, size.height); + auto *nb = new cv::cuda::GpuMat(size, CV_16SC2); screen_buffers_.push_back(nb); return *nb; } @@ -241,18 +243,20 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre bool do_blend = value("mesh_blend", false); float blend_alpha = value("blend_alpha", 0.02f); - if (do_blend) { - temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream); - temp_.set<GpuMat>(Channel::Weights).setTo(cv::Scalar(0.0f), cvstream); - } else { + //if (do_blend) { + // temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream); + // temp_.set<GpuMat>(Channel::Weights).setTo(cv::Scalar(0.0f), cvstream); + //} else { temp_.set<GpuMat>(Channel::Depth2).setTo(cv::Scalar(0x7FFFFFFF), cvstream); - } + //} int valid_count = 0; // FIXME: Is it possible to remember previously if there should be depth? bool use_depth = scene_->anyHasChannel(Channel::Depth) || scene_->anyHasChannel(Channel::GroundTruth); + //if (!colour_scale_.empty()) ftl::utility::show_image(colour_scale_, "CScale", 1.0f, ftl::utility::ImageVisualisation::HEAT_MAPPED); + // For each source depth map for (size_t i=0; i < scene_->frames.size(); ++i) { //if (!scene_->hasFrame(i)) continue; @@ -284,14 +288,14 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre if (use_depth) { if (f.hasChannel(Channel::Depth)) { ftl::cuda::screen_coord( - f.createTexture<float>(Channel::Depth), + f.get<cv::cuda::GpuMat>(Channel::Depth), depthbuffer, screenbuffer, params_, transform, f.getLeftCamera(), stream ); } else if (f.hasChannel(Channel::GroundTruth)) { ftl::cuda::screen_coord( - f.createTexture<float>(Channel::GroundTruth), + f.get<cv::cuda::GpuMat>(Channel::GroundTruth), depthbuffer, screenbuffer, params_, transform, f.getLeftCamera(), stream @@ -307,16 +311,16 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre } // Must reset depth channel if blending - if (do_blend) { - temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream); - } + //if (do_blend) { + // temp_.set<GpuMat>(Channel::Depth).setTo(cv::Scalar(0x7FFFFFFF), cvstream); + //} depth_out_.to_gpumat().setTo(cv::Scalar(1000.0f), cvstream); // Decide on and render triangles around each point ftl::cuda::triangle_render1( depthbuffer, - temp_.createTexture<int>((do_blend) ? Channel::Depth : Channel::Depth2), + temp_.create<cv::cuda::GpuMat>((do_blend) ? Channel::Depth : Channel::Depth2), screenbuffer, params_, stream ); @@ -324,7 +328,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre // TODO: Reproject here // And merge based upon weight adjusted distances - if (do_blend) { + /*if (do_blend) { // Blend this sources mesh with previous meshes ftl::cuda::mesh_blender( temp_.getTexture<int>(Channel::Depth), @@ -338,7 +342,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre blend_alpha, stream ); - } + }*/ } if (valid_count == 0) return; @@ -346,7 +350,7 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre // Convert from int depth to float depth //temp_.get<GpuMat>(Channel::Depth2).convertTo(out.get<GpuMat>(Channel::Depth), CV_32F, 1.0f / 100000.0f, cvstream); - if (do_blend) { + /*if (do_blend) { ftl::cuda::dibr_normalise( //out.getTexture<float>(_getDepthChannel()), //out.getTexture<float>(_getDepthChannel()), @@ -355,23 +359,69 @@ void CUDARender::_mesh(ftl::rgbd::Frame &out, const Eigen::Matrix4d &t, cudaStre temp_.getTexture<float>(Channel::Weights), stream_ ); - } else { + } else {*/ //ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), out.createTexture<float>(_getDepthChannel()), 1.0f / 100000.0f, stream_); - ftl::cuda::merge_convert_depth(temp_.getTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_); - } + ftl::cuda::merge_convert_depth(temp_.createTexture<int>(Channel::Depth2), depth_out_, 1.0f / 100000.0f, stream_); + //} // Now merge new render to any existing frameset render, detecting collisions ftl::cuda::touch_merge(depth_out_, out.createTexture<float>(_getDepthChannel()), collisions_, 1024, touch_dist_, stream_); - //filters_->filter(out, src, stream); + // Generate actual depth map using MLS with mesh as estimate + float mls_smoothing = value("mls_smooth", 0.05f); + int mls_iter = value("mls_iter", 3); + int mls_radius = value("mls_radius", 0); - // Generate normals for final virtual image - ftl::cuda::normals( + if (mls_radius == 0) { + ftl::cuda::normals( + out.createTexture<half4>(_getNormalsChannel()), + out.getTexture<float>(_getDepthChannel()), + params_.camera, stream_); + } else { + ftl::cuda::normals( + temp_.createTexture<half4>(Channel::Normals), + out.getTexture<float>(_getDepthChannel()), + params_.camera, stream_); + + ftl::cuda::mls_smooth( + temp_.get<cv::cuda::GpuMat>(Channel::Normals), + out.create<cv::cuda::GpuMat>(_getNormalsChannel()), + out.get<cv::cuda::GpuMat>(_getDepthChannel()), + //out.getTexture<float>(_getDepthChannel()), + value("mls_smooth", 0.01f), + value("mls_radius", 2), + params_.camera, + stream_ + ); + + /*ftl::cuda::mls_smooth( + out.createTexture<half4>(_getNormalsChannel()), + temp_.createTexture<half4>(Channel::Normals), + out.getTexture<float>(_getDepthChannel()), + //out.getTexture<float>(_getDepthChannel()), + value("mls_smooth", 0.01f), + value("mls_radius", 2), + params_.camera, + stream_ + ); + + ftl::cuda::mls_smooth( + temp_.createTexture<half4>(Channel::Normals), + out.createTexture<half4>(_getNormalsChannel()), + out.getTexture<float>(_getDepthChannel()), + //out.getTexture<float>(_getDepthChannel()), + value("mls_smooth", 0.01f), + value("mls_radius", 2), + params_.camera, + stream_ + );*/ + } + + ftl::cuda::transform_normals( out.createTexture<half4>(_getNormalsChannel()), - temp_.createTexture<half4>(Channel::Normals), - out.getTexture<float>(_getDepthChannel()), - value("normal_radius", 1), value("normal_smoothing", 0.02f), - params_.camera, pose_.getFloat3x3(), poseInverse_.getFloat3x3(), stream_); + poseInverse_.getFloat3x3(), + stream_ + ); } void CUDARender::_allocateChannels(ftl::rgbd::Frame &out, ftl::codecs::Channel chan) { diff --git a/components/renderers/cpp/src/colouriser.cpp b/components/renderers/cpp/src/colouriser.cpp index a8399d016b3229fe04d009d49ca9b902f4e6e68e..2412aeb86addf8bd1c558f421a7e9e7230f20a27 100644 --- a/components/renderers/cpp/src/colouriser.cpp +++ b/components/renderers/cpp/src/colouriser.cpp @@ -143,7 +143,7 @@ TextureObject<uchar4> &Colouriser::_processNormals(ftl::rgbd::Frame &f, Channel light_diffuse.w = value("alpha", 0.5f)*255.0f; light_ambient.w = light_diffuse.w; - auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", 1.0f)); + auto light_pos = make_float3(value("light_x", 0.3f), value("light_y", 0.2f), value("light_z", -1.0f)); ftl::cuda::normal_visualise(f.createTexture<half4>(c), buf, light_pos, diff --git a/components/renderers/cpp/src/screen.cu b/components/renderers/cpp/src/screen.cu index 15473b9ad4781957b1b9f1c6949469db1c6d230c..3b3e4d3bf575f3d027b3edc36114cbd2eb3ae3c4 100644 --- a/components/renderers/cpp/src/screen.cu +++ b/components/renderers/cpp/src/screen.cu @@ -2,6 +2,7 @@ #include "splatter_cuda.hpp" #include <ftl/rgbd/camera.hpp> #include <ftl/cuda_common.hpp> +#include <cudatl/fixed.hpp> using ftl::rgbd::Camera; using ftl::cuda::TextureObject; @@ -34,29 +35,23 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Stretch>(const Parameters return make_uint2(params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos))); } -/*template <> -__device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters ¶ms, const float3 &camPos) { - float2 pt = params.camera.camToScreen<float2>(camPos); //params.viewport.map(params.camera, params.camera.camToScreen<float2>(camPos)); - const float coeff = 1.0f / (params.viewport.warpMatrix.entries[6] * pt.x + params.viewport.warpMatrix.entries[7] * pt.y + params.viewport.warpMatrix.entries[8]); - const float xcoo = coeff * (params.viewport.warpMatrix.entries[0] * pt.x + params.viewport.warpMatrix.entries[1] * pt.y + params.viewport.warpMatrix.entries[2]); - const float ycoo = coeff * (params.viewport.warpMatrix.entries[3] * pt.x + params.viewport.warpMatrix.entries[4] * pt.y + params.viewport.warpMatrix.entries[5]); - return make_uint2(xcoo, ycoo); -}*/ - /* * Convert source screen position to output screen coordinates. */ template <ftl::render::ViewPortMode VPMODE, Projection PROJECT> - __global__ void screen_coord_kernel(TextureObject<float> depth, - TextureObject<float> depth_out, - TextureObject<short2> screen_out, Parameters params, float4x4 pose, Camera camera) { + __global__ void screen_coord_kernel( + const float* __restrict__ depth, + short* __restrict__ depth_out, + short2* __restrict__ screen_out, + int pitch4, int pitch2, Parameters params, float4x4 pose, Camera camera) +{ const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x >= 0 && y >= 0 && x < depth.width() && y < depth.height()) { + if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) { //uint2 screenPos = make_uint2(30000,30000); - const float d = depth.tex2D(x, y); + const float d = depth[y*pitch4+x]; // Find the virtual screen position of current point const float3 camPos = (d > camera.minDepth && d < camera.maxDepth) ? pose * camera.screenToCam(x,y,d) : make_float3(0.0f,0.0f,0.0f); @@ -69,34 +64,42 @@ __device__ inline uint2 convertToScreen<ViewPortMode::Warping>(const Parameters screenPos.x >= params.camera.width || screenPos.y >= params.camera.height) screenPos = make_float3(30000,30000,0); - screen_out(x,y) = make_short2(screenPos.x, screenPos.y); - depth_out(x,y) = screenPos.z; + + screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(screenPos.z); } } -void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> &depth_out, - TextureObject<short2> &screen_out, const Parameters ¶ms, +void ftl::cuda::screen_coord(const cv::cuda::GpuMat &depth, cv::cuda::GpuMat &depth_out, + cv::cuda::GpuMat &screen_out, const Parameters ¶ms, const float4x4 &pose, const Camera &camera, cudaStream_t stream) { - const dim3 gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth.cols + THREADS_X - 1)/THREADS_X, (depth.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + depth_out.create(depth.size(), CV_16S); + screen_out.create(depth.size(), CV_16SC2); if (params.projection == Projection::PERSPECTIVE) { switch (params.viewPortMode) { - case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; + case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::PERSPECTIVE><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; } } else if (params.projection == Projection::EQUIRECTANGULAR) { switch (params.viewPortMode) { - case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; + case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::EQUIRECTANGULAR><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; } } else if (params.projection == Projection::ORTHOGRAPHIC) { switch (params.viewPortMode) { - case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; - case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth, depth_out, screen_out, params, pose, camera); break; + case ViewPortMode::Disabled: screen_coord_kernel<ViewPortMode::Disabled, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Clipping: screen_coord_kernel<ViewPortMode::Clipping, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; + case ViewPortMode::Stretch: screen_coord_kernel<ViewPortMode::Stretch, Projection::ORTHOGRAPHIC><<<gridSize, blockSize, 0, stream>>>(depth.ptr<float>(), depth_out.ptr<short>(), screen_out.ptr<short2>(), depth.step1(), depth_out.step1(), params, pose, camera); break; } } cudaSafeCall( cudaGetLastError() ); @@ -109,12 +112,16 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> & * Convert source screen position to output screen coordinates. Assumes a * constant depth of 1m instead of using a depth channel input. */ - __global__ void screen_coord_kernel(TextureObject<float> depth_out, - TextureObject<short2> screen_out, Camera vcamera, float4x4 pose, Camera camera) { + __global__ void screen_coord_kernel( + short* __restrict__ depth_out, + short2* __restrict__ screen_out, + int pitch4, int pitch2, + Camera vcamera, float4x4 pose, Camera camera) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x >= 0 && y >= 0 && x < depth_out.width() && y < depth_out.height()) { + if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) { //uint2 screenPos = make_uint2(30000,30000); const float d = camera.maxDepth; @@ -128,15 +135,21 @@ void ftl::cuda::screen_coord(TextureObject<float> &depth, TextureObject<float> & screenPos.y >= vcamera.height) screenPos = make_uint2(30000,30000); - screen_out(x,y) = make_short2(screenPos.x, screenPos.y); - depth_out(x,y) = camPos.z; + screen_out[y*pitch4+x] = make_short2(screenPos.x, screenPos.y); + depth_out[y*pitch2+x] = cudatl::float2fixed<10>(camPos.z); } } -void ftl::cuda::screen_coord(TextureObject<float> &depth_out, TextureObject<short2> &screen_out, const Parameters ¶ms, const float4x4 &pose, const Camera &camera, cudaStream_t stream) { - const dim3 gridSize((screen_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (screen_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); +void ftl::cuda::screen_coord(cv::cuda::GpuMat &depth_out, cv::cuda::GpuMat &screen_out, const Parameters ¶ms, const float4x4 &pose, const Camera &camera, cudaStream_t stream) { + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((camera.width + THREADS_X - 1)/THREADS_X, (camera.height + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + depth_out.create(camera.height, camera.width, CV_16S); + screen_out.create(camera.height, camera.width, CV_16SC2); - screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, screen_out, params.camera, pose, camera); + screen_coord_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out.ptr<short>(), screen_out.ptr<short2>(), screen_out.step1()/2, depth_out.step1(), params.camera, pose, camera); cudaSafeCall( cudaGetLastError() ); } diff --git a/components/renderers/cpp/src/splatter_cuda.hpp b/components/renderers/cpp/src/splatter_cuda.hpp index 41088cc226e34f4916c6919d8edc465906dbd499..aee5d0bc7dab2357c34713ec4eb628d141b0f248 100644 --- a/components/renderers/cpp/src/splatter_cuda.hpp +++ b/components/renderers/cpp/src/splatter_cuda.hpp @@ -7,26 +7,26 @@ namespace ftl { namespace cuda { void screen_coord( - ftl::cuda::TextureObject<float> &depth, - ftl::cuda::TextureObject<float> &depth_out, - ftl::cuda::TextureObject<short2> &screen_out, + const cv::cuda::GpuMat &depth, + cv::cuda::GpuMat &depth_out, + cv::cuda::GpuMat &screen_out, const ftl::render::Parameters ¶ms, const float4x4 &pose, const ftl::rgbd::Camera &camera, cudaStream_t stream); void screen_coord( - ftl::cuda::TextureObject<float> &depth_out, - ftl::cuda::TextureObject<short2> &screen_out, + cv::cuda::GpuMat &depth_out, + cv::cuda::GpuMat &screen_out, const ftl::render::Parameters ¶ms, const float4x4 &pose, const ftl::rgbd::Camera &camera, cudaStream_t stream); void triangle_render1( - ftl::cuda::TextureObject<float> &depth_in, - ftl::cuda::TextureObject<int> &depth_out, - ftl::cuda::TextureObject<short2> &screen, + const cv::cuda::GpuMat &depth_in, // short + cv::cuda::GpuMat &depth_out, // int + const cv::cuda::GpuMat &screen, // short2 const ftl::render::Parameters ¶ms, cudaStream_t stream); @@ -170,6 +170,15 @@ namespace cuda { ftl::cuda::TextureObject<float> &d2, float factor, cudaStream_t stream); + void reverse_verify( + cv::cuda::GpuMat &depth_in, + const cv::cuda::GpuMat &depth_original, + const float4x4 &transformR, + const float4x4 &transform, + const ftl::rgbd::Camera &vintrin, + const ftl::rgbd::Camera &ointrin, + cudaStream_t stream); + void fix_bad_colour( ftl::cuda::TextureObject<float> &depth, ftl::cuda::TextureObject<uchar4> &out, diff --git a/components/renderers/cpp/src/triangle_render.cu b/components/renderers/cpp/src/triangle_render.cu index dccdd276ad1ab8a6520a19a5b908c51db6b54823..301cf2a226d0cff47c9a1672fe7d78b124f3fc99 100644 --- a/components/renderers/cpp/src/triangle_render.cu +++ b/components/renderers/cpp/src/triangle_render.cu @@ -2,6 +2,7 @@ #include "splatter_cuda.hpp" #include <ftl/rgbd/camera.hpp> #include <ftl/cuda_common.hpp> +#include <ftl/cuda/fixed.hpp> #include <ftl/cuda/weighting.hpp> @@ -107,7 +108,7 @@ float getZAtCoordinate(const float3 &barycentricCoord, const float (&tri)[3]) { * being inside or outside (using bary centric coordinate method). If inside * then atomically write to the depth map. */ -__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters ¶ms, TextureObject<int> &depth_out) { +__device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Parameters ¶ms, int* depth_out, int out_pitch4) { const int minX = min(v[0].x, min(v[1].x, v[2].x)); const int minY = min(v[0].y, min(v[1].y, v[2].y)); const int maxX = max(v[0].x, max(v[1].x, v[2].x)); @@ -124,7 +125,7 @@ __device__ void drawTriangle(const float (&d)[3], const short2 (&v)[3], const Pa if (sx < params.camera.width && sx >= 0 && sy < params.camera.height && sy >= 0 && isBarycentricCoordInBounds(baryCentricCoordinate)) { float new_depth = getZAtCoordinate(baryCentricCoordinate, d); - atomicMin(&depth_out(sx,sy), int(new_depth*100000.0f)); + atomicMin(&depth_out[sx+sy*out_pitch4], int(new_depth*100000.0f)); } } } @@ -145,11 +146,11 @@ __device__ inline bool isValidTriangle(const short2 (&v)[3]) { * which verticies to load. */ template <int A, int B> -__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const TextureObject<float> &depth_in, const TextureObject<short2> &screen) { - d[1] = depth_in.tex2D(x+A,y); - d[2] = depth_in.tex2D(x,y+B); - v[1] = screen.tex2D(x+A,y); - v[2] = screen.tex2D(x,y+B); +__device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const short* __restrict__ depth_in, const short2* __restrict__ screen, int pitch4, int pitch2) { + d[1] = fixed2float<10>(depth_in[y*pitch2+x+A]); + d[2] = fixed2float<10>(depth_in[(y+B)*pitch2+x]); + v[1] = screen[y*pitch4+x+A]; + v[2] = screen[(y+B)*pitch4+x]; return isValidTriangle(v); } @@ -157,37 +158,44 @@ __device__ bool loadTriangle(int x, int y, float (&d)[3], short2 (&v)[3], const * Convert source screen position to output screen coordinates. */ __global__ void triangle_render_kernel( - TextureObject<float> depth_in, - TextureObject<int> depth_out, - TextureObject<short2> screen, Parameters params) { + const short* __restrict__ depth_in, + int* depth_out, + const short2* __restrict__ screen, + int width, int height, int pitch2, int pitch4, int out_pitch4, Parameters params) { + const int x = blockIdx.x*blockDim.x + threadIdx.x; const int y = blockIdx.y*blockDim.y + threadIdx.y; - if (x >= 1 && x < depth_in.width()-1 && y >= 1 && y < depth_in.height()-1) { + if (x >= 1 && x < width-1 && y >= 1 && y < height-1) { float d[3]; - d[0] = depth_in.tex2D(x,y); + d[0] = fixed2float<10>(depth_in[y*pitch2+x]); short2 v[3]; - v[0] = screen.tex2D(x,y); + v[0] = screen[y*pitch4+x]; if (v[0].x < 30000) { // Calculate discontinuity threshold. //const float threshold = (params.depthCoef / ((params.depthCoef / d[0]) - params.disconDisparities)) - d[0]; // Draw (optionally) 4 triangles as a diamond pattern around the central point. - if (loadTriangle<1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out); - if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out); - if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out); - if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen)) drawTriangle(d, v, params, depth_out); + if (loadTriangle<1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4); + if (loadTriangle<1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4); + if (loadTriangle<-1,1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4); + if (loadTriangle<-1,-1>(x, y, d, v, depth_in, screen, pitch4, pitch2)) drawTriangle(d, v, params, depth_out, out_pitch4); } } } -void ftl::cuda::triangle_render1(TextureObject<float> &depth_in, TextureObject<int> &depth_out, TextureObject<short2> &screen, const Parameters ¶ms, cudaStream_t stream) { - const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); - const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); +void ftl::cuda::triangle_render1(const cv::cuda::GpuMat &depth_in, cv::cuda::GpuMat &depth_out, const cv::cuda::GpuMat &screen, const Parameters ¶ms, cudaStream_t stream) { + static constexpr int THREADS_X = 8; + static constexpr int THREADS_Y = 8; + + const dim3 gridSize((depth_in.cols + THREADS_X - 1)/THREADS_X, (depth_in.rows + THREADS_Y - 1)/THREADS_Y); + const dim3 blockSize(THREADS_X, THREADS_Y); + + depth_out.create(params.camera.height, params.camera.width, CV_32S); - triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, screen, params); + triangle_render_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in.ptr<short>(), depth_out.ptr<int>(), screen.ptr<short2>(), depth_in.cols, depth_in.rows, depth_in.step1(), screen.step1()/2, depth_out.step1(), params); cudaSafeCall( cudaGetLastError() ); } @@ -202,7 +210,8 @@ __global__ void merge_convert_kernel( if (x < 0 || x >= depth_in.width() || y < 0 || y >= depth_in.height()) return; - float a = float(depth_in.tex2D(x,y))*alpha; + int d = depth_in.tex2D(x,y); + float a = float(d)*alpha; float b = depth_out.tex2D(x,y); depth_out(x,y) = min(a,b); } diff --git a/components/streams/src/feed.cpp b/components/streams/src/feed.cpp index d5d5533b6a23064f5efafda322b6675e47433f4d..f9b468605bcc336e3bf6d89f137eae7fe252a087 100644 --- a/components/streams/src/feed.cpp +++ b/components/streams/src/feed.cpp @@ -15,6 +15,7 @@ #include <ftl/operators/detectandtrack.hpp> #include <ftl/operators/weighting.hpp> #include <ftl/operators/mvmls.hpp> +#include <ftl/operators/fusion.hpp> #include <ftl/operators/clipping.hpp> #include <ftl/operators/poser.hpp> #include <ftl/operators/gt_analysis.hpp> @@ -533,6 +534,7 @@ void Feed::_createPipeline(uint32_t fsid) { p->append<ftl::operators::BorderMask>("border_mask"); p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity"); p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false); + p->append<ftl::operators::Fusion>("fusion")->value("enabled", false); p->append<ftl::operators::DisplayMask>("display_mask")->value("enabled", false); p->append<ftl::operators::Poser>("poser")->value("enabled", true); p->append<ftl::operators::GTAnalysis>("gtanalyse"); diff --git a/lib/cudatl/include/cudatl/fixed.hpp b/lib/cudatl/include/cudatl/fixed.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12afa70213cbd9326703ada220fdab5e723595aa --- /dev/null +++ b/lib/cudatl/include/cudatl/fixed.hpp @@ -0,0 +1,23 @@ +#ifndef _CUDATL_FIXED_HPP_ +#define _CUDATL_FIXED_HPP_ + +namespace cudatl +{ + +template <int FRAC> +__device__ inline float fixed2float(short v); + +template <int FRAC> +__device__ inline short float2fixed(float v); + +template <int FRAC> +__device__ inline float fixed2float8(int8_t v); + +template <int FRAC> +__device__ inline int8_t float2fixed8(float v); + +} + +#include <cudatl/impl/fixed.hpp> + +#endif diff --git a/lib/cudatl/include/cudatl/halfwarp.hpp b/lib/cudatl/include/cudatl/halfwarp.hpp index 3085381ae4d6b8ca33f2e690232e7204828f6832..c615fe73507b21a41f9bc7b2acf46955eb17c74b 100644 --- a/lib/cudatl/include/cudatl/halfwarp.hpp +++ b/lib/cudatl/include/cudatl/halfwarp.hpp @@ -3,15 +3,18 @@ #include <cuda_runtime.h> -namespace cudatl { +namespace cudatl +{ static constexpr int HALF_WARP_SIZE = 16; static constexpr unsigned int HALF_MASK1 = 0xFFFF0000; static constexpr unsigned int HALF_MASK2 = 0x0000FFFF; template <typename T> -__device__ inline T halfWarpMin(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMin(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -19,8 +22,10 @@ __device__ inline T halfWarpMin(T e) { } template <typename T> -__device__ inline T halfWarpMax(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpMax(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -28,8 +33,10 @@ __device__ inline T halfWarpMax(T e) { } template <typename T> -__device__ inline T halfWarpSum(T e) { - for (int i = WARP_SIZE/4; i > 0; i /= 2) { +__device__ inline T halfWarpSum(T e) +{ + for (int i = WARP_SIZE/4; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } diff --git a/lib/cudatl/include/cudatl/host_utility.hpp b/lib/cudatl/include/cudatl/host_utility.hpp index 39f066c3573f10db41a0d34b0bcbc0a93d93a971..16019260f89fe1cc643544e99a7a5fd8fb49c69d 100644 --- a/lib/cudatl/include/cudatl/host_utility.hpp +++ b/lib/cudatl/include/cudatl/host_utility.hpp @@ -4,9 +4,11 @@ #include <cuda_runtime.hpp> #include <string> -namespace cudatl { +namespace cudatl +{ -inline safeCall(cudaError_t e) { +inline safeCall(cudaError_t e) +{ if (e != cudaSuccess) throw new std::exception(std::string("Cuda Error "+std::to_string(int(e)))); } diff --git a/lib/cudatl/include/cudatl/impl/fixed.hpp b/lib/cudatl/include/cudatl/impl/fixed.hpp new file mode 100644 index 0000000000000000000000000000000000000000..083641732acb6735212e899a84e1e530dc37e1d7 --- /dev/null +++ b/lib/cudatl/include/cudatl/impl/fixed.hpp @@ -0,0 +1,23 @@ +template <int FRAC> +__device__ inline float cudatl::fixed2float(short v) +{ + return float(v) / float(1 << FRAC); +} + +template <int FRAC> +__device__ inline short cudatl::float2fixed(float v) +{ + return short(v * float(1 << FRAC)); +} + +template <int FRAC> +__device__ inline float cudatl::fixed2float8(int8_t v) +{ + return float(v) / float(1 << FRAC); +} + +template <int FRAC> +__device__ inline int8_t cudatl::float2fixed8(float v) +{ + return int8_t(v * float(1 << FRAC)); +} diff --git a/lib/cudatl/include/cudatl/memory.hpp b/lib/cudatl/include/cudatl/memory.hpp index 3b249dc043e1ad09365a8d8662b3280dc352710e..a46402a07de4ede20c43a46792dc524fd098c232 100644 --- a/lib/cudatl/include/cudatl/memory.hpp +++ b/lib/cudatl/include/cudatl/memory.hpp @@ -3,25 +3,27 @@ #include <cudatl/host_utility.hpp> -namespace cudatl { +namespace cudatl +{ template <typename T> -T *allocate(size_t size) { -#ifdef USE_GPU +T *allocate(size_t size) +{ T *ptr; cudatl::safeCall(cudaMalloc(&ptr, size*sizeof(T))); return ptr; -#else - return new T[size]; -#endif } template <typename T> -T *allocate(size_t width, size_t height, uint &pitch) { - if (width == 1 || height == 1) { +T *allocate(size_t width, size_t height, uint &pitch) +{ + if (width == 1 || height == 1) + { pitch = width; return allocateMemory<T>((width > height) ? width : height); - } else { + } + else + { T *ptr; size_t ptmp; cudatl::safeCall(cudaMallocPitch(&ptr, &ptmp, width*sizeof(T), height)); @@ -31,7 +33,8 @@ T *allocate(size_t width, size_t height, uint &pitch) { } template <typename T> -void free(T *ptr) { +void free(T *ptr) +{ cudatl::safeCall(cudaFree(ptr)); } diff --git a/lib/cudatl/include/cudatl/warp.hpp b/lib/cudatl/include/cudatl/warp.hpp index 1a41181e6ec5ab2478dcfa7abcfedc8079d0b0b9..6a8dd898d77058d1f16ffc95fc0350efefd927b5 100644 --- a/lib/cudatl/include/cudatl/warp.hpp +++ b/lib/cudatl/include/cudatl/warp.hpp @@ -5,14 +5,17 @@ #define __cuda__ __host__ __device__ -namespace cudatl { +namespace cudatl +{ static constexpr int WARP_SIZE = 32; static constexpr unsigned int FULL_MASK = 0xFFFFFFFF; template <typename T> -__device__ inline T warpMin(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMin(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = min(e, other); } @@ -20,8 +23,10 @@ __device__ inline T warpMin(T e) { } template <typename T> -__device__ inline T warpMax(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpMax(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e = max(e, other); } @@ -29,8 +34,10 @@ __device__ inline T warpMax(T e) { } template <typename T> -__device__ inline T warpSum(T e) { - for (int i = WARP_SIZE/2; i > 0; i /= 2) { +__device__ inline T warpSum(T e) +{ + for (int i = WARP_SIZE/2; i > 0; i /= 2) + { const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); e += other; } @@ -43,9 +50,11 @@ __device__ inline T warpSum(T e) { * TODO: This could be more efficient, perhaps with _shfl_XXX */ template <typename T> -inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) { +inline __device__ int warpScan(volatile T *s_Data, int tix, T threshold) +{ const int thread = tix%32; - for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) { + for (uint offset = 1; offset < WARP_SIZE; offset <<= 1) + { __syncwarp(); const uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; __syncwarp();