diff --git a/CMakeLists.txt b/CMakeLists.txt index 8db5a8886616aa2b6d829e9090d40cbd973e4d88..6cb60d3926d30ee89edf8b98d5480d1383ff950d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,8 +96,8 @@ check_language(CUDA) if (CUDA_TOOLKIT_ROOT_DIR) enable_language(CUDA) set(CMAKE_CUDA_FLAGS "") -set(CMAKE_CUDA_FLAGS_DEBUG "-g -DDEBUG -D_DEBUG") -set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "--gpu-architecture=compute_61 -g -DDEBUG -D_DEBUG") +set(CMAKE_CUDA_FLAGS_RELEASE "--gpu-architecture=compute_61") set(HAVE_CUDA TRUE) include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) diff --git a/applications/reconstruct/CMakeLists.txt b/applications/reconstruct/CMakeLists.txt index 00e44a444ba45f6cb7b4f68ed2441a1c3ac4fd4c..ae06aab6d142ba8a2ad3a1e0970905e3a70468e9 100644 --- a/applications/reconstruct/CMakeLists.txt +++ b/applications/reconstruct/CMakeLists.txt @@ -19,6 +19,8 @@ set(REPSRC #src/virtual_source.cpp src/splat_render.cpp src/dibr.cu + src/depth_camera.cu + src/depth_camera.cpp ) add_executable(ftl-reconstruct ${REPSRC}) diff --git a/applications/reconstruct/include/ftl/depth_camera.hpp b/applications/reconstruct/include/ftl/depth_camera.hpp index f40581b65cd71b9f239b2c0e82dfd616deb7ff28..83d9e1bcbedf42025f569068332008251e5651b9 100644 --- a/applications/reconstruct/include/ftl/depth_camera.hpp +++ b/applications/reconstruct/include/ftl/depth_camera.hpp @@ -23,7 +23,10 @@ namespace voxhash { struct __align__(16) DepthCameraCUDA { cudaTextureObject_t depth; + cudaTextureObject_t depth2; + cudaTextureObject_t points; cudaTextureObject_t colour; + cudaTextureObject_t normal; DepthCameraParams params; float4x4 pose; float4x4 poseInverse; @@ -35,58 +38,23 @@ struct DepthCamera { // Host part // /////////////// - __host__ - DepthCamera() { - /*d_depthData = NULL; - d_colorData = NULL; - d_depthArray = NULL; - d_colorArray = NULL;*/ - - depth_mat_ = nullptr; - colour_mat_ = nullptr; - depth_tex_ = nullptr; - colour_tex_ = nullptr; - } - - __host__ - void alloc(const DepthCameraParams& params) { //! todo resizing??? 
- depth_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_32FC1); - colour_mat_ = new cv::cuda::GpuMat(params.m_imageHeight, params.m_imageWidth, CV_8UC4); - depth_tex_ = new ftl::cuda::TextureObject<float>((cv::cuda::PtrStepSz<float>)*depth_mat_); - colour_tex_ = new ftl::cuda::TextureObject<uchar4>((cv::cuda::PtrStepSz<uchar4>)*colour_mat_); - data.depth = depth_tex_->cudaTexture(); - data.colour = colour_tex_->cudaTexture(); - data.params = params; - } - - //__host__ - //void updateParams(const DepthCameraParams& params) { - // updateConstantDepthCameraParams(params); - //} - - __host__ - void updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream) { - depth_mat_->upload(depth, stream); - colour_mat_->upload(rgb, stream); - } - - __host__ - void free() { - if (depth_mat_) delete depth_mat_; - if (colour_mat_) delete colour_mat_; - delete depth_tex_; - delete colour_tex_; - } - - - // TODO(Nick) Should not need to pass all these pointers to device - cv::cuda::GpuMat *depth_mat_; - cv::cuda::GpuMat *colour_mat_; + __host__ DepthCamera(); + + __host__ void alloc(const DepthCameraParams& params, bool withNormals=false); + + __host__ void updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream); + + __host__ void free(); + + __host__ void _computeNormals(cudaStream_t stream); + ftl::cuda::TextureObject<float> *depth_tex_; + ftl::cuda::TextureObject<int> *depth2_tex_; + ftl::cuda::TextureObject<float4> *points_tex_; ftl::cuda::TextureObject<uchar4> *colour_tex_; - //cudaTextureObject_t depth_obj_; - //cudaTextureObject_t colour_obj_; + ftl::cuda::TextureObject<float4> *normal_tex_; + // This part is sent to device DepthCameraCUDA data; }; } diff --git a/applications/reconstruct/include/ftl/voxel_hash.hpp b/applications/reconstruct/include/ftl/voxel_hash.hpp index c0a228834337e60f5807313735eedee25eb51dcb..98c2eca90530c309b5d2e46848493634aaaa053c 100644 --- a/applications/reconstruct/include/ftl/voxel_hash.hpp +++ b/applications/reconstruct/include/ftl/voxel_hash.hpp @@ -37,7 +37,7 @@ typedef signed char schar; #include <ftl/depth_camera.hpp> #define SDF_BLOCK_SIZE 8 -#define SDF_BLOCK_SIZE_OLAP 7 +#define SDF_BLOCK_SIZE_OLAP 8 #ifndef MINF #define MINF __int_as_float(0xff800000) @@ -73,6 +73,7 @@ struct __align__(16) HashEntry { HashEntryHead head; uint voxels[16]; // 512 bits, 1 bit per voxel + //uint validity[16]; // Is the voxel valid, 512 bit /*__device__ void operator=(const struct HashEntry& e) { ((long long*)this)[0] = ((const long long*)&e)[0]; @@ -270,11 +271,13 @@ struct HashData { } __device__ - bool isSDFBlockInCameraFrustumApprox(const HashParams &hashParams, const DepthCameraParams &camera, const int3& sdfBlock) { + bool isInBoundingBox(const HashParams &hashParams, const int3& sdfBlock) { // NOTE (Nick): Changed, just assume all voxels are potentially in frustrum //float3 posWorld = virtualVoxelPosToWorld(SDFBlockToVirtualVoxelPos(sdfBlock)) + hashParams.m_virtualVoxelSize * 0.5f * (SDF_BLOCK_SIZE - 1.0f); //return camera.isInCameraFrustumApprox(hashParams.m_rigidTransformInverse, posWorld); - return true; + return !(hashParams.m_flags & ftl::voxhash::kFlagClipping) || sdfBlock.x > hashParams.m_minBounds.x && sdfBlock.x < hashParams.m_maxBounds.x && + sdfBlock.y > hashParams.m_minBounds.y && sdfBlock.y < hashParams.m_maxBounds.y && + sdfBlock.z > hashParams.m_minBounds.z && sdfBlock.z < hashParams.m_maxBounds.z; } //! 
computes the (local) virtual voxel pos of an index; idx in [0;511] diff --git a/applications/reconstruct/include/ftl/voxel_hash_params.hpp b/applications/reconstruct/include/ftl/voxel_hash_params.hpp index cb94404c3602cd237fed4bd683e7196b3adf4ab6..480e16d478a7a3c82d046f6de464d7bb20c04f64 100644 --- a/applications/reconstruct/include/ftl/voxel_hash_params.hpp +++ b/applications/reconstruct/include/ftl/voxel_hash_params.hpp @@ -12,35 +12,29 @@ namespace ftl { namespace voxhash { +static const unsigned int kFlagClipping = 0x00000001; +static const unsigned int kFlagMLS = 0x00000002; + //TODO might have to be split into static and dynamics struct __align__(16) HashParams { HashParams() { } - float4x4 m_rigidTransform; - float4x4 m_rigidTransformInverse; - unsigned int m_hashNumBuckets; - unsigned int m_deprecated1; - unsigned int m_deprecated2; //m_hashMaxCollisionLinkedListSize; - unsigned int m_numSDFBlocks; - - int m_SDFBlockSize; float m_virtualVoxelSize; - unsigned int m_numOccupiedBlocks; //occupied blocks in the viewing frustum - float m_maxIntegrationDistance; float m_truncScale; float m_truncation; unsigned int m_integrationWeightSample; unsigned int m_integrationWeightMax; - float3 m_streamingVoxelExtents; - int3 m_streamingGridDimensions; - int3 m_streamingMinGridPos; - unsigned int m_streamingInitialChunkListSize; - uint2 m_dummy; + int3 m_minBounds; + int3 m_maxBounds; + float m_spatialSmoothing; + float m_colourSmoothing; + float m_confidenceThresh; + unsigned int m_flags; }; } // namespace voxhash diff --git a/applications/reconstruct/src/depth_camera.cpp b/applications/reconstruct/src/depth_camera.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9a5291f932ce0681a8d36aa9ebcfb08bc292a411 --- /dev/null +++ b/applications/reconstruct/src/depth_camera.cpp @@ -0,0 +1,54 @@ +#include <ftl/depth_camera.hpp> +#include "depth_camera_cuda.hpp" +#include <opencv2/core/cuda_stream_accessor.hpp> + +using ftl::voxhash::DepthCamera; +using ftl::voxhash::DepthCameraCUDA; + +DepthCamera::DepthCamera() { + depth_tex_ = nullptr; + depth2_tex_ = nullptr; + points_tex_ = nullptr; + colour_tex_ = nullptr; + normal_tex_ = nullptr; +} + +void DepthCamera::alloc(const DepthCameraParams& params, bool withNormals) { //! todo resizing??? 
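+	// Allocate screen-sized texture objects: float depth, integer depth (used as an atomicMin target
+	// when splatting), world-space points and RGBA colour. The normal texture is only created when withNormals is set.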
+ depth_tex_ = new ftl::cuda::TextureObject<float>(params.m_imageWidth, params.m_imageHeight); + depth2_tex_ = new ftl::cuda::TextureObject<int>(params.m_imageWidth, params.m_imageHeight); + points_tex_ = new ftl::cuda::TextureObject<float4>(params.m_imageWidth, params.m_imageHeight); + colour_tex_ = new ftl::cuda::TextureObject<uchar4>(params.m_imageWidth, params.m_imageHeight); + data.depth = depth_tex_->cudaTexture(); + data.depth2 = depth2_tex_->cudaTexture(); + data.points = points_tex_->cudaTexture(); + data.colour = colour_tex_->cudaTexture(); + data.params = params; + + if (withNormals) { + normal_tex_ = new ftl::cuda::TextureObject<float4>(params.m_imageWidth, params.m_imageHeight); + data.normal = normal_tex_->cudaTexture(); + } else { + data.normal = 0; + } +} + +void DepthCamera::free() { + delete depth_tex_; + delete colour_tex_; + delete depth2_tex_; + delete points_tex_; + if (normal_tex_) delete normal_tex_; +} + +void DepthCamera::updateData(const cv::Mat &depth, const cv::Mat &rgb, cv::cuda::Stream &stream) { + depth_tex_->upload(depth, cv::cuda::StreamAccessor::getStream(stream)); + colour_tex_->upload(rgb, cv::cuda::StreamAccessor::getStream(stream)); + //if (normal_mat_) { + // _computeNormals(cv::cuda::StreamAccessor::getStream(stream)); + //} +} + +void DepthCamera::_computeNormals(cudaStream_t stream) { + //ftl::cuda::point_cloud((float3*)point_mat_->data, data, stream); + //ftl::cuda::compute_normals((float3*)point_mat_->data, normal_tex_, stream); +} diff --git a/applications/reconstruct/src/depth_camera.cu b/applications/reconstruct/src/depth_camera.cu new file mode 100644 index 0000000000000000000000000000000000000000..8490c38b2422bdd1834244bf5acf974a875b3a1e --- /dev/null +++ b/applications/reconstruct/src/depth_camera.cu @@ -0,0 +1,598 @@ +#include <ftl/cuda_common.hpp> +#include <ftl/cuda_util.hpp> +#include <ftl/depth_camera.hpp> +#include "depth_camera_cuda.hpp" + +#define T_PER_BLOCK 16 +#define MINF __int_as_float(0xff800000) + +using ftl::voxhash::DepthCameraCUDA; +using ftl::voxhash::HashData; +using ftl::voxhash::HashParams; +using ftl::cuda::TextureObject; +using ftl::render::SplatParams; + +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +extern __constant__ HashParams c_hashParams; + +__global__ void clear_depth_kernel(ftl::cuda::TextureObject<float> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = 1000.0f; //PINF; + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_depth_kernel(ftl::cuda::TextureObject<int> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = 0x7FFFFFFF; //PINF; + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_depth(const ftl::cuda::TextureObject<int> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const 
dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_depth_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_points_kernel(ftl::cuda::TextureObject<float4> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = make_float4(MINF,MINF,MINF,MINF); + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_points_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +__global__ void clear_colour_kernel(ftl::cuda::TextureObject<uchar4> depth) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < depth.width() && y < depth.height()) { + depth(x,y) = make_uchar4(76,76,76,76); + //colour(x,y) = make_uchar4(76,76,82,0); + } +} + +void ftl::cuda::clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream) { + const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK); + clear_colour_kernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(depth); +} + +// ===== Type convert ===== + +template <typename A, typename B> +__global__ void convert_kernel(const ftl::cuda::TextureObject<A> in, ftl::cuda::TextureObject<B> out, float scale) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < in.width() && y < in.height()) { + out(x,y) = ((float)in.tex2D((int)x,(int)y)) * scale; + } +} + +void ftl::cuda::float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureObject<int> &out, float scale, cudaStream_t stream) { + const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + convert_kernel<float,int><<<gridSize, blockSize, 0, stream>>>(in, out, scale); +} + +void ftl::cuda::int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, float scale, cudaStream_t stream) { + const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + convert_kernel<int,float><<<gridSize, blockSize, 0, stream>>>(in, out, scale); +} + +/// ===== MLS Smooth + +// TODO:(Nick) Put this in a common location (used in integrators.cu) +extern __device__ float spatialWeighting(float r); + +/* + * Kim, K., Chalidabhongse, T. H., Harwood, D., & Davis, L. (2005). + * Real-time foreground-background segmentation using codebook model. + * Real-Time Imaging. 
https://doi.org/10.1016/j.rti.2004.12.004 + */ + __device__ float colordiffFloat(const uchar4 &pa, const uchar4 &pb) { + const float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z; + const float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z; + const float xv_2 = pow(pb.x * pa.x + pb.y * pa.y + pb.z * pa.z, 2); + const float p_2 = xv_2 / v_2; + return sqrt(x_2 - p_2); +} + +__device__ float colordiffFloat2(const uchar4 &pa, const uchar4 &pb) { + float3 delta = make_float3((float)pa.x - (float)pb.x, (float)pa.y - (float)pb.y, (float)pa.z - (float)pb.z); + return length(delta); +} + +/* + * Colour weighting as suggested in: + * C. Kuster et al. Spatio-Temporal Geometry Fusion for Multiple Hybrid Cameras using Moving Least Squares Surfaces. 2014. + * c = colour distance + */ + __device__ float colourWeighting(float c) { + const float h = c_hashParams.m_colourSmoothing; + if (c >= h) return 0.0f; + float ch = c / h; + ch = 1.0f - ch*ch; + return ch*ch*ch*ch; +} + +#define WINDOW_RADIUS 9 + +__device__ float mlsCamera(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float weights = 0.0f; + + //#pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + //if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + float weight = spatialWeighting(length(pf - camPos)); + + if (weight > 0.0f) { + uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v); + weight *= colourWeighting(colordiffFloat2(c1,c2)); + + if (weight > 0.0f) { + wpos += weight* (camera.pose * camPos); + weights += weight; + } + } + //} + } + } + + //wpos += (camera.pose * pos); + + return weights; +} + +__device__ float mlsCameraBest(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float weights = 0.0f; + + //#pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + //if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float depth = tex2D<float>(camera.depth, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + float weight = spatialWeighting(length(pf - camPos)); + + if (weight > 0.0f) { + uchar4 c2 = tex2D<uchar4>(camera.colour, screenPos.x+u, screenPos.y+v); + weight *= colourWeighting(colordiffFloat2(c1,c2)); + + if (weight > weights) { + pos = weight* (camera.pose * camPos); + weights = weight; + } + } + //} + } + } + + wpos += pos; + //wpos += (camera.pose * pos); + + return weights; +} + +__device__ float mlsCameraPoint(int cam, const float3 &mPos, uchar4 c1, float3 &wpos) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + + const float3 pf = camera.poseInverse * mPos; + float3 pos = make_float3(0.0f, 0.0f, 0.0f); + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + float 
weights = 0.0f; + + + //float depth = tex2D<float>(camera.depth, screenPos.x, screenPos.y); + const float3 worldPos = make_float3(tex2D<float4>(camera.points, screenPos.x, screenPos.y)); + if (worldPos.z == MINF) return 0.0f; + + float weight = spatialWeighting(length(mPos - worldPos)); + + if (weight > 0.0f) { + wpos += weight* (worldPos); + weights += weight; + } + + return weights; +} + +__global__ void mls_smooth_kernel(ftl::cuda::TextureObject<float4> output, HashParams hashParams, int numcams, int cam) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = output.width(); + const int height = output.height(); + + const DepthCameraCUDA &mainCamera = c_cameras[cam]; + + if (x < width && y < height) { + + const float depth = tex2D<float>(mainCamera.depth, x, y); + const uchar4 c1 = tex2D<uchar4>(mainCamera.colour, x, y); + + float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + float weights = 0.0f; + + if (depth >= mainCamera.params.m_sensorDepthWorldMin && depth <= mainCamera.params.m_sensorDepthWorldMax) { + float3 mPos = mainCamera.pose * mainCamera.params.kinectDepthToSkeleton(x, y, depth); + + if ((!(hashParams.m_flags & ftl::voxhash::kFlagClipping)) || (mPos.x > hashParams.m_minBounds.x && mPos.x < hashParams.m_maxBounds.x && + mPos.y > hashParams.m_minBounds.y && mPos.y < hashParams.m_maxBounds.y && + mPos.z > hashParams.m_minBounds.z && mPos.z < hashParams.m_maxBounds.z)) { + + if (hashParams.m_flags & ftl::voxhash::kFlagMLS) { + for (uint cam2=0; cam2<numcams; ++cam2) { + if (cam2 == cam) continue; //weights += mlsCamera(cam2, mPos, c1, wpos); + weights += mlsCameraBest(cam2, mPos, c1, wpos); + } + wpos /= weights; + } else { + weights = 1000.0f; + wpos = mPos; + } + + //output(x,y) = (weights >= hashParams.m_confidenceThresh) ? make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF); + + const uint2 screenPos = make_uint2(mainCamera.params.cameraToKinectScreenInt(mainCamera.poseInverse * wpos)); + if (screenPos.x < output.width() && screenPos.y < output.height()) { + output(screenPos.x,screenPos.y) = (weights >= hashParams.m_confidenceThresh) ? 
make_float4(wpos, 0.0f) : make_float4(MINF,MINF,MINF,MINF); + } + } + } + } +} + +void ftl::cuda::mls_smooth(TextureObject<float4> &output, const HashParams &hashParams, int numcams, int cam, cudaStream_t stream) { + const dim3 gridSize((output.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + mls_smooth_kernel<<<gridSize, blockSize, 0, stream>>>(output, hashParams, numcams, cam); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + +#define RESAMPLE_RADIUS 7 + +__global__ void mls_resample_kernel(ftl::cuda::TextureObject<int> depthin, ftl::cuda::TextureObject<uchar4> colourin, ftl::cuda::TextureObject<float> depthout, HashParams hashParams, int numcams, SplatParams params) { + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = depthin.width(); + const int height = depthin.height(); + + if (x < width && y < height) { + + //const int depth = depthin.tex2D((int)x, (int)y); + //if (depth != 0x7FFFFFFF) { + // depthout(x,y) = (float)depth / 1000.0f; + // return; + //} + + struct map_t { + int d; + int quad; + }; + + map_t mappings[5]; + int mapidx = 0; + + for (int v=-RESAMPLE_RADIUS; v<=RESAMPLE_RADIUS; ++v) { + for (int u=-RESAMPLE_RADIUS; u<=RESAMPLE_RADIUS; ++u) { + + const int depth = depthin.tex2D((int)x+u, (int)y+v); + const uchar4 c1 = colourin.tex2D((int)x+u, (int)y+v); + + if (depth != 0x7FFFFFFF) { + int i=0; + for (i=0; i<mapidx; ++i) { + if (abs(mappings[i].d - depth) < 100) { + if (u < 0 && v < 0) mappings[i].quad |= 0x1; + if (u > 0 && v < 0) mappings[i].quad |= 0x2; + if (u > 0 && v > 0) mappings[i].quad |= 0x4; + if (u < 0 && v > 0) mappings[i].quad |= 0x8; + break; + } + } + if (i == mapidx && i < 5) { + mappings[mapidx].d = depth; + mappings[mapidx].quad = 0; + if (u < 0 && v < 0) mappings[mapidx].quad |= 0x1; + if (u > 0 && v < 0) mappings[mapidx].quad |= 0x2; + if (u > 0 && v > 0) mappings[mapidx].quad |= 0x4; + if (u < 0 && v > 0) mappings[mapidx].quad |= 0x8; + ++mapidx; + } else { + //printf("EXCEEDED\n"); + } + } + } + } + + int bestdepth = 1000000; + //int count = 0; + for (int i=0; i<mapidx; ++i) { + if (__popc(mappings[i].quad) >= 3 && mappings[i].d < bestdepth) bestdepth = mappings[i].d; + //if (mappings[i].quad == 15 && mappings[i].d < bestdepth) bestdepth = mappings[i].d; + //if (mappings[i].quad == 15) count ++; + } + + //depthout(x,y) = (mapidx == 5) ? 3.0f : 0.0f; + + if (bestdepth < 1000000) { + depthout(x,y) = (float)bestdepth / 1000.0f; + } + } +} + +void ftl::cuda::mls_resample(const TextureObject<int> &depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const HashParams &hashParams, int numcams, const SplatParams ¶ms, cudaStream_t stream) { + const dim3 gridSize((depthin.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthin.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + mls_resample_kernel<<<gridSize, blockSize, 0, stream>>>(depthin, colourin, depthout, hashParams, numcams, params); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + + +/// ===== Median Filter ====== + +#define WINDOW_SIZE 3 +#define MEDIAN_RADIUS 3 +#define MEDIAN_SIZE (((MEDIAN_RADIUS*2)+1)*((MEDIAN_RADIUS*2)+1)) + +__global__ void medianFilterKernel(TextureObject<int> inputImageKernel, TextureObject<float> outputImagekernel) +{ + // Set row and colum for thread. 
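+	// Brute-force median: gather the (2*MEDIAN_RADIUS+1)^2 window of integer depths (millimetres),
+	// sort it in place and write the middle value out in metres.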
+	int row = blockIdx.y * blockDim.y + threadIdx.y;
+	int col = blockIdx.x * blockDim.x + threadIdx.x;
+	int filterVector[MEDIAN_SIZE] = {0};   //Take filter window
+	if((row<=MEDIAN_RADIUS) || (col<=MEDIAN_RADIUS) || (row>=inputImageKernel.height()-MEDIAN_RADIUS) || (col>=inputImageKernel.width()-MEDIAN_RADIUS))
+		outputImagekernel(col, row) = 0.0f; //Deal with boundary conditions
+	else {
+		for (int v = -MEDIAN_RADIUS; v <= MEDIAN_RADIUS; v++) {
+			for (int u = -MEDIAN_RADIUS; u <= MEDIAN_RADIUS; u++){
+				filterVector[(v+MEDIAN_RADIUS)*(2*MEDIAN_RADIUS+1)+u+MEDIAN_RADIUS] = inputImageKernel((col+u), (row+v));   // setup the filtering window.
+			}
+		}
+		for (int i = 0; i < MEDIAN_SIZE; i++) {
+			for (int j = i + 1; j < MEDIAN_SIZE; j++) {
+				if (filterVector[i] > filterVector[j]) {
+					//Swap the variables (int, since millimetre depths exceed char range).
+					int tmp = filterVector[i];
+					filterVector[i] = filterVector[j];
+					filterVector[j] = tmp;
+				}
+			}
+		}
+		outputImagekernel(col, row) = (float)filterVector[MEDIAN_SIZE/2] / 1000.0f;   //Set the output value (median of the sorted window).
+	}
+}
+
+void ftl::cuda::median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream) {
+	const dim3 gridSize((in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
+	const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	medianFilterKernel<<<gridSize, blockSize, 0, stream>>>(in, out);
+}
+
+
+/// ===== Hole Fill =====
+
+__device__ inline float distance2(float3 a, float3 b) {
+	const float x = a.x-b.x;
+	const float y = a.y-b.y;
+	const float z = a.z-b.z;
+	return x*x+y*y+z*z;
+}
+
+#define SPLAT_RADIUS 7
+#define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
+#define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
+
+__global__ void hole_fill_kernel(
+		TextureObject<int> depth_in,
+		TextureObject<float> depth_out, DepthCameraParams params) {
+	// Read an NxN region and
+	// - interpolate a depth value for this pixel
+	// - interpolate an rgb value for this pixel
+	// Must respect depth discontinuities.
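+	// Approach: stage a SPLAT_BOUNDS x SPLAT_BOUNDS tile of back-projected positions in shared memory,
+	// then keep the smallest neighbouring depth whose position lies within one voxel of where this pixel would back-project at that depth.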
+ // How much influence a given neighbour has depends on its depth value + + __shared__ float3 positions[SPLAT_BUFFER_SIZE]; + + const float voxelSize = c_hashParams.m_virtualVoxelSize; + + const int i = threadIdx.y*blockDim.y + threadIdx.x; + const int bx = blockIdx.x*blockDim.x; + const int by = blockIdx.y*blockDim.y; + const int x = bx + threadIdx.x; + const int y = by + threadIdx.y; + + // const float camMinDepth = params.camera.m_sensorDepthWorldMin; + // const float camMaxDepth = params.camera.m_sensorDepthWorldMax; + + for (int j=i; j<SPLAT_BUFFER_SIZE; j+=T_PER_BLOCK) { + const unsigned int sx = (j % SPLAT_BOUNDS)+bx-SPLAT_RADIUS; + const unsigned int sy = (j / SPLAT_BOUNDS)+by-SPLAT_RADIUS; + if (sx >= depth_in.width() || sy >= depth_in.height()) { + positions[j] = make_float3(1000.0f,1000.0f, 1000.0f); + } else { + positions[j] = params.kinectDepthToSkeleton(sx, sy, (float)depth_in.tex2D((int)sx,(int)sy) / 1000.0f); + } + } + + __syncthreads(); + + if (x >= depth_in.width() || y >= depth_in.height()) return; + + const float voxelSquared = voxelSize*voxelSize; + float mindepth = 1000.0f; + //int minidx = -1; + // float3 minpos; + + //float3 validPos[MAX_VALID]; + //int validIndices[MAX_VALID]; + //int validix = 0; + + for (int v=-SPLAT_RADIUS; v<=SPLAT_RADIUS; ++v) { + for (int u=-SPLAT_RADIUS; u<=SPLAT_RADIUS; ++u) { + //const int idx = (threadIdx.y+v)*SPLAT_BOUNDS+threadIdx.x+u; + const int idx = (threadIdx.y+v+SPLAT_RADIUS)*SPLAT_BOUNDS+threadIdx.x+u+SPLAT_RADIUS; + + float3 posp = positions[idx]; + const float d = posp.z; + //if (d < camMinDepth || d > camMaxDepth) continue; + + float3 pos = params.kinectDepthToSkeleton(x, y, d); + float dist = distance2(pos, posp); + + if (dist < voxelSquared) { + // Valid so check for minimum + //validPos[validix] = pos; + //validIndices[validix++] = idx; + if (d < mindepth) { + mindepth = d; + //minidx = idx; + // minpos = pos; + } + } + } + } + + depth_out(x,y) = mindepth; +} + +void ftl::cuda::hole_fill(const TextureObject<int> &depth_in, + const TextureObject<float> &depth_out, const DepthCameraParams ¶ms, cudaStream_t stream) +{ + + const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + hole_fill_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params); + cudaSafeCall( cudaGetLastError() ); + + #ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + #endif +} + + +/// ===== Point cloud from depth ===== + +__global__ void point_cloud_kernel(float3* output, DepthCameraCUDA depthCameraData) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = depthCameraData.params.m_imageWidth; + const int height = depthCameraData.params.m_imageHeight; + + if (x < width && y < height) { + float depth = tex2D<float>(depthCameraData.depth, x, y); + + output[y*width+x] = (depth >= depthCameraData.params.m_sensorDepthWorldMin && depth <= depthCameraData.params.m_sensorDepthWorldMax) ? 
+ depthCameraData.params.kinectDepthToSkeleton(x, y, depth) : + make_float3(MINF, MINF, MINF); + } +} + +void ftl::cuda::point_cloud(float3* output, const DepthCameraCUDA &depthCameraData, cudaStream_t stream) { + const dim3 gridSize((depthCameraData.params.m_imageWidth + T_PER_BLOCK - 1)/T_PER_BLOCK, (depthCameraData.params.m_imageHeight + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + point_cloud_kernel<<<gridSize, blockSize, 0, stream>>>(output, depthCameraData); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif +} + +/// ===== NORMALS ===== + + +__global__ void compute_normals_kernel(const float3 *input, ftl::cuda::TextureObject<float4> output) +{ + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + const int width = output.width(); + + if(x >= output.width() || y >= output.height()) return; + + output(x,y) = make_float4(MINF, MINF, MINF, MINF); + + if(x > 0 && x < output.width()-1 && y > 0 && y < output.height()-1) + { + // TODO:(Nick) Should use a 7x7 window + const float3 CC = input[(y+0)*width+(x+0)]; + const float3 PC = input[(y+1)*width+(x+0)]; + const float3 CP = input[(y+0)*width+(x+1)]; + const float3 MC = input[(y-1)*width+(x+0)]; + const float3 CM = input[(y+0)*width+(x-1)]; + + if(CC.x != MINF && PC.x != MINF && CP.x != MINF && MC.x != MINF && CM.x != MINF) + { + const float3 n = cross(PC-MC, CP-CM); + //const float l = length(n); + + //if(l > 0.0f) + //{ + output(x,y) = make_float4(n, 1.0f); //make_float4(n/-l, 1.0f); + //} + } + } +} + +void ftl::cuda::compute_normals(const float3 *input, ftl::cuda::TextureObject<float4> *output, cudaStream_t stream) { + const dim3 gridSize((output->width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (output->height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + compute_normals_kernel<<<gridSize, blockSize, 0, stream>>>(input, *output); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); + //cutilCheckMsg(__FUNCTION__); +#endif +} \ No newline at end of file diff --git a/applications/reconstruct/src/depth_camera_cuda.hpp b/applications/reconstruct/src/depth_camera_cuda.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a55bd53ad5b3cca95bbb1be6f3547f020ed56f2f --- /dev/null +++ b/applications/reconstruct/src/depth_camera_cuda.hpp @@ -0,0 +1,35 @@ +#ifndef _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ +#define _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ + +#include <ftl/depth_camera.hpp> +#include <ftl/voxel_hash.hpp> +#include "splat_params.hpp" + +namespace ftl { +namespace cuda { + +void clear_depth(const TextureObject<float> &depth, cudaStream_t stream); +void clear_depth(const TextureObject<int> &depth, cudaStream_t stream); +void clear_points(const ftl::cuda::TextureObject<float4> &depth, cudaStream_t stream); +void clear_colour(const ftl::cuda::TextureObject<uchar4> &depth, cudaStream_t stream); + +void median_filter(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, cudaStream_t stream); + +void int_to_float(const ftl::cuda::TextureObject<int> &in, ftl::cuda::TextureObject<float> &out, float scale, cudaStream_t stream); + +void float_to_int(const ftl::cuda::TextureObject<float> &in, ftl::cuda::TextureObject<int> &out, float scale, cudaStream_t stream); + +void mls_smooth(TextureObject<float4> &output, const ftl::voxhash::HashParams &hashParams, int numcams, int cam, cudaStream_t stream); + +void mls_resample(const TextureObject<int> 
&depthin, const TextureObject<uchar4> &colourin, TextureObject<float> &depthout, const ftl::voxhash::HashParams &hashParams, int numcams, const ftl::render::SplatParams ¶ms, cudaStream_t stream); + +void hole_fill(const TextureObject<int> &depth_in, const TextureObject<float> &depth_out, const DepthCameraParams ¶ms, cudaStream_t stream); + +void point_cloud(float3* output, const ftl::voxhash::DepthCameraCUDA &depthCameraData, cudaStream_t stream); + +void compute_normals(const float3 *points, ftl::cuda::TextureObject<float4> *normals, cudaStream_t stream); + +} +} + +#endif // _FTL_RECONSTRUCTION_CAMERA_CUDA_HPP_ diff --git a/applications/reconstruct/src/dibr.cu b/applications/reconstruct/src/dibr.cu index 575e3c45f9348c37c8397683db36d87edda677e4..9558a0e0d2ac4b7a3978ccd48a6f6791f99a337f 100644 --- a/applications/reconstruct/src/dibr.cu +++ b/applications/reconstruct/src/dibr.cu @@ -23,8 +23,54 @@ __global__ void clearColourKernel(TextureObject<uchar4> colour) { } } + +__global__ void dibr_depthmap_kernel( + TextureObject<int> depth, int numcams, SplatParams params) { + + const int i = threadIdx.y*blockDim.y + threadIdx.x; + const int bx = blockIdx.x*blockDim.x; + const int by = blockIdx.y*blockDim.y; + const int x = bx + threadIdx.x; + const int y = by + threadIdx.y; + + for (int j=0; j<numcams; ++j) { + const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; + + float4 d = tex2D<float4>(camera.points, x, y); + if (d.z < 0.0f) continue; + //if (d >= params.camera.m_sensorDepthWorldMax) continue; + + //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + + const float3 worldPos = make_float3(d); + const float3 camPos = params.m_viewMatrix * worldPos; + const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); + const uint2 screenPos = make_uint2(make_int2(screenPosf)); + + if (camPos.z < params.camera.m_sensorDepthWorldMin) continue; + + const unsigned int cx = screenPos.x; + const unsigned int cy = screenPos.y; + + + if (cx < depth.width() && cy < depth.height()) { + //float camd = depth_in.tex2D((int)cx,(int)cy); + //atomicMin(&depth(x,y), idepth); + //float camdiff = fabs(camPos.z-camd); + //if (camdiff < 0.1f) { + //colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); + //} else { + //colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255); + //} + + atomicMin(&depth(cx,cy), camPos.z * 1000.0f); + } + } +} + + __global__ void dibr_kernel( - TextureObject<float> depth_in, + TextureObject<int> depth_in, TextureObject<uchar4> colour_out, int numcams, SplatParams params) { const int i = threadIdx.y*blockDim.y + threadIdx.x; @@ -36,12 +82,13 @@ __global__ void dibr_kernel( for (int j=0; j<numcams; ++j) { const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; - float d = tex2D<float>(camera.depth, x, y); - if (d < 0.01f) continue; - if (d >= params.camera.m_sensorDepthWorldMax) continue; + float4 d = tex2D<float4>(camera.points, x, y); + if (d.z < 0.0f) continue; + //if (d >= params.camera.m_sensorDepthWorldMax) continue; - const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + //const float3 worldPos = camera.pose * camera.params.kinectDepthToSkeleton(x, y, d); + const float3 worldPos = make_float3(d); const float3 camPos = params.m_viewMatrix * worldPos; const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos); const uint2 screenPos = make_uint2(make_int2(screenPosf)); @@ -53,18 +100,28 @@ __global__ void dibr_kernel( if (cx < colour_out.width() && cy < colour_out.height()) { - float 
camd = depth_in.tex2D((int)cx,(int)cy); + //float camd = depth_in.tex2D((int)cx,(int)cy); //atomicMin(&depth(x,y), idepth); - float camdiff = fabs(camPos.z-camd); - if (camdiff < 0.1f) { - colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); - } else { + //float camdiff = fabs(camPos.z-camd); + //if (camdiff < 0.1f) { + + if (depth_in(cx,cy) == (int)(camPos.z * 1000.0f)) { + colour_out(cx,cy) = tex2D<uchar4>(camera.colour,x,y); + //colour_out(cx,cy) = (j==0) ? make_uchar4(20,255,0,255) : make_uchar4(20,0,255,255); + } + + + //} else { //colour_out(cx,cy) = make_uchar4(camdiff * 100, 0, 0, 255); - } + //} } } } +__device__ inline float4 make_float4(const uchar4 &c) { + return make_float4(c.x,c.y,c.z,c.w); +} + __global__ void dibr_kernel_rev( TextureObject<float> depth_in, TextureObject<uchar4> colour_out, int numcams, SplatParams params) { @@ -80,7 +137,10 @@ __global__ void dibr_kernel_rev( if (camd >= params.camera.m_sensorDepthWorldMax) return; const float3 worldPos = params.m_viewMatrixInverse * params.camera.kinectDepthToSkeleton(x, y, camd); - + float mindiff = 1000.0f; + float4 col = make_float4(0.0f,0.0f,0.0f,0.0f); + int count = 0; + for (int j=0; j<numcams; ++j) { const ftl::voxhash::DepthCameraCUDA camera = c_cameras[j]; @@ -95,24 +155,67 @@ __global__ void dibr_kernel_rev( if (cx < camera.params.m_imageWidth && cy < camera.params.m_imageHeight) { float d = tex2D<float>(camera.depth, (int)cx, (int)cy); - float camdiff = fabs(camPos.z-d); - if (camdiff < 0.1f) { - colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy); - } else { + float camdiff = fabs(camPos.z-d); + + if (camdiff < mindiff) { + mindiff = camdiff; + col += make_float4(tex2D<uchar4>(camera.colour,cx,cy)); + ++count; + } + + //if (camdiff < 0.1f) { + // colour_out(x,y) = tex2D<uchar4>(camera.colour,cx,cy); + //} else { //colour_out(x,y) = make_uchar4(camdiff * 100, 0, 0, 255); - } + //} } } + + if (count > 0) { + col = col / (float)count; + colour_out(x,y) = make_uchar4(col.x,col.y,col.z,255); + } else { + colour_out(x,y) = make_uchar4(76,76,76,255); + } +} + +void ftl::cuda::dibr(const TextureObject<int> &depth_out, + const TextureObject<uchar4> &colour_out, int numcams, const SplatParams ¶ms, cudaStream_t stream) { + + const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif + + dibr_depthmap_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, numcams, params); + dibr_kernel<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params); + cudaSafeCall( cudaGetLastError() ); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif } -void ftl::cuda::dibr(const TextureObject<float> &depth_in, +void ftl::cuda::dibr(const TextureObject<float> &depth_out, const TextureObject<uchar4> &colour_out, int numcams, const SplatParams ¶ms, cudaStream_t stream) { - const dim3 gridSize((depth_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 gridSize((depth_out.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth_out.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + clearColourKernel<<<gridSize, blockSize, 0, stream>>>(colour_out); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif 
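+	// Reverse DIBR: each output pixel is projected into every camera, and colours from the cameras
+	// with the closest depth agreement are accumulated and averaged.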
- dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_in, colour_out, numcams, params); - cudaSafeCall( cudaGetLastError() ); + dibr_kernel_rev<<<gridSize, blockSize, 0, stream>>>(depth_out, colour_out, numcams, params); + cudaSafeCall( cudaGetLastError() ); + +#ifdef _DEBUG + cudaSafeCall(cudaDeviceSynchronize()); +#endif } diff --git a/applications/reconstruct/src/integrators.cu b/applications/reconstruct/src/integrators.cu index 1259210b4ea1b292f1adf00dfd0517bf04734c40..ae84d964af63d4a1d0eb5dc800b59939792ec8bc 100644 --- a/applications/reconstruct/src/integrators.cu +++ b/applications/reconstruct/src/integrators.cu @@ -7,6 +7,8 @@ #include <ftl/cuda_common.hpp> #define T_PER_BLOCK 8 +#define NUM_CUDA_BLOCKS 10000 +#define WARP_SIZE 32 using ftl::voxhash::HashData; using ftl::voxhash::HashParams; @@ -15,45 +17,13 @@ using ftl::voxhash::HashEntry; using ftl::voxhash::HashEntryHead; using ftl::voxhash::FREE_ENTRY; +extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +extern __constant__ HashParams c_hashParams; + __device__ float4 make_float4(uchar4 c) { return make_float4(static_cast<float>(c.x), static_cast<float>(c.y), static_cast<float>(c.z), static_cast<float>(c.w)); } -inline __device__ uchar4 bilinearFilterColor(const DepthCameraParams& cameraParams, const float2& screenPos, cudaTextureObject_t colorTextureRef) { - //const DepthCameraParams& cameraParams = c_depthCameraParams; - const int imageWidth = cameraParams.m_imageWidth; - const int imageHeight = cameraParams.m_imageHeight; - const int2 p00 = make_int2(screenPos.x+0.5f, screenPos.y+0.5f); - const int2 dir = sign(make_float2(screenPos.x - p00.x, screenPos.y - p00.y)); - - const int2 p01 = p00 + make_int2(0.0f, dir.y); - const int2 p10 = p00 + make_int2(dir.x, 0.0f); - const int2 p11 = p00 + make_int2(dir.x, dir.y); - - const float alpha = (screenPos.x - p00.x)*dir.x; - const float beta = (screenPos.y - p00.y)*dir.y; - - float4 s0 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w0 = 0.0f; - if(p00.x >= 0 && p00.x < imageWidth && p00.y >= 0 && p00.y < imageHeight) { uchar4 v00 = tex2D<uchar4>(colorTextureRef, p00.x, p00.y); if(v00.x != 0) { s0 += (1.0f-alpha)*make_float4(v00); w0 += (1.0f-alpha); } } - if(p10.x >= 0 && p10.x < imageWidth && p10.y >= 0 && p10.y < imageHeight) { uchar4 v10 = tex2D<uchar4>(colorTextureRef, p10.x, p10.y); if(v10.x != 0) { s0 += alpha *make_float4(v10); w0 += alpha ; } } - - float4 s1 = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float w1 = 0.0f; - if(p01.x >= 0 && p01.x < imageWidth && p01.y >= 0 && p01.y < imageHeight) { uchar4 v01 = tex2D<uchar4>(colorTextureRef, p01.x, p01.y); if(v01.x != 0) { s1 += (1.0f-alpha)*make_float4(v01); w1 += (1.0f-alpha);} } - if(p11.x >= 0 && p11.x < imageWidth && p11.y >= 0 && p11.y < imageHeight) { uchar4 v11 = tex2D<uchar4>(colorTextureRef, p11.x, p11.y); if(v11.x != 0) { s1 += alpha *make_float4(v11); w1 += alpha ;} } - - const float4 p0 = s0/w0; - const float4 p1 = s1/w1; - - float4 ss = make_float4(0.0f, 0.0f, 0.0f, 0.0f); float ww = 0.0f; - if(w0 > 0.0f) { ss += (1.0f-beta)*p0; ww += (1.0f-beta); } - if(w1 > 0.0f) { ss += beta *p1; ww += beta ; } - - if(ww > 0.0f) { - ss /= ww; - return make_uchar4(ss.x,ss.y,ss.z,ss.w); - } else return make_uchar4(0, 0, 0, 0); -} - __device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { float x = c1.x-c2.x; float y = c1.y-c2.y; @@ -61,6 +31,11 @@ __device__ float colourDistance(const uchar4 &c1, const uchar3 &c2) { return x*x + y*y + z*z; } +/* + * Kim, K., Chalidabhongse, T. 
H., Harwood, D., & Davis, L. (2005). + * Real-time foreground-background segmentation using codebook model. + * Real-Time Imaging. https://doi.org/10.1016/j.rti.2004.12.004 + */ __device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) { float x_2 = pb.x * pb.x + pb.y * pb.y + pb.z * pb.z; float v_2 = pa.x * pa.x + pa.y * pa.y + pa.z * pa.z; @@ -69,81 +44,20 @@ __device__ bool colordiff(const uchar4 &pa, const uchar3 &pb, float epsilon) { return sqrt(x_2 - p_2) < epsilon; } -#define NUM_CUDA_BLOCKS 10000 - - - -/*__global__ void integrateRegistrationKernel(HashData hashData, HashParams hashParams, DepthCameraParams cameraParams, cudaTextureObject_t depthT, cudaTextureObject_t colourT) { - - // Stride over all allocated blocks - for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { - - //TODO check if we should load this in shared memory - HashEntry& entry = hashData.d_hashCompactified[bi]; - - - int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(entry.pos); - - uint i = threadIdx.x; //inside of an SDF block - int3 pi = pi_base + make_int3(hashData.delinearizeVoxelIndex(i)); - float3 pf = hashData.virtualVoxelPosToWorld(pi); - - pf = hashParams.m_rigidTransformInverse * pf; - uint2 screenPos = make_uint2(cameraParams.cameraToKinectScreenInt(pf)); - - // For this voxel in hash, get its screen position and check it is on screen - if (screenPos.x < cameraParams.m_imageWidth && screenPos.y < cameraParams.m_imageHeight) { //on screen - - //float depth = g_InputDepth[screenPos]; - float depth = tex2D<float>(depthT, screenPos.x, screenPos.y); - //if (depth > 20.0f) return; - - uchar4 color = make_uchar4(0, 0, 0, 0); - color = tex2D<uchar4>(colourT, screenPos.x, screenPos.y); - - // Depth is within accepted max distance from camera - if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) - float depthZeroOne = cameraParams.cameraToKinectProjZ(depth); - - // Calculate SDF of this voxel wrt the depth map value - float sdf = depth - pf.z; - float truncation = hashData.getTruncation(depth); - - if (sdf > -truncation) { - float weightUpdate = max(hashParams.m_integrationWeightSample * 1.5f * (1.0f-depthZeroOne), 1.0f); - - Voxel curr; //construct current voxel - curr.sdf = sdf; - curr.weight = weightUpdate; - curr.color = make_uchar3(color.x, color.y, color.z); - - uint idx = entry.ptr + i; - - Voxel out; - const Voxel &v1 = curr; - const Voxel &v0 = hashData.d_SDFBlocks[idx]; - - float redshift = (v0.weight > 0) ? 1.0f - ((v1.sdf - v0.sdf) / hashParams.m_truncation)*0.5f : 1.0f; - - out.color.x = min(max(v1.color.x*redshift,0.0f),255.0f); - out.color.y = min(max(v1.color.y*redshift,0.0f),255.0f); - out.color.z = min(max(v1.color.z*(1.0f / redshift),0.0f),255.0f); - - out.sdf = (v0.sdf * (float)v0.weight + v1.sdf * (float)v1.weight) / ((float)v0.weight + (float)v1.weight); - out.weight = min(c_hashParams.m_integrationWeightMax, (unsigned int)v0.weight + (unsigned int)v1.weight); - - hashData.d_SDFBlocks[idx] = out; - - } - } - } - - } // Stride loop -}*/ - -extern __constant__ ftl::voxhash::DepthCameraCUDA c_cameras[MAX_CAMERAS]; +/* + * Guennebaud, G.; Gross, M. Algebraic point set surfaces. ACMTransactions on Graphics Vol. 26, No. 3, Article No. 23, 2007. 
+ * Used in: FusionMLS: Highly dynamic 3D reconstruction with consumer-grade RGB-D cameras + * r = distance between points + * h = smoothing parameter in meters (default 4cm) + */ +__device__ float spatialWeighting(float r) { + const float h = c_hashParams.m_spatialSmoothing; + if (r >= h) return 0.0f; + float rh = r / h; + rh = 1.0f - rh*rh; + return rh*rh*rh*rh; +} -#define WARP_SIZE 32 __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParams, int numcams) { __shared__ uint all_warp_ballot; @@ -192,6 +106,10 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam //printf("screen pos %d\n", color.x); //return; + // TODO:(Nick) Accumulate weighted positions + // TODO:(Nick) Accumulate weighted normals + // TODO:(Nick) Accumulate weights + // Depth is within accepted max distance from camera if (depth > 0.01f && depth < hashParams.m_maxIntegrationDistance) { // valid depth and color (Nick: removed colour check) //camdepths[count] = depth; @@ -244,7 +162,8 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam // Calculate voxel sign values across a warp int warpNum = i / WARP_SIZE; - uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + //uint ballot_result = __ballot_sync(0xFFFFFFFF, (oldVoxel.sdf >= 0.0f) ? 0 : 1); + uint ballot_result = __ballot_sync(0xFFFFFFFF, (fabs(oldVoxel.sdf) <= hashParams.m_virtualVoxelSize && oldVoxel.weight > 0) ? 1 : 0); // Aggregate each warp result into voxel mask if (i % WARP_SIZE == 0) { @@ -254,9 +173,136 @@ __global__ void integrateDepthMapsKernel(HashData hashData, HashParams hashParam __syncthreads(); // Work out if block is occupied or not and save voxel masks + // TODO:(Nick) Is it faster to do this in a separate garbage kernel? + if (i < 16) { + const uint v = voxels[i]; + hashData.d_hashCompactified[bi]->voxels[i] = v; + const uint mask = 0x0000FFFF; + uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF); + uint b2 = __ballot_sync(mask, v == 0); + if (i == 0) { + if (b1 != mask && b2 != mask) hashData.d_hashCompactified[bi]->head.flags |= ftl::voxhash::kFlagSurface; + else hashData.d_hashCompactified[bi]->head.flags &= ~ftl::voxhash::kFlagSurface; + } + } + + } +} + +#define WINDOW_RADIUS 1 +#define PATCH_SIZE 32 + +__global__ void integrateMLSKernel(HashData hashData, HashParams hashParams, int numcams) { + __shared__ uint voxels[16]; + + const uint i = threadIdx.x; //inside of an SDF block + const int3 po = make_int3(hashData.delinearizeVoxelIndex(i)); + const int warpNum = i / WARP_SIZE; + const int lane = i % WARP_SIZE; + + // Stride over all allocated blocks + for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) { + + //TODO check if we should load this in shared memory + //HashEntryHead entry = hashData.d_hashCompactified[bi]->head; + + const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(hashData.d_hashCompactified[bi]->head.posXYZ)); + + //uint idx = entry.offset + i; + const int3 pi = pi_base + po; + const float3 pfb = hashData.virtualVoxelPosToWorld(pi); + //int count = 0; + //float camdepths[MAX_CAMERAS]; + + //Voxel oldVoxel; // = hashData.d_SDFBlocks[idx]; + //hashData.deleteVoxel(oldVoxel); + + //float3 awpos = make_float3(0.0f); + //float3 awnorm = make_float3(0.0f); + //float aweights = 0.0f; + float sdf = 0.0f; + float weights = 0.0f; + + // Preload depth values + // 1. Find min and max screen positions + // 2. Subtract/Add WINDOW_RADIUS to min/max + // ... 
check that the buffer is not too small to cover this + // ... if buffer not big enough then don't buffer at all. + // 3. Populate shared mem depth map buffer using all threads + // 4. Adjust window lookups to use shared mem buffer + + //uint cam=0; + for (uint cam=0; cam<numcams; ++cam) { + const ftl::voxhash::DepthCameraCUDA &camera = c_cameras[cam]; + const uint height = camera.params.m_imageHeight; + const uint width = camera.params.m_imageWidth; + + const float3 pf = camera.poseInverse * pfb; + const uint2 screenPos = make_uint2(camera.params.cameraToKinectScreenInt(pf)); + + //float3 wpos = make_float3(0.0f); + float3 wnorm = make_float3(0.0f); + + + #pragma unroll + for (int v=-WINDOW_RADIUS; v<=WINDOW_RADIUS; ++v) { + for (int u=-WINDOW_RADIUS; u<=WINDOW_RADIUS; ++u) { + if (screenPos.x+u < width && screenPos.y+v < height) { //on screen + float4 depth = tex2D<float4>(camera.points, screenPos.x+u, screenPos.y+v); + if (depth.z == MINF) continue; + + //float4 normal = tex2D<float4>(camera.normal, screenPos.x+u, screenPos.y+v); + const float3 camPos = camera.poseInverse * make_float3(depth); //camera.pose * camera.params.kinectDepthToSkeleton(screenPos.x+u, screenPos.y+v, depth); + const float weight = spatialWeighting(length(pf - camPos)); + + //wpos += weight*worldPos; + sdf += weight*(camPos.z - pf.z); + //sdf += camPos.z - pf.z; + //wnorm += weight*make_float3(normal); + //weights += 1.0f; + weights += weight; + } + } + } + + //awpos += wpos; + //aweights += weights; + } + + //awpos /= aweights; + //wnorm /= weights; + + sdf /= weights; + + //float sdf = (aweights == 0.0f) ? MINF : length(pfb - awpos); + //float sdf = wnorm.x * (pfb.x - wpos.x) + wnorm.y * (pfb.y - wpos.y) + wnorm.z * (pfb.z - wpos.z); + + //printf("WEIGHTS: %f\n", weights); + + //if (weights < 0.00001f) sdf = 0.0f; + + // Calculate voxel sign values across a warp + int warpNum = i / WARP_SIZE; + + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) < hashParams.m_virtualVoxelSize && aweights >= 0.5f) ? 1 : 0); + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (fabs(sdf) <= hashParams.m_virtualVoxelSize) ? 1 : 0); + //uint solid_ballot = __ballot_sync(0xFFFFFFFF, (aweights >= 0.0f) ? 1 : 0); + uint solid_ballot = __ballot_sync(0xFFFFFFFF, (sdf < 0.0f ) ? 1 : 0); + + // Aggregate each warp result into voxel mask + if (i % WARP_SIZE == 0) { + voxels[warpNum] = solid_ballot; + //valid[warpNum] = valid_ballot; + } + + __syncthreads(); + + // Work out if block is occupied or not and save voxel masks + // TODO:(Nick) Is it faster to do this in a separate garbage kernel? 
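+	// Each entry of voxels[] is one warp's 32-bit sign ballot; the block is flagged as surface
+	// only when its 512 voxels are neither all inside nor all outside the implicit surface.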
if (i < 16) { const uint v = voxels[i]; hashData.d_hashCompactified[bi]->voxels[i] = v; + //hashData.d_hashCompactified[bi]->validity[i] = valid[i]; const uint mask = 0x0000FFFF; uint b1 = __ballot_sync(mask, v == 0xFFFFFFFF); uint b2 = __ballot_sync(mask, v == 0); @@ -277,7 +323,7 @@ const dim3 gridSize(NUM_CUDA_BLOCKS, 1); const dim3 blockSize(threadsPerBlock, 1); //if (hashParams.m_numOccupiedBlocks > 0) { //this guard is important if there is no depth in the current frame (i.e., no blocks were allocated) - integrateDepthMapsKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); + integrateMLSKernel << <gridSize, blockSize, 0, stream >> >(hashData, hashParams, numcams); //} //cudaSafeCall( cudaGetLastError() ); diff --git a/applications/reconstruct/src/scene_rep_hash_sdf.cu b/applications/reconstruct/src/scene_rep_hash_sdf.cu index 5c5ebf6fdb73be59bf028d850ce4fa41e6986925..247247b6cc5279186f9f9bdc81bbe627ab0621ea 100644 --- a/applications/reconstruct/src/scene_rep_hash_sdf.cu +++ b/applications/reconstruct/src/scene_rep_hash_sdf.cu @@ -168,41 +168,6 @@ extern "C" void resetHashBucketMutexCUDA(HashData& hashData, const HashParams& h } -__device__ -unsigned int linearizeChunkPos(const int3& chunkPos) -{ - int3 p = chunkPos-c_hashParams.m_streamingMinGridPos; - return p.z * c_hashParams.m_streamingGridDimensions.x * c_hashParams.m_streamingGridDimensions.y + - p.y * c_hashParams.m_streamingGridDimensions.x + - p.x; -} - -__device__ -int3 worldToChunks(const float3& posWorld) -{ - float3 p; - p.x = posWorld.x/c_hashParams.m_streamingVoxelExtents.x; - p.y = posWorld.y/c_hashParams.m_streamingVoxelExtents.y; - p.z = posWorld.z/c_hashParams.m_streamingVoxelExtents.z; - - float3 s; - s.x = (float)sign(p.x); - s.y = (float)sign(p.y); - s.z = (float)sign(p.z); - - return make_int3(p+s*0.5f); -} - -__device__ -bool isSDFBlockStreamedOut(const int3& sdfBlock, const HashData& hashData, const unsigned int* d_bitMask) //TODO MATTHIAS (-> move to HashData) -{ - float3 posWorld = hashData.virtualVoxelPosToWorld(hashData.SDFBlockToVirtualVoxelPos(sdfBlock)); // sdfBlock is assigned to chunk by the bottom right sample pos - - uint index = linearizeChunkPos(worldToChunks(posWorld)); - uint nBitsInT = 32; - return ((d_bitMask[index/nBitsInT] & (0x1 << (index%nBitsInT))) != 0x0); -} - // Note: bitMask used for Streaming out code... could be set to nullptr if not streaming out // Note: Allocations might need to be around fat rays since multiple voxels could correspond // to same depth map pixel at larger distances. 
@@ -272,7 +237,7 @@ __global__ void allocKernel(HashData hashData, HashParams hashParams, int camnum while(iter < g_MaxLoopIterCount) { //check if it's in the frustum and not checked out - if (hashData.isSDFBlockInCameraFrustumApprox(hashParams, cameraParams, idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { + if (hashData.isInBoundingBox(hashParams, idCurrentVoxel)) { //} && !isSDFBlockStreamedOut(idCurrentVoxel, hashData, d_bitMask)) { hashData.allocBlock(idCurrentVoxel); //printf("Allocate block: %d\n",idCurrentVoxel.x); } diff --git a/applications/reconstruct/src/splat_render.cpp b/applications/reconstruct/src/splat_render.cpp index a9d5e57dc712829b592f56e8226abe0f563dc665..51d30d638223185f417cb049996b32650b06b151 100644 --- a/applications/reconstruct/src/splat_render.cpp +++ b/applications/reconstruct/src/splat_render.cpp @@ -1,6 +1,7 @@ #include "splat_render.hpp" #include "splat_render_cuda.hpp" #include "compactors.hpp" +#include "depth_camera_cuda.hpp" using ftl::render::Splatter; @@ -20,7 +21,7 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) { cudaSafeCall(cudaSetDevice(scene_->getCUDADevice())); if ((unsigned int)depth1_.width() != camera.width || (unsigned int)depth1_.height() != camera.height) { - depth1_ = ftl::cuda::TextureObject<uint>(camera.width, camera.height); + depth1_ = ftl::cuda::TextureObject<int>(camera.width, camera.height); } if ((unsigned int)colour1_.width() != camera.width || (unsigned int)colour1_.height() != camera.height) { colour1_ = ftl::cuda::TextureObject<uchar4>(camera.width, camera.height); @@ -47,11 +48,25 @@ void Splatter::render(ftl::rgbd::Source *src, cudaStream_t stream) { params.camera.m_sensorDepthWorldMax = camera.maxDepth; params.camera.m_sensorDepthWorldMin = camera.minDepth; - ftl::cuda::compactifyAllocated(scene_->getHashData(), scene_->getHashParams(), stream); - LOG(INFO) << "Occupied: " << scene_->getOccupiedCount(); - ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream); - ftl::cuda::splat_points(depth1_, depth2_, params, stream); - ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream); + //ftl::cuda::compactifyAllocated(scene_->getHashData(), scene_->getHashParams(), stream); + //LOG(INFO) << "Occupied: " << scene_->getOccupiedCount(); + + if (scene_->value("voxels", false)) { + ftl::cuda::isosurface_point_image(scene_->getHashData(), depth1_, params, stream); + ftl::cuda::splat_points(depth1_, depth2_, params, stream); + ftl::cuda::dibr(depth2_, colour1_, scene_->cameraCount(), params, stream); + } else { + //ftl::cuda::clear_colour(colour1_, stream); + ftl::cuda::clear_depth(depth1_, stream); + ftl::cuda::clear_depth(depth2_, stream); + ftl::cuda::dibr(depth1_, colour1_, scene_->cameraCount(), params, stream); + //ftl::cuda::hole_fill(depth1_, depth2_, params.camera, stream); + ftl::cuda::int_to_float(depth1_, depth2_, 1.0f / 1000.0f, stream); + //ftl::cuda::mls_resample(depth1_, colour1_, depth2_, scene_->getHashParams(), scene_->cameraCount(), params, stream); + } + + //ftl::cuda::median_filter(depth1_, depth2_, stream); + //ftl::cuda::splat_points(depth1_, depth2_, params, stream); // TODO: Second pass diff --git a/applications/reconstruct/src/splat_render.cu b/applications/reconstruct/src/splat_render.cu index 0e3598f129bf40c72294ab19501b1fd3c95bb1e9..3addf108dcf89ea8f3069e3ba0daea59e67ae918 100644 --- a/applications/reconstruct/src/splat_render.cu +++ b/applications/reconstruct/src/splat_render.cu @@ -13,7 +13,7 @@ 
 using ftl::cuda::TextureObject;
 using ftl::render::SplatParams;
 
-__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth) {
+__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<int> depth) {
 	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
 	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
@@ -76,9 +76,65 @@ __device__ inline bool getVoxel(uint *voxels, int ix) {
 	return voxels[ix/32] & (0x1 << (ix % 32));
 }
 
-__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<uint> depth, SplatParams params) {
+__global__ void occupied_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
+	__shared__ uint voxels[16];
+	__shared__ ftl::voxhash::HashEntryHead block;
+
+	// Stride over all allocated blocks
+	for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
+	__syncthreads();
+
+	const uint i = threadIdx.x;	//inside of an SDF block
+
+	if (i == 0) block = hashData.d_hashCompactified[bi]->head;
+	if (i < 16) {
+		voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+		//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
+	}
+
+	// Make sure all hash entries are cached
+	__syncthreads();
+
+	const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ));
+	const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
+	const int3 pi = pi_base + vp;
+	const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
+
+	const bool v = getVoxel(voxels, i);
+
+	uchar4 color = make_uchar4(255,0,0,255);
+	bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
+
+
+	// Only for surface voxels, work out screen coordinates
+	if (!is_surface) continue;
+
+	// TODO: For each original camera, render a new depth map
+
+	const float3 camPos = params.m_viewMatrix * worldPos;
+	const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
+	const uint2 screenPos = make_uint2(make_int2(screenPosf)); //  + make_float2(0.5f, 0.5f)
+
+	//printf("Worldpos: %f,%f,%f\n", camPos.x, camPos.y, camPos.z);
+
+	if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
+
+	const unsigned int x = screenPos.x;
+	const unsigned int y = screenPos.y;
+	const int idepth = static_cast<int>(camPos.z * 1000.0f);
+
+	// See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds
+	if (x < depth.width() && y < depth.height()) {
+		atomicMin(&depth(x,y), idepth);
+	}
+
+	}  // Stride
+}
+
+__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
 	// TODO:(Nick) Reduce bank conflicts by aligning these
 	__shared__ uint voxels[16];
+	//__shared__ uint valid[16];
 	__shared__ ftl::voxhash::HashEntryHead block;
 
 	// Stride over all allocated blocks
@@ -88,7 +144,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 	const uint i = threadIdx.x;	//inside of an SDF block
 
 	if (i == 0) block = hashData.d_hashCompactified[bi]->head;
-	if (i < 16) voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+	if (i < 16) {
+		voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
+		//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
+	}
 
 	// Make sure all hash entries are cached
 	__syncthreads();
@@ -115,10 +174,10 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 	int edgeZ = (vp.z == 0 ) ? 1 : 0;
 
 	uchar4 color = make_uchar4(255,0,0,255);
-	bool is_surface = false; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
+	bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
 	//if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255);
 
-	if (!is_surface && v) continue;
+	if (v) continue; // !getVoxel(valid, i)
 
 	//if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0);
 
@@ -136,7 +195,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 			//if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue;
 
 			const bool vox = getVoxel(voxels, hashData.linearizeVoxelPos(uvi));
-			if (vox) {
+			if (vox) { //getVoxel(valid, hashData.linearizeVoxelPos(uvi))) {
 				is_surface = true;
 				// Should break but is slower?
 			}
@@ -174,7 +233,7 @@ __global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, Texture
 }
 
 void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
-			const TextureObject<uint> &depth,
+			const TextureObject<int> &depth,
 			const SplatParams &params, cudaStream_t stream) {
 
 	const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
@@ -182,16 +241,21 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 	clearDepthKernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(hashData, depth);
 
-	//cudaSafeCall( cudaDeviceSynchronize() );
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 
 	const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
 	const dim3 gridSize(NUM_CUDA_BLOCKS, 1);
 	const dim3 blockSize(threadsPerBlock, 1);
 
-	isosurface_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params);
+	occupied_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params);
 
 	cudaSafeCall( cudaGetLastError() );
-	//cudaSafeCall( cudaDeviceSynchronize() );
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 }
 
 // ---- Pass 2: Expand the point splats ----------------------------------------
 
@@ -199,7 +263,7 @@ void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
 #define SPLAT_RADIUS 7
 #define SPLAT_BOUNDS (2*SPLAT_RADIUS+T_PER_BLOCK+1)
 #define SPLAT_BUFFER_SIZE (SPLAT_BOUNDS*SPLAT_BOUNDS)
-#define MAX_VALID 8
+#define MAX_VALID 100
 
 __device__ float distance2(float3 a, float3 b) {
 	const float x = a.x-b.x;
@@ -209,7 +273,7 @@
 }
 
 __global__ void splatting_kernel(
-		TextureObject<uint> depth_in,
+		TextureObject<int> depth_in,
 		TextureObject<float> depth_out,
 		SplatParams params) {
 	// Read an NxN region and
 	// - interpolate a depth value for this pixel
@@ -240,7 +304,7 @@ __global__ void splatting_kernel(
 
 	__syncthreads();
 
-	if (x >= depth_in.width() && y >= depth_in.height()) return;
+	if (x >= depth_in.width() || y >= depth_in.height()) return;
 
 	const float voxelSquared = params.voxelSize*params.voxelSize;
 	float mindepth = 1000.0f;
@@ -266,7 +330,7 @@ __global__ void splatting_kernel(
 		if (dist < voxelSquared) {
 			// Valid so check for minimum
 			//validPos[validix] = pos;
-			validIndices[validix++] = idx;
+			//validIndices[validix++] = idx;
 			if (d < mindepth) {
 				mindepth = d;
 				minidx = idx;
@@ -287,8 +351,8 @@ __global__ void splatting_kernel(
 	float contrib = 0.0f;
 	float3 pos = params.camera.kinectDepthToSkeleton(x, y, mindepth); // TODO:(Nick) Mindepth assumption is poor choice.
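+	// Only the nearest point (minidx) contributes below; the loop over validIndices is disabled.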
-	for (int j=0; j<validix; ++j) {
-		const int idx = validIndices[j];
+	//for (int j=0; j<validix; ++j) {
+		const int idx = minidx; //validIndices[j];
 		float3 posp = positions[idx];
 		//float3 pos = params.camera.kinectDepthToSkeleton(x, y, posp.z);
 		float3 delta = (posp - pos) / 2*params.voxelSize;
@@ -308,7 +372,7 @@ __global__ void splatting_kernel(
 			contrib += c;
 			depth += posp.z * c;
 		}
-	}
+	//}
 
 	// Normalise
 	//colour.x /= contrib;
@@ -320,7 +384,7 @@
 	//colour_out(x,y) = make_uchar4(colour.x, colour.y, colour.z, 255);
 }
 
-void ftl::cuda::splat_points(const TextureObject<uint> &depth_in,
+void ftl::cuda::splat_points(const TextureObject<int> &depth_in,
 		const TextureObject<float> &depth_out,
 		const SplatParams &params, cudaStream_t stream) {
 
@@ -329,4 +393,8 @@ void ftl::cuda::splat_points(const TextureObject<uint> &depth_in,
 	splatting_kernel<<<gridSize, blockSize, 0, stream>>>(depth_in, depth_out, params);
 	cudaSafeCall( cudaGetLastError() );
+
+#ifdef _DEBUG
+	cudaSafeCall(cudaDeviceSynchronize());
+#endif
 }
diff --git a/applications/reconstruct/src/splat_render.hpp b/applications/reconstruct/src/splat_render.hpp
index a36e80ad833f258eced3acd63560e6b315995391..7737bb5d903d725a55b81d75812b99b52e7ace66 100644
--- a/applications/reconstruct/src/splat_render.hpp
+++ b/applications/reconstruct/src/splat_render.hpp
@@ -32,7 +32,7 @@ class Splatter {
 	private:
 	int device_;
-	ftl::cuda::TextureObject<uint> depth1_;
+	ftl::cuda::TextureObject<int> depth1_;
 	ftl::cuda::TextureObject<uchar4> colour1_;
 	ftl::cuda::TextureObject<float> depth2_;
 	ftl::cuda::TextureObject<uchar4> colour2_;
diff --git a/applications/reconstruct/src/splat_render_cuda.hpp b/applications/reconstruct/src/splat_render_cuda.hpp
index deb702e1661ca8f36fcea7aa6b4b6a115439cc7b..d8fd9cf55649280c20805c8d4f8b9f7b758b2223 100644
--- a/applications/reconstruct/src/splat_render_cuda.hpp
+++ b/applications/reconstruct/src/splat_render_cuda.hpp
@@ -15,7 +15,7 @@ namespace cuda {
 * of objects up to at most truncation depth.
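+* The output depth buffer stores integer millimetres so that atomicMin can be used per pixel.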
 */
 void isosurface_point_image(const ftl::voxhash::HashData& hashData,
-		const ftl::cuda::TextureObject<uint> &depth,
+		const ftl::cuda::TextureObject<int> &depth,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
 //void isosurface_point_image_stereo(const ftl::voxhash::HashData& hashData,
@@ -25,14 +25,17 @@ void isosurface_point_image(const ftl::voxhash::HashData& hashData,
 
 // TODO: isosurface_point_cloud
 
-void splat_points(const ftl::cuda::TextureObject<uint> &depth_in,
+void splat_points(const ftl::cuda::TextureObject<int> &depth_in,
 		const ftl::cuda::TextureObject<float> &depth_out,
 		const ftl::render::SplatParams &params, cudaStream_t stream);
 
-void dibr(const ftl::cuda::TextureObject<float> &depth_in,
+void dibr(const ftl::cuda::TextureObject<int> &depth_out,
 		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
 
+void dibr(const ftl::cuda::TextureObject<float> &depth_out,
+		const ftl::cuda::TextureObject<uchar4> &colour_out, int numcams, const ftl::render::SplatParams &params, cudaStream_t stream);
+
 }
 }
diff --git a/applications/reconstruct/src/voxel_scene.cpp b/applications/reconstruct/src/voxel_scene.cpp
index 51f9bc75b88b0d8ae73d41d8ba05fb248cb5ae00..e82da537bbf2df6789467f561d24492f04556ca3 100644
--- a/applications/reconstruct/src/voxel_scene.cpp
+++ b/applications/reconstruct/src/voxel_scene.cpp
@@ -2,6 +2,7 @@
 #include "compactors.hpp"
 #include "garbage.hpp"
 #include "integrators.hpp"
+#include "depth_camera_cuda.hpp"
 
 #include <opencv2/core/cuda_stream_accessor.hpp>
 
@@ -88,6 +89,8 @@ bool SceneRep::_initCUDA() {
 	// TODO:(Nick) Check memory is sufficient
 	// TODO:(Nick) Find out what our compute capability should be.
 
+	LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor;
+
 	return true;
 }
 
@@ -138,7 +141,8 @@ int SceneRep::upload() {
 		cam.params.m_imageHeight = in->parameters().height;
 		cam.params.m_sensorDepthWorldMax = in->parameters().maxDepth;
 		cam.params.m_sensorDepthWorldMin = in->parameters().minDepth;
-		cam.gpu.alloc(cam.params);
+		cam.gpu.alloc(cam.params, true);
+		LOG(INFO) << "GPU Allocated camera " << i;
 	}
 }
 
@@ -163,8 +167,9 @@ int SceneRep::upload() {
 		if (depth.cols == 0) continue;
 
 		// Must be in RGBA for GPU
-		Mat rgba;
-		cv::cvtColor(rgb,rgba, cv::COLOR_BGR2BGRA);
+		Mat rgbt, rgba;
+		cv::cvtColor(rgb,rgbt, cv::COLOR_BGR2Lab);
+		cv::cvtColor(rgbt,rgba, cv::COLOR_BGR2BGRA);
 
 		// Send to GPU and merge view into scene
 		//cam.gpu.updateParams(cam.params);
@@ -178,7 +183,9 @@ int SceneRep::upload() {
 		//if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream)));
 
 		//allocate all hash blocks which are corresponding to depth map entries
-		_alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
+		if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
+
+		// Calculate normals
 	}
 
 	// Must have finished all allocations and rendering before next integration
@@ -211,7 +218,7 @@ void SceneRep::integrate() {
 
 void SceneRep::garbage() {
 	//_compactifyAllocated();
-	_garbageCollect();
+	if (value("voxels", false)) _garbageCollect();
 
 	//cudaSafeCall(cudaStreamSynchronize(integ_stream_));
 }
@@ -237,21 +244,11 @@ void SceneRep::garbage() {
 	m_numIntegratedFrames++;
 }*/
 
-void SceneRep::setLastRigidTransform(const Eigen::Matrix4f& lastRigidTransform) {
-	m_hashParams.m_rigidTransform = MatrixConversion::toCUDA(lastRigidTransform);
-	m_hashParams.m_rigidTransformInverse = MatrixConversion::toCUDA(lastRigidTransform.inverse()); //m_hashParams.m_rigidTransform.getInverse();
-}
-
 /*void SceneRep::setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) {
 	setLastRigidTransform(lastRigidTransform);
 	_compactifyHashEntries();
 }*/
-
-const Eigen::Matrix4f SceneRep::getLastRigidTransform() const {
-	return MatrixConversion::toEigen(m_hashParams.m_rigidTransform);
-}
-
 /* Nick: To reduce weights between frames */
 void SceneRep::nextFrame() {
 	if (do_reset_) {
@@ -260,7 +257,7 @@ void SceneRep::nextFrame() {
 		_create(_parametersFromConfig());
 	} else {
 		//ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_);
-		if (reg_mode_) ftl::cuda::clearVoxels(m_hashData, m_hashParams);
+		//if (reg_mode_) ftl::cuda::clearVoxels(m_hashData, m_hashParams);
 		//else ftl::cuda::starveVoxels(m_hashData, m_hashParams, integ_stream_);
 		m_numIntegratedFrames = 0;
 	}
@@ -269,10 +266,6 @@ void SceneRep::nextFrame() {
 //! resets the hash to the initial state (i.e., clears all data)
 void SceneRep::reset() {
 	m_numIntegratedFrames = 0;
-
-	//m_hashParams.m_rigidTransform.setIdentity();
-	//m_hashParams.m_rigidTransformInverse.setIdentity();
-	m_hashParams.m_numOccupiedBlocks = 0;
 	m_hashData.updateParams(m_hashParams);
 	resetCUDA(m_hashData, m_hashParams);
 }
@@ -288,22 +281,27 @@ HashParams SceneRep::_parametersFromConfig() {
 	HashParams params;
 
 	// First camera view is set to identity pose to be at the centre of
 	// the virtual coordinate space.
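+	// Clipping bounds, MLS flag and smoothing values are read from the config below.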
-	params.m_rigidTransform.setIdentity();
-	params.m_rigidTransformInverse.setIdentity();
 	params.m_hashNumBuckets = value("hashNumBuckets", 100000);
-	params.m_SDFBlockSize = SDF_BLOCK_SIZE;
-	params.m_numSDFBlocks = value("hashNumSDFBlocks",500000);
 	params.m_virtualVoxelSize = value("SDFVoxelSize", 0.006f);
 	params.m_maxIntegrationDistance = value("SDFMaxIntegrationDistance", 10.0f);
 	params.m_truncation = value("SDFTruncation", 0.1f);
 	params.m_truncScale = value("SDFTruncationScale", 0.01f);
 	params.m_integrationWeightSample = value("SDFIntegrationWeightSample", 10);
 	params.m_integrationWeightMax = value("SDFIntegrationWeightMax", 255);
-	// Note (Nick): We are not streaming voxels in/out of GPU
-	//params.m_streamingVoxelExtents = MatrixConversion::toCUDA(gas.s_streamingVoxelExtents);
-	//params.m_streamingGridDimensions = MatrixConversion::toCUDA(gas.s_streamingGridDimensions);
-	//params.m_streamingMinGridPos = MatrixConversion::toCUDA(gas.s_streamingMinGridPos);
-	//params.m_streamingInitialChunkListSize = gas.s_streamingInitialChunkListSize;
+	params.m_spatialSmoothing = value("spatialSmoothing", 0.04f); // 4cm
+	params.m_colourSmoothing = value("colourSmoothing", 20.0f);
+	params.m_confidenceThresh = value("confidenceThreshold", 20.0f);
+	params.m_flags = 0;
+	params.m_flags |= (value("clipping", false)) ? ftl::voxhash::kFlagClipping : 0;
+	params.m_flags |= (value("mls", false)) ? ftl::voxhash::kFlagMLS : 0;
+	params.m_maxBounds = make_int3(
+		value("bbox_x_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_y_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_z_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE));
+	params.m_minBounds = make_int3(
+		value("bbox_x_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_y_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE),
+		value("bbox_z_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE));
 	return params;
 }
 
@@ -362,8 +360,20 @@ void SceneRep::_compactifyAllocated() {
 	else ftl::cuda::integrateRegistration(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_);
 }*/
 
+extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height);
+
 void SceneRep::_integrateDepthMaps() {
-	ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
+	//cudaSafeCall(cudaDeviceSynchronize());
+
+	for (size_t i=0; i<cameras_.size(); ++i) {
+		//ftl::cuda::clear_depth(*(cameras_[i].gpu.depth2_tex_), integ_stream_);
+		ftl::cuda::clear_points(*(cameras_[i].gpu.points_tex_), integ_stream_);
+		ftl::cuda::mls_smooth(*(cameras_[i].gpu.points_tex_), m_hashParams, cameras_.size(), i, integ_stream_);
+		//ftl::cuda::int_to_float(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), 1.0f / 1000.0f, integ_stream_);
+		//ftl::cuda::hole_fill(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), cameras_[i].params, integ_stream_);
+		//bilateralFilterFloatMap(cameras_[i].gpu.depth_tex_->devicePtr(), cameras_[i].gpu.depth3_tex_->devicePtr(), 3, 7, cameras_[i].gpu.depth_tex_->width(), cameras_[i].gpu.depth_tex_->height());
+	}
+	if (value("voxels", false)) ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
 }
 
 void SceneRep::_garbageCollect() {
diff --git a/components/common/cpp/CMakeLists.txt b/components/common/cpp/CMakeLists.txt
index 911b7d1d69225d55e84cf6ab2fd87e0d1fc81a79..60577c96040dae74e93ccfe6f8ee4ae0c7abc856 100644
--- a/components/common/cpp/CMakeLists.txt
+++ b/components/common/cpp/CMakeLists.txt
@@ -5,6 +5,7 @@ set(COMMONSRC
 	src/configurable.cpp
 	src/loguru.cpp
 	src/opencv_to_pcl.cpp
+	src/cuda_common.cpp
 )
 
 check_function_exists(uriParseSingleUriA HAVE_URIPARSESINGLE)
diff --git a/components/common/cpp/include/ftl/cuda_common.hpp b/components/common/cpp/include/ftl/cuda_common.hpp
index 5a2d1de0823acea508747527fd839280d01fc710..58e72ceb35c11c6d79ab17e675ce5bf1573018c7 100644
--- a/components/common/cpp/include/ftl/cuda_common.hpp
+++ b/components/common/cpp/include/ftl/cuda_common.hpp
@@ -54,8 +54,11 @@ class TextureObject {
 	__host__ __device__ inline const T &operator()(int u, int v) const { return ptr_[u+v*pitch2_]; }
 	__host__ __device__ inline T &operator()(int u, int v) { return ptr_[u+v*pitch2_]; }
+
+	void upload(const cv::Mat &, cudaStream_t stream=0);
+	void download(cv::Mat &, cudaStream_t stream=0) const;
 
-	void free() {
+	__host__ void free() {
 		if (texobj_ != 0) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
 		if (ptr_ && needsfree_) cudaFree(ptr_);
 		ptr_ = nullptr;
@@ -220,8 +223,40 @@ template <typename T>
 TextureObject<T>::~TextureObject() {
 	//if (needsdestroy_) cudaSafeCall( cudaDestroyTextureObject (texobj_) );
 	//if (needsfree_) cudaFree(ptr_);
+	free();
 }
 
+template <>
+void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream);
+
+template <>
+void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream);
+
+
+template <>
+void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const;
+
+template <>
+void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const;
+
 }
 }
diff --git a/components/common/cpp/src/cuda_common.cpp b/components/common/cpp/src/cuda_common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..571d6816b63413163471ef9006ae1d28031511fd
--- /dev/null
+++ b/components/common/cpp/src/cuda_common.cpp
@@ -0,0 +1,59 @@
+#include <ftl/cuda_common.hpp>
+
+using ftl::cuda::TextureObject;
+
+template <>
+void TextureObject<uchar4>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar4), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float2>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float2), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<float4>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(float4), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+template <>
+void TextureObject<uchar>::upload(const cv::Mat &m, cudaStream_t stream) {
+	cudaSafeCall(cudaMemcpy2DAsync(devicePtr(), pitch(), m.data, m.step, m.cols * sizeof(uchar), m.rows, cudaMemcpyHostToDevice, stream));
+}
+
+
+template <>
+void TextureObject<uchar4>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_8UC4);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar4), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC1);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float2>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC2);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float2), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<float4>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_32FC4);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(float4), m.rows, cudaMemcpyDeviceToHost, stream));
+}
+
+template <>
+void TextureObject<uchar>::download(cv::Mat &m, cudaStream_t stream) const {
+	m.create(height(), width(), CV_8UC1);
+	cudaSafeCall(cudaMemcpy2DAsync(m.data, m.step, devicePtr(), pitch(), m.cols * sizeof(uchar), m.rows, cudaMemcpyDeviceToHost, stream));
+}
diff --git a/components/rgbd-sources/src/source.cpp b/components/rgbd-sources/src/source.cpp
index 8747b60fbed095c9c09d264b76490d848a7f758d..48244edafa48b7836237d4c31663e315e0e0d096 100644
--- a/components/rgbd-sources/src/source.cpp
+++ b/components/rgbd-sources/src/source.cpp
@@ -264,11 +264,17 @@ void Source::writeFrames(const ftl::cuda::TextureObject<uchar4> &rgb, const ftl:
 void Source::writeFrames(const ftl::cuda::TextureObject<uchar4> &rgb, const ftl::cuda::TextureObject<float> &depth, cudaStream_t stream) {
 	if (!impl_) {
 		UNIQUE_LOCK(mutex_,lk);
-		rgb_.create(rgb.height(), rgb.width(), CV_8UC4);
-		cudaSafeCall(cudaMemcpy2DAsync(rgb_.data, rgb_.step, rgb.devicePtr(), rgb.pitch(), rgb_.cols * sizeof(uchar4), rgb_.rows, cudaMemcpyDeviceToHost, stream));
-		depth_.create(depth.height(), depth.width(), CV_32FC1);
-		cudaSafeCall(cudaMemcpy2DAsync(depth_.data, depth_.step, depth.devicePtr(), depth.pitch(), depth_.cols * sizeof(float), depth_.rows, cudaMemcpyDeviceToHost, stream));
+		rgb.download(rgb_, stream);
+		//rgb_.create(rgb.height(), rgb.width(), CV_8UC4);
+		//cudaSafeCall(cudaMemcpy2DAsync(rgb_.data, rgb_.step, rgb.devicePtr(), rgb.pitch(), rgb_.cols * sizeof(uchar4), rgb_.rows, cudaMemcpyDeviceToHost, stream));
+		depth.download(depth_, stream);
+		//depth_.create(depth.height(), depth.width(), CV_32FC1);
+		//cudaSafeCall(cudaMemcpy2DAsync(depth_.data, depth_.step, depth.devicePtr(), depth.pitch(), depth_.cols * sizeof(float), depth_.rows, cudaMemcpyDeviceToHost, stream));
+		stream_ = stream;
+		cudaSafeCall(cudaStreamSynchronize(stream_));
+		cv::cvtColor(rgb_,rgb_, cv::COLOR_BGRA2BGR);
+		cv::cvtColor(rgb_,rgb_, cv::COLOR_Lab2BGR);
 	}
 }