#include <ftl/voxel_scene.hpp> #include "compactors.hpp" #include "garbage.hpp" #include "integrators.hpp" #include "depth_camera_cuda.hpp" #include <opencv2/core/cuda_stream_accessor.hpp> #include <vector> using namespace ftl::voxhash; using ftl::rgbd::Source; using ftl::Configurable; using cv::Mat; using std::vector; #define SAFE_DELETE_ARRAY(a) { delete [] (a); (a) = NULL; } extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t); extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, int camid, const DepthCameraParams &depthCameraParams, cudaStream_t); //extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData); //extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); //extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams); //extern "C" void integrateDepthMapCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, cudaStream_t); //extern "C" void bindInputDepthColorTextures(const DepthCameraData& depthCameraData); SceneRep::SceneRep(nlohmann::json &config) : Configurable(config), m_frameCount(0), do_reset_(false) { _initCUDA(); // Allocates voxel structure on GPU _create(_parametersFromConfig()); on("SDFVoxelSize", [this](const ftl::config::Event &e) { do_reset_ = true; }); on("hashNumSDFBlocks", [this](const ftl::config::Event &e) { do_reset_ = true; }); on("hashNumBuckets", [this](const ftl::config::Event &e) { do_reset_ = true; }); on("hashMaxCollisionLinkedListSize", [this](const ftl::config::Event &e) { do_reset_ = true; }); on("SDFTruncation", [this](const ftl::config::Event &e) { m_hashParams.m_truncation = value("SDFTruncation", 0.1f); }); on("SDFTruncationScale", [this](const ftl::config::Event &e) { m_hashParams.m_truncScale = value("SDFTruncationScale", 0.01f); }); on("SDFMaxIntegrationDistance", [this](const ftl::config::Event &e) { m_hashParams.m_maxIntegrationDistance = value("SDFMaxIntegrationDistance", 10.0f); }); on("showRegistration", [this](const ftl::config::Event &e) { reg_mode_ = value("showRegistration", false); }); reg_mode_ = value("showRegistration", false); cudaSafeCall(cudaStreamCreate(&integ_stream_)); //integ_stream_ = 0; } SceneRep::~SceneRep() { _destroy(); cudaStreamDestroy(integ_stream_); } bool SceneRep::_initCUDA() { // Do an initial CUDA check int cuda_device_count = 0; cudaSafeCall(cudaGetDeviceCount(&cuda_device_count)); CHECK_GE(cuda_device_count, 1) << "No CUDA devices found"; LOG(INFO) << "CUDA Devices (" << cuda_device_count << "):"; vector<cudaDeviceProp> properties(cuda_device_count); for (int i=0; i<cuda_device_count; i++) { cudaSafeCall(cudaGetDeviceProperties(&properties[i], i)); LOG(INFO) << " - " << properties[i].name; } int desired_device = value("cudaDevice", 0); cuda_device_ = (desired_device < cuda_device_count) ? desired_device : cuda_device_count-1; cudaSafeCall(cudaSetDevice(cuda_device_)); // TODO:(Nick) Check memory is sufficient // TODO:(Nick) Find out what our compute capability should be. LOG(INFO) << "CUDA Compute: " << properties[cuda_device_].major << "." << properties[cuda_device_].minor; return true; } void SceneRep::addSource(ftl::rgbd::Source *src) { auto &cam = cameras_.emplace_back(); cam.source = src; cam.params.m_imageWidth = 0; } extern "C" void updateCUDACameraConstant(ftl::voxhash::DepthCameraCUDA *data, int count); void SceneRep::_updateCameraConstant() { std::vector<ftl::voxhash::DepthCameraCUDA> cams(cameras_.size()); for (size_t i=0; i<cameras_.size(); ++i) { cams[i] = cameras_[i].gpu.data; cams[i].pose = MatrixConversion::toCUDA(cameras_[i].source->getPose().cast<float>()); cams[i].poseInverse = MatrixConversion::toCUDA(cameras_[i].source->getPose().cast<float>().inverse()); } updateCUDACameraConstant(cams.data(), cams.size()); } int SceneRep::upload() { int active = 0; for (size_t i=0; i<cameras_.size(); ++i) { cameras_[i].source->grab(); } for (size_t i=0; i<cameras_.size(); ++i) { auto &cam = cameras_[i]; if (!cam.source->isReady()) { cam.params.m_imageWidth = 0; // TODO(Nick) : Free gpu allocs if was ready before LOG(INFO) << "Source not ready: " << cam.source->getURI(); continue; } else { auto in = cam.source; cam.params.fx = in->parameters().fx; cam.params.fy = in->parameters().fy; cam.params.mx = -in->parameters().cx; cam.params.my = -in->parameters().cy; // Only now do we have camera parameters for allocations... if (cam.params.m_imageWidth == 0) { cam.params.m_imageWidth = in->parameters().width; cam.params.m_imageHeight = in->parameters().height; cam.params.m_sensorDepthWorldMax = in->parameters().maxDepth; cam.params.m_sensorDepthWorldMin = in->parameters().minDepth; cam.gpu.alloc(cam.params, true); LOG(INFO) << "GPU Allocated camera " << i; } } cam.params.flags = m_frameCount; } _updateCameraConstant(); //cudaSafeCall(cudaDeviceSynchronize()); for (size_t i=0; i<cameras_.size(); ++i) { auto &cam = cameras_[i]; // Get the RGB-Depth frame from input Source *input = cam.source; Mat rgb, depth; // TODO(Nick) Direct GPU upload to save copy input->getFrames(rgb,depth); active += 1; if (depth.cols == 0) continue; // Must be in RGBA for GPU Mat rgbt, rgba; cv::cvtColor(rgb,rgbt, cv::COLOR_BGR2Lab); cv::cvtColor(rgbt,rgba, cv::COLOR_BGR2BGRA); // Send to GPU and merge view into scene //cam.gpu.updateParams(cam.params); cam.gpu.updateData(depth, rgba, cam.stream); //setLastRigidTransform(input->getPose().cast<float>()); //make the rigid transform available on the GPU //m_hashData.updateParams(m_hashParams, cv::cuda::StreamAccessor::getStream(cam.stream)); //if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream))); //allocate all hash blocks which are corresponding to depth map entries if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream)); // Calculate normals } // Must have finished all allocations and rendering before next integration cudaSafeCall(cudaDeviceSynchronize()); return active; } void SceneRep::integrate() { /*for (size_t i=0; i<cameras_.size(); ++i) { auto &cam = cameras_[i]; setLastRigidTransform(cam.source->getPose().cast<float>()); //m_hashData.updateParams(m_hashParams); //generate a linear hash array with only occupied entries _compactifyVisible(cam.params); //volumetrically integrate the depth data into the depth SDFBlocks //_integrateDepthMap(cam.gpu, cam.params); //_garbageCollect(); m_numIntegratedFrames++; }*/ _compactifyAllocated(); _integrateDepthMaps(); } void SceneRep::garbage() { //_compactifyAllocated(); if (value("voxels", false)) _garbageCollect(); //cudaSafeCall(cudaStreamSynchronize(integ_stream_)); } /*void SceneRep::integrate(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams, unsigned int* d_bitMask) { setLastRigidTransform(lastRigidTransform); //make the rigid transform available on the GPU m_hashData.updateParams(m_hashParams); //allocate all hash blocks which are corresponding to depth map entries _alloc(depthCameraData, depthCameraParams, d_bitMask); //generate a linear hash array with only occupied entries _compactifyHashEntries(); //volumetrically integrate the depth data into the depth SDFBlocks _integrateDepthMap(depthCameraData, depthCameraParams); _garbageCollect(depthCameraData); m_numIntegratedFrames++; }*/ /*void SceneRep::setLastRigidTransformAndCompactify(const Eigen::Matrix4f& lastRigidTransform, const DepthCameraData& depthCameraData) { setLastRigidTransform(lastRigidTransform); _compactifyHashEntries(); }*/ /* Nick: To reduce weights between frames */ void SceneRep::nextFrame() { if (do_reset_) { do_reset_ = false; _destroy(); _create(_parametersFromConfig()); } else { //ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_); //if (reg_mode_) ftl::cuda::clearVoxels(m_hashData, m_hashParams); //else ftl::cuda::starveVoxels(m_hashData, m_hashParams, integ_stream_); m_numIntegratedFrames = 0; } } //! resets the hash to the initial state (i.e., clears all data) void SceneRep::reset() { m_numIntegratedFrames = 0; m_hashData.updateParams(m_hashParams); resetCUDA(m_hashData, m_hashParams); } unsigned int SceneRep::getOccupiedCount() { unsigned int count; cudaSafeCall(cudaMemcpy(&count, m_hashData.d_hashCompactifiedCounter, sizeof(unsigned int), cudaMemcpyDeviceToHost)); return count+1; //there is one more free than the address suggests (0 would be also a valid address) } HashParams SceneRep::_parametersFromConfig() { //auto &cfg = ftl::config::resolve(config); HashParams params; // First camera view is set to identity pose to be at the centre of // the virtual coordinate space. params.m_hashNumBuckets = value("hashNumBuckets", 100000); params.m_virtualVoxelSize = value("SDFVoxelSize", 0.006f); params.m_maxIntegrationDistance = value("SDFMaxIntegrationDistance", 10.0f); params.m_truncation = value("SDFTruncation", 0.1f); params.m_truncScale = value("SDFTruncationScale", 0.01f); params.m_integrationWeightSample = value("SDFIntegrationWeightSample", 10); params.m_integrationWeightMax = value("SDFIntegrationWeightMax", 255); params.m_spatialSmoothing = value("spatialSmoothing", 0.04f); // 4cm params.m_colourSmoothing = value("colourSmoothing", 20.0f); params.m_confidenceThresh = value("confidenceThreshold", 20.0f); params.m_flags = 0; params.m_flags |= (value("clipping", false)) ? ftl::voxhash::kFlagClipping : 0; params.m_flags |= (value("mls", false)) ? ftl::voxhash::kFlagMLS : 0; params.m_maxBounds = make_int3( value("bbox_x_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE), value("bbox_y_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE), value("bbox_z_max", 2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE)); params.m_minBounds = make_int3( value("bbox_x_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE), value("bbox_y_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE), value("bbox_z_min", -2.0f) / (params.m_virtualVoxelSize*SDF_BLOCK_SIZE)); return params; } void SceneRep::_create(const HashParams& params) { m_hashParams = params; m_hashData.allocate(m_hashParams); reset(); } void SceneRep::_destroy() { m_hashData.free(); } void SceneRep::_alloc(int camid, cudaStream_t stream) { // NOTE (nick): We might want this later... /*if (false) { // TODO(Nick) Make this work without memcpy to host first allocate until all blocks are allocated unsigned int prevFree = 0; //getHeapFreeCount(); while (1) { resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream); allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream); unsigned int currFree = getHeapFreeCount(); if (prevFree != currFree) { prevFree = currFree; } else { break; } } } else {*/ //this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame) resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream); allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream); //} } void SceneRep::_compactifyVisible(const DepthCameraParams &camera) { //const DepthCameraData& depthCameraData) { ftl::cuda::compactifyOccupied(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance //m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU } void SceneRep::_compactifyAllocated() { ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance //std::cout << "Occ blocks = " << m_hashParams.m_numOccupiedBlocks << std::endl; //m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU } /*void SceneRep::_integrateDepthMap(const DepthCameraData& depthCameraData, const DepthCameraParams& depthCameraParams) { if (!reg_mode_) ftl::cuda::integrateDepthMap(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_); else ftl::cuda::integrateRegistration(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_); }*/ extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height); void SceneRep::_integrateDepthMaps() { //cudaSafeCall(cudaDeviceSynchronize()); for (size_t i=0; i<cameras_.size(); ++i) { if (!cameras_[i].source->isReady()) continue; //ftl::cuda::clear_depth(*(cameras_[i].gpu.depth2_tex_), integ_stream_); ftl::cuda::clear_points(*(cameras_[i].gpu.points_tex_), integ_stream_); ftl::cuda::mls_smooth(*(cameras_[i].gpu.points_tex_), m_hashParams, cameras_.size(), i, integ_stream_); //ftl::cuda::int_to_float(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), 1.0f / 1000.0f, integ_stream_); //ftl::cuda::hole_fill(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), cameras_[i].params, integ_stream_); //bilateralFilterFloatMap(cameras_[i].gpu.depth_tex_->devicePtr(), cameras_[i].gpu.depth3_tex_->devicePtr(), 3, 7, cameras_[i].gpu.depth_tex_->width(), cameras_[i].gpu.depth_tex_->height()); } if (value("voxels", false)) ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_); } void SceneRep::_garbageCollect() { //ftl::cuda::garbageCollectIdentify(m_hashData, m_hashParams, integ_stream_); resetHashBucketMutexCUDA(m_hashData, m_hashParams, integ_stream_); //needed if linked lists are enabled -> for memeory deletion ftl::cuda::garbageCollectFree(m_hashData, m_hashParams, integ_stream_); }