Commit f66bd0d0 authored by Nicolas Pope

WIP Refactor GPU camera

parent b63a157f
1 merge request: !109 Resolves #173 remove voxel code
Pipeline #13555 failed
@@ -5,9 +5,7 @@
set(REPSRC
src/main.cpp
src/voxel_scene.cpp
src/scene_rep_hash_sdf.cu
#src/ray_cast_sdf.cu
src/voxel_render.cu
src/camera_util.cu
#src/ray_cast_sdf.cpp
src/registration.cpp
...
@@ -19,6 +19,8 @@ bool ILW::process(ftl::rgbd::FrameSet &fs) {
for (int j=0; j<3; ++j) {
_phase2(fs);
}
// TODO: Break if no time left
}
return true;
@@ -26,10 +28,14 @@ bool ILW::process(ftl::rgbd::FrameSet &fs) {
bool ILW::_phase0(ftl::rgbd::FrameSet &fs) {
// Clear points channel...
// Upload camera data?
}
bool ILW::_phase1(ftl::rgbd::FrameSet &fs) {
// Run correspondence kernel to find points
// For each camera combination
}
bool ILW::_phase2(ftl::rgbd::FrameSet &fs) {
...
@@ -13,6 +13,12 @@ struct ILWData{
ftl::cuda::TextureObject<float4> correspondence;
ftl::cuda::TextureObject<float4> points;
// Residual potential energy
ftl::cuda::TextureObject<float> residual;
// Flow magnitude
ftl::cuda::TextureObject<float> flow;
};
}
...
#include "splat_render_cuda.hpp"
#include <cuda_runtime.h>
#include <ftl/cuda_matrix_util.hpp>
#include "splat_params.hpp"
#define T_PER_BLOCK 8
#define NUM_GROUPS_X 1024
#define NUM_CUDA_BLOCKS 10000
using ftl::cuda::TextureObject;
using ftl::render::SplatParams;
__global__ void clearDepthKernel(ftl::voxhash::HashData hashData, TextureObject<int> depth) {
const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < depth.width() && y < depth.height()) {
depth(x,y) = 0x7f800000; //PINF;
//colour(x,y) = make_uchar4(76,76,82,0);
}
}
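The clear value 0x7f800000 is the IEEE-754 bit pattern of +INF, so the integer depth buffer starts "infinitely far" and later atomicMin calls can only bring pixels closer. A minimal readback sketch, assuming TextureObject<float> exposes the same (x,y) accessor as TextureObject<int>; this kernel is not part of the commit:

// Sketch (not in this commit): convert the atomicMin'd millimetre depth
// buffer back into a float depth map in metres.
__global__ void intDepthToFloatKernel(TextureObject<int> idepth, TextureObject<float> fdepth) {
	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
	if (x < idepth.width() && y < idepth.height()) {
		const int d = idepth(x,y);
		// Pixels never touched by atomicMin still hold the +INF clear value.
		fdepth(x,y) = (d == 0x7f800000) ? PINF : static_cast<float>(d) / 1000.0f;
	}
}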
#define SDF_BLOCK_SIZE_PAD 8
#define SDF_BLOCK_BUFFER 512 // > 8x8x8
#define SDF_DX 1
#define SDF_DY SDF_BLOCK_SIZE_PAD
#define SDF_DZ (SDF_BLOCK_SIZE_PAD*SDF_BLOCK_SIZE_PAD)
#define LOCKED 0x7FFFFFFF
//! computes the (local) virtual voxel pos of an index; idx in [0;511]
__device__
int3 pdelinVoxelIndex(uint idx) {
int x = idx % SDF_BLOCK_SIZE_PAD;
int y = (idx % (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD)) / SDF_BLOCK_SIZE_PAD;
int z = idx / (SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD);
return make_int3(x,y,z);
}
//! computes the linearized index of a local virtual voxel pos; pos in [0;7]^3
__device__
uint plinVoxelPos(const int3& virtualVoxelPos) {
return
virtualVoxelPos.z * SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.y * SDF_BLOCK_SIZE_PAD +
virtualVoxelPos.x;
}
//! computes the linearized index of a local virtual voxel pos; pos in [0;7]^3
__device__
uint plinVoxelPos(int x, int y, int z) {
return
z * SDF_BLOCK_SIZE_PAD * SDF_BLOCK_SIZE_PAD +
y * SDF_BLOCK_SIZE_PAD + x;
}
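pdelinVoxelIndex and plinVoxelPos are intended to be inverses over the padded 8x8x8 block. A small sanity-check kernel illustrating that relationship (an illustrative sketch, not part of the commit):

// Sketch: every padded index in [0, SDF_BLOCK_BUFFER) should survive a
// delinearize -> linearize round trip.
__global__ void checkPaddedIndexRoundTrip(int *errors) {
	const uint idx = blockIdx.x*blockDim.x + threadIdx.x;
	if (idx >= SDF_BLOCK_BUFFER) return;
	const int3 p = pdelinVoxelIndex(idx);               // idx -> (x,y,z) in the padded block
	if (plinVoxelPos(p.x, p.y, p.z) != idx) atomicAdd(errors, 1);
}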
__device__
void deleteVoxel(ftl::voxhash::Voxel& v) {
v.color = make_uchar3(0,0,0);
v.weight = 0;
v.sdf = PINF;
}
__device__ inline int3 blockDelinear(const int3 &base, uint i) {
return make_int3(base.x + (i & 0x1), base.y + ((i >> 1) & 0x1), base.z + ((i >> 2) & 0x1)); // decode one bit per axis so this inverts blockLinear below
}
__device__ inline uint blockLinear(int x, int y, int z) {
return x + (y << 1) + (z << 2);
}
__device__ inline bool getVoxel(uint *voxels, int ix) {
return voxels[ix/32] & (0x1 << (ix % 32));
}
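getVoxel reads one bit of the 512-bit occupancy mask (16 uints) stored per block, one bit per voxel of the 8x8x8 block. A hypothetical counterpart that sets a bit atomically might look like this (sketch, not part of the commit):

// Hypothetical setter for the per-block occupancy mask.
__device__ inline void setVoxel(uint *voxels, int ix) {
	atomicOr(&voxels[ix/32], 0x1u << (ix % 32));
}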
__global__ void occupied_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
__shared__ uint voxels[16];
__shared__ ftl::voxhash::HashEntryHead block;
// Stride over all allocated blocks
for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
__syncthreads();
const uint i = threadIdx.x; //inside of an SDF block
if (i == 0) block = hashData.d_hashCompactified[bi]->head;
if (i < 16) {
voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
}
// Make sure all hash entries are cached
__syncthreads();
const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ));
const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
const int3 pi = pi_base + vp;
const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
const bool v = getVoxel(voxels, i);
uchar4 color = make_uchar4(255,0,0,255);
bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
// Only for surface voxels, work out screen coordinates
if (!is_surface) continue;
// TODO: For each original camera, render a new depth map
const float3 camPos = params.m_viewMatrix * worldPos;
const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f)
//printf("Worldpos: %f,%f,%f\n", camPos.x, camPos.y, camPos.z);
if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
const unsigned int x = screenPos.x;
const unsigned int y = screenPos.y;
const int idepth = static_cast<int>(camPos.z * 1000.0f);
// See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds
if (x < depth.width() && y < depth.height()) {
atomicMin(&depth(x,y), idepth);
}
} // Stride
}
__global__ void isosurface_image_kernel(ftl::voxhash::HashData hashData, TextureObject<int> depth, SplatParams params) {
// TODO:(Nick) Reduce bank conflicts by aligning these
__shared__ uint voxels[16];
//__shared__ uint valid[16];
__shared__ ftl::voxhash::HashEntryHead block;
// Stride over all allocated blocks
for (int bi=blockIdx.x; bi<*hashData.d_hashCompactifiedCounter; bi+=NUM_CUDA_BLOCKS) {
__syncthreads();
const uint i = threadIdx.x; //inside of an SDF block
if (i == 0) block = hashData.d_hashCompactified[bi]->head;
if (i < 16) {
voxels[i] = hashData.d_hashCompactified[bi]->voxels[i];
//valid[i] = hashData.d_hashCompactified[bi]->validity[i];
}
// Make sure all hash entries are cached
__syncthreads();
const int3 pi_base = hashData.SDFBlockToVirtualVoxelPos(make_int3(block.posXYZ));
const int3 vp = make_int3(hashData.delinearizeVoxelIndex(i));
const int3 pi = pi_base + vp;
//const uint j = plinVoxelPos(vp); // Padded linear index
const float3 worldPos = hashData.virtualVoxelPosToWorld(pi);
// Load distances and colours into shared memory + padding
//const ftl::voxhash::Voxel &v = hashData.d_SDFBlocks[block.ptr + i];
//voxels[j] = v;
const bool v = getVoxel(voxels, i);
//__syncthreads();
//if (voxels[j].weight == 0) continue;
if (vp.x == 7 || vp.y == 7 || vp.z == 7) continue;
int edgeX = (vp.x == 0 ) ? 1 : 0;
int edgeY = (vp.y == 0 ) ? 1 : 0;
int edgeZ = (vp.z == 0 ) ? 1 : 0;
uchar4 color = make_uchar4(255,0,0,255);
bool is_surface = v; //((params.m_flags & ftl::render::kShowBlockBorders) && edgeX + edgeY + edgeZ >= 2);
//if (is_surface) color = make_uchar4(255,(vp.x == 0 && vp.y == 0 && vp.z == 0) ? 255 : 0,0,255);
if (v) continue; // !getVoxel(valid, i)
//if (vp.z == 7) voxels[j].color = make_uchar3(0,255,(voxels[j].sdf < 0.0f) ? 255 : 0);
// Identify surfaces through sign change. Since we only check in one direction,
// any sign change is sufficient to mark this voxel as a surface.
#pragma unroll
for (int u=0; u<=1; u++) {
for (int v=0; v<=1; v++) {
for (int w=0; w<=1; w++) {
const int3 uvi = make_int3(vp.x+u,vp.y+v,vp.z+w);
// Skip these cases since we didn't load voxels properly
//if (uvi.x == 8 || uvi.z == 8 || uvi.y == 8) continue;
const bool vox = getVoxel(voxels, hashData.linearizeVoxelPos(uvi));
if (vox) { //getVoxel(valid, hashData.linearizeVoxelPos(uvi))) {
is_surface = true;
// Should break but is slower?
}
}
}
}
// Only for surface voxels, work out screen coordinates
if (!is_surface) continue;
// TODO: For each original camera, render a new depth map
const float3 camPos = params.m_viewMatrix * worldPos;
const float2 screenPosf = params.camera.cameraToKinectScreenFloat(camPos);
const uint2 screenPos = make_uint2(make_int2(screenPosf)); // + make_float2(0.5f, 0.5f)
//printf("Worldpos: %f,%f,%f\n", camPos.x, camPos.y, camPos.z);
if (camPos.z < params.camera.m_sensorDepthWorldMin) continue;
// For this voxel in hash, get its screen position and check it is on screen
// Convert depth map to int by x1000 and use atomicMin
//const int pixsize = static_cast<int>((c_hashParams.m_virtualVoxelSize*params.camera.fx/(camPos.z*0.8f)))+1; // Magic number increase voxel to ensure coverage
const unsigned int x = screenPos.x;
const unsigned int y = screenPos.y;
const int idepth = static_cast<int>(camPos.z * 1000.0f);
// See: Gunther et al. 2013. A GPGPU-based Pipeline for Accelerated Rendering of Point Clouds
if (x < depth.width() && y < depth.height()) {
atomicMin(&depth(x,y), idepth);
}
} // Stride
}
void ftl::cuda::isosurface_point_image(const ftl::voxhash::HashData& hashData,
const TextureObject<int> &depth,
const SplatParams &params, cudaStream_t stream) {
const dim3 clear_gridSize((depth.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
const dim3 clear_blockSize(T_PER_BLOCK, T_PER_BLOCK);
clearDepthKernel<<<clear_gridSize, clear_blockSize, 0, stream>>>(hashData, depth);
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
const unsigned int threadsPerBlock = SDF_BLOCK_SIZE*SDF_BLOCK_SIZE*SDF_BLOCK_SIZE;
const dim3 gridSize(NUM_CUDA_BLOCKS, 1);
const dim3 blockSize(threadsPerBlock, 1);
occupied_image_kernel<<<gridSize, blockSize, 0, stream>>>(hashData, depth, params);
cudaSafeCall( cudaGetLastError() );
#ifdef _DEBUG
cudaSafeCall(cudaDeviceSynchronize());
#endif
}
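A hypothetical per-camera driver for the entry point above (the wrapper name and the explicit synchronisation are assumptions; isosurface_point_image itself is as declared here):

// Sketch: splat a depth view for one camera into an already-allocated buffer.
void renderDepthForCamera(const ftl::voxhash::HashData &hashData,
		const ftl::render::SplatParams &params,
		const ftl::cuda::TextureObject<int> &depth,
		cudaStream_t stream) {
	ftl::cuda::isosurface_point_image(hashData, depth, params, stream);
	cudaSafeCall(cudaStreamSynchronize(stream));  // only if the result is consumed immediately on the host
}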
#include <ftl/voxel_scene.hpp>
#include "compactors.hpp"
#include "garbage.hpp"
#include "integrators.hpp"
#include "depth_camera_cuda.hpp"
#include <opencv2/core/cuda_stream_accessor.hpp>
@@ -16,9 +13,9 @@ using std::vector;
#define SAFE_DELETE_ARRAY(a) { delete [] (a); (a) = NULL; }
extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t);
extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, int camid, const DepthCameraParams &depthCameraParams, cudaStream_t);
//extern "C" void resetCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
//extern "C" void resetHashBucketMutexCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, cudaStream_t);
//extern "C" void allocCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, int camid, const DepthCameraParams &depthCameraParams, cudaStream_t);
//extern "C" void fillDecisionArrayCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams, const DepthCameraData& depthCameraData);
//extern "C" void compactifyHashCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
//extern "C" unsigned int compactifyHashAllInOneCUDA(ftl::voxhash::HashData& hashData, const ftl::voxhash::HashParams& hashParams);
@@ -184,7 +181,7 @@ int SceneRep::upload() {
//if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream)));
//allocate all hash blocks which are corresponding to depth map entries
if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
//if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
// Calculate normals
}
@@ -264,7 +261,7 @@ int SceneRep::upload(ftl::rgbd::FrameSet &fs) {
//if (i > 0) cudaSafeCall(cudaStreamSynchronize(cv::cuda::StreamAccessor::getStream(cameras_[i-1].stream)));
//allocate all hash blocks which are corresponding to depth map entries
if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
//if (value("voxels", false)) _alloc(i, cv::cuda::StreamAccessor::getStream(cam.stream));
// Calculate normals
}
@@ -299,7 +296,7 @@ void SceneRep::integrate() {
void SceneRep::garbage() {
//_compactifyAllocated();
if (value("voxels", false)) _garbageCollect();
//if (value("voxels", false)) _garbageCollect();
//cudaSafeCall(cudaStreamSynchronize(integ_stream_));
}
@@ -419,19 +416,19 @@ void SceneRep::_alloc(int camid, cudaStream_t stream) {
}
else {*/
//this version is faster, but it doesn't guarantee that all blocks are allocated (staggers alloc to the next frame)
resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream);
allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream);
//resetHashBucketMutexCUDA(m_hashData, m_hashParams, stream);
//allocCUDA(m_hashData, m_hashParams, camid, cameras_[camid].params, stream);
//}
}
void SceneRep::_compactifyVisible(const DepthCameraParams &camera) { //const DepthCameraData& depthCameraData) {
ftl::cuda::compactifyOccupied(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance
//ftl::cuda::compactifyOccupied(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance
//m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
}
void SceneRep::_compactifyAllocated() {
ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance
//ftl::cuda::compactifyAllocated(m_hashData, m_hashParams, integ_stream_); //this version uses atomics over prefix sums, which has a much better performance
//std::cout << "Occ blocks = " << m_hashParams.m_numOccupiedBlocks << std::endl;
//m_hashData.updateParams(m_hashParams); //make sure numOccupiedBlocks is updated on the GPU
}
@@ -441,7 +438,7 @@ void SceneRep::_compactifyAllocated() {
else ftl::cuda::integrateRegistration(m_hashData, m_hashParams, depthCameraData, depthCameraParams, integ_stream_);
}*/
extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height);
//extern "C" void bilateralFilterFloatMap(float* d_output, float* d_input, float sigmaD, float sigmaR, unsigned int width, unsigned int height);
void SceneRep::_integrateDepthMaps() {
//cudaSafeCall(cudaDeviceSynchronize());
@@ -456,11 +453,11 @@ void SceneRep::_integrateDepthMaps() {
//ftl::cuda::hole_fill(*(cameras_[i].gpu.depth2_tex_), *(cameras_[i].gpu.depth_tex_), cameras_[i].params, integ_stream_);
//bilateralFilterFloatMap(cameras_[i].gpu.depth_tex_->devicePtr(), cameras_[i].gpu.depth3_tex_->devicePtr(), 3, 7, cameras_[i].gpu.depth_tex_->width(), cameras_[i].gpu.depth_tex_->height());
}
if (value("voxels", false)) ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
//if (value("voxels", false)) ftl::cuda::integrateDepthMaps(m_hashData, m_hashParams, cameras_.size(), integ_stream_);
}
void SceneRep::_garbageCollect() {
//ftl::cuda::garbageCollectIdentify(m_hashData, m_hashParams, integ_stream_);
resetHashBucketMutexCUDA(m_hashData, m_hashParams, integ_stream_); //needed if linked lists are enabled -> for memory deletion
ftl::cuda::garbageCollectFree(m_hashData, m_hashParams, integ_stream_);
//resetHashBucketMutexCUDA(m_hashData, m_hashParams, integ_stream_); //needed if linked lists are enabled -> for memory deletion
//ftl::cuda::garbageCollectFree(m_hashData, m_hashParams, integ_stream_);
}
@@ -2,23 +2,69 @@
#ifndef _FTL_RGBD_CAMERA_PARAMS_HPP_
#define _FTL_RGBD_CAMERA_PARAMS_HPP_
#include <vector_types.h>
#include <cuda_runtime.h>
#include <ftl/cuda_util.hpp>
namespace ftl{
namespace rgbd {
struct Camera {
double fx;
double fy;
double cx;
double cy;
unsigned int width;
unsigned int height;
double minDepth;
double maxDepth;
double baseline;
double doffs;
/**
* All properties associated with cameras. This structure is designed to
* operate on CPU and GPU.
*/
struct __align__(16) Camera {
double fx; // Focal length X
double fy; // Focal length Y (usually same as fx)
double cx; // Principal point X
double cy; // Principal point Y
unsigned int width; // Pixel width
unsigned int height; // Pixel height
double minDepth; // Near clip in meters
double maxDepth; // Far clip in meters
double baseline; // For stereo pair
double doffs; // Disparity offset
/**
* Convert camera coordinates into screen coordinates.
*/
template <typename T> __device__ T camToScreen(const float3 &pos) const;
/**
* Convert screen plus depth into camera coordinates.
*/
__device__ float3 screenToCam(uint ux, uint uy, float depth) const;
};
};
};
// ---- IMPLEMENTATIONS --------------------------------------------------------
template <> __device__
inline float2 ftl::rgbd::Camera::camToScreen<float2>(const float3 &pos) const {
return make_float2(
pos.x*fx/pos.z + cx,
pos.y*fy/pos.z + cy);
}
template <> __device__
inline int2 ftl::rgbd::Camera::camToScreen<int2>(const float3 &pos) const {
float2 pImage = camToScreen<float2>(pos);
return make_int2(pImage + make_float2(0.5f, 0.5f));
}
template <> __device__
inline uint2 ftl::rgbd::Camera::camToScreen<uint2>(const float3 &pos) const {
int2 p = camToScreen<int2>(pos);
return make_uint2(p.x, p.y);
}
__device__
inline float3 ftl::rgbd::Camera::screenToCam(uint ux, uint uy, float depth) const {
const float x = ((float)ux-cx) / fx;
const float y = ((float)uy-cy) / fy;
return make_float3(depth*x, depth*y, depth);
}
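A small device-side sketch (not part of the header) showing how the two helpers compose: projecting a camera-space point to a pixel and unprojecting it again should reproduce the point up to pixel quantisation.

// Sketch: project, then unproject at the same depth.
__device__ inline float3 reprojectThroughPixel(const ftl::rgbd::Camera &cam, const float3 &camPos) {
	const int2 s = cam.camToScreen<int2>(camPos);          // camera space -> nearest pixel
	return cam.screenToCam(s.x, s.y, camPos.z);            // pixel + depth -> camera space
}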
#endif
#ifndef _FTL_SCENE_FRAMESCENE_HPP_
#define _FTL_SCENE_FRAMESCENE_HPP_
#include <ftl/scene/scene.hpp>
namespace ftl {
namespace scene {
/**
* A scene represented internally as a set of image frames that together
* define a point cloud.
*/
class FrameScene : public ftl::scene::Scene {
public:
FrameScene();
~FrameScene();
bool update(ftl::rgbd::FrameSet &);
bool render(ftl::rgbd::Source *, ftl::rgbd::Frame &);
bool encode(std::vector<uint8_t> &);
bool decode(const std::vector<uint8_t> &);
};
}
}
#endif // _FTL_SCENE_FRAMESCENE_HPP_
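A hypothetical usage sketch of the interface declared above (the wrapper function and argument names are assumptions; the class is still WIP in this commit):

// Sketch: integrate the latest frames and render a novel view.
bool updateAndRender(ftl::scene::FrameScene &scene,
		ftl::rgbd::FrameSet &fs,
		ftl::rgbd::Source *src,
		ftl::rgbd::Frame &out) {
	if (!scene.update(fs)) return false;   // absorb the new RGB-D frame set
	return scene.render(src, out);         // render the scene from the given source's viewpoint
}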
@@ -2,19 +2,20 @@
#define _FTL_RECONSTRUCT_SCENE_HPP_
namespace ftl {
namespace scene {
class Scene {
public:
Scene();
~Scene();
virtual ~Scene();
void getFrame(Eigen::Matrix4f &pose, ftl::rgbd::Frame &);
ftl::rgbd::Frame getFrame(Eigen::Matrix4f &pose);
virtual bool render(ftl::rgbd::Source *, ftl::rgbd::Frame &)=0;
void getFrameSet(ftl::rgbd::FrameSet &);
ftl::rgbd::FrameSet &getFrameSet();
virtual bool encode(std::vector<uint8_t> &)=0;
virtual bool decode(const std::vector<uint8_t> &)=0;
};
}
} // scene
} // ftl
#endif // _FTL_RECONSTRUCT_SCENE_HPP_