From e98725a4c2784a165835f1cd559cfaa7dd6939ee Mon Sep 17 00:00:00 2001 From: Nicolas Pope <nicolas.pope@utu.fi> Date: Thu, 20 Feb 2020 13:29:05 +0200 Subject: [PATCH] Improvements to depth alignment --- applications/gui/src/camera.cpp | 6 +- applications/gui/src/src_window.cpp | 2 +- .../common/cpp/include/ftl/cuda_half.hpp | 5 + .../common/cpp/include/ftl/cuda_texture.hpp | 9 + components/common/cpp/include/ftl/traits.hpp | 1 + .../operators/include/ftl/operators/mvmls.hpp | 2 + .../operators/src/fusion/correspondence.cu | 545 +++++++++++++----- .../src/fusion/correspondence_common.hpp | 16 +- .../src/fusion/correspondence_depth.cu | 41 +- .../src/fusion/correspondence_util.cu | 108 +++- components/operators/src/fusion/mvmls.cpp | 155 ++++- .../operators/src/fusion/mvmls_cuda.hpp | 48 ++ 12 files changed, 763 insertions(+), 175 deletions(-) diff --git a/applications/gui/src/camera.cpp b/applications/gui/src/camera.cpp index 961728d88..15e9ce265 100644 --- a/applications/gui/src/camera.cpp +++ b/applications/gui/src/camera.cpp @@ -347,10 +347,14 @@ void ftl::gui::Camera::_draw(std::vector<ftl::rgbd::FrameSet*> &fss) { // FIXME: Should perhaps remain locked until after end is called? // Definitely: causes flashing if not. - UNIQUE_LOCK(fs->mtx,lk); + //UNIQUE_LOCK(fs->mtx,lk); + fs->mtx.lock(); renderer_->submit(fs, Channels<0>(channel_), transforms_[fs->id]); } renderer_->end(); + for (auto *fs : fss) { + fs->mtx.unlock(); + } } if (!post_pipe_) { diff --git a/applications/gui/src/src_window.cpp b/applications/gui/src/src_window.cpp index 1fb77074c..bf3712995 100644 --- a/applications/gui/src/src_window.cpp +++ b/applications/gui/src/src_window.cpp @@ -237,8 +237,8 @@ void SourceWindow::_checkFrameSets(int id) { p->append<ftl::operators::CullWeight>("remove_weights")->value("enabled", false); p->append<ftl::operators::DegradeWeight>("degrade"); p->append<ftl::operators::VisCrossSupport>("viscross")->set("enabled", false); - p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false); p->append<ftl::operators::CullDiscontinuity>("remove_discontinuity"); + p->append<ftl::operators::MultiViewMLS>("mvmls")->value("enabled", false); pre_pipelines_.push_back(p); framesets_.push_back(new ftl::rgbd::FrameSet); diff --git a/components/common/cpp/include/ftl/cuda_half.hpp b/components/common/cpp/include/ftl/cuda_half.hpp index f02b2d5ba..d35be793b 100644 --- a/components/common/cpp/include/ftl/cuda_half.hpp +++ b/components/common/cpp/include/ftl/cuda_half.hpp @@ -15,6 +15,11 @@ __host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half4>() { return cudaCreateChannelDesc<short4>(); } +template <> +__host__ inline cudaChannelFormatDesc cudaCreateChannelDesc<half>() { + return cudaCreateChannelDesc<short>(); +} + #ifdef __CUDACC__ // half4 functions diff --git a/components/common/cpp/include/ftl/cuda_texture.hpp b/components/common/cpp/include/ftl/cuda_texture.hpp index fa6c41866..c795cd4f7 100644 --- a/components/common/cpp/include/ftl/cuda_texture.hpp +++ b/components/common/cpp/include/ftl/cuda_texture.hpp @@ -15,6 +15,7 @@ template <> struct Float<uchar4> { typedef float4 type; }; template <> struct Float<uint8_t> { typedef float type; }; template <> struct Float<short2> { typedef float2 type; }; template <> struct Float<short> { typedef float type; }; +template <> struct Float<half> { typedef float type; }; template <typename T> struct ScaleValue; @@ -25,6 +26,7 @@ template <> struct ScaleValue<float> { static constexpr float value = 1.0f; }; template <> struct ScaleValue<float4> { static constexpr float value = 1.0f; }; template <> struct ScaleValue<half4> { static constexpr float value = 1.0f; }; template <> struct ScaleValue<short> { static constexpr float value = 32000.0f; }; +template <> struct ScaleValue<half> { static constexpr float value = 1.0f; }; /** * Represent a CUDA texture object. Instances of this class can be used on both @@ -110,12 +112,19 @@ class TextureObject : public TextureObjectBase { #ifdef __CUDACC__ __device__ inline T tex2D(int u, int v) const { return ::tex2D<T>(texobj_, u, v); } + __device__ inline T tex2D(const int2 &p) const { return ::tex2D<T>(texobj_, p.x, p.y); } + __device__ inline T tex2D(const short2 &p) const { return ::tex2D<T>(texobj_, p.x, p.y); } __device__ inline T tex2D(unsigned int u, unsigned int v) const { return ::tex2D<T>(texobj_, (int)u, (int)v); } __device__ inline typename Float<T>::type tex2D(float u, float v) const { return ::tex2D<typename Float<T>::type>(texobj_, u, v) * ScaleValue<T>::value; } + __device__ inline typename Float<T>::type tex2D(const float2 &p) const { return ::tex2D<typename Float<T>::type>(texobj_, p.x, p.y) * ScaleValue<T>::value; } #endif __host__ __device__ inline const T &operator()(int u, int v) const { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + __host__ __device__ inline const T &operator()(const int2 &p) const { return reinterpret_cast<T*>(ptr_)[p.x+p.y*pitch2_]; } + __host__ __device__ inline const T &operator()(const float2 &p) const { return reinterpret_cast<T*>(ptr_)[int(p.x+0.5f)+int(p.y+0.5f)*pitch2_]; } __host__ __device__ inline T &operator()(int u, int v) { return reinterpret_cast<T*>(ptr_)[u+v*pitch2_]; } + __host__ __device__ inline T &operator()(const int2 &p) { return reinterpret_cast<T*>(ptr_)[p.x+p.y*pitch2_]; } + __host__ __device__ inline T &operator()(const float2 &p) { return reinterpret_cast<T*>(ptr_)[int(p.x+0.5f)+int(p.y+0.5f)*pitch2_]; } /** * Cast a base texture object to this type of texture object. If the diff --git a/components/common/cpp/include/ftl/traits.hpp b/components/common/cpp/include/ftl/traits.hpp index 8abf07fc0..df44e36c7 100644 --- a/components/common/cpp/include/ftl/traits.hpp +++ b/components/common/cpp/include/ftl/traits.hpp @@ -39,6 +39,7 @@ template <> struct OpenCVType<float2> { static constexpr int value = CV_32FC2; } template <> struct OpenCVType<float3> { static constexpr int value = CV_32FC3; }; template <> struct OpenCVType<float4> { static constexpr int value = CV_32FC4; }; template <> struct OpenCVType<half4> { static constexpr int value = CV_16FC4; }; +template <> struct OpenCVType<half> { static constexpr int value = CV_16F; }; } } diff --git a/components/operators/include/ftl/operators/mvmls.hpp b/components/operators/include/ftl/operators/mvmls.hpp index 824d88e3d..3e5c476d6 100644 --- a/components/operators/include/ftl/operators/mvmls.hpp +++ b/components/operators/include/ftl/operators/mvmls.hpp @@ -20,6 +20,8 @@ class MultiViewMLS : public ftl::operators::Operator { std::vector<ftl::cuda::TextureObject<float4>*> centroid_vert_; std::vector<ftl::cuda::TextureObject<half4>*> normals_horiz_; std::vector<ftl::cuda::TextureObject<float>*> contributions_; + + ftl::cuda::TextureObject<half> costs_[2]; }; } diff --git a/components/operators/src/fusion/correspondence.cu b/components/operators/src/fusion/correspondence.cu index 2df21a24d..b19a84af9 100644 --- a/components/operators/src/fusion/correspondence.cu +++ b/components/operators/src/fusion/correspondence.cu @@ -48,12 +48,12 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams ¶ms, }*/ __device__ inline float distance(float4 p1, float4 p2) { - return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 10.0f); + return min(1.0f, max(max(fabsf(p1.x - p2.x),fabsf(p1.y - p2.y)), fabsf(p1.z - p2.z))/ 255.0f); //return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f); } __device__ inline int halfWarpCensus(float e) { - float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16 : 0, WARP_SIZE); + float e0 = __shfl_sync(FULL_MASK, e, (threadIdx.x >= 16) ? 16+6 : 0+6, WARP_SIZE); int c = (e > e0) ? 1 << (threadIdx.x % 16) : 0; for (int i = WARP_SIZE/4; i > 0; i /= 2) { const int other = __shfl_xor_sync(FULL_MASK, c, i, WARP_SIZE); @@ -90,172 +90,443 @@ __device__ float dissimilarity(const float4 &l, const float4 &rp, const float4 & return min(min(rpd, rnd), rcd); } +/** + * One thread from each half warp writes a cost value to the correct cost + * memory location. + */ +__device__ inline void writeCost(TextureObject<half> &costs, int x, int y, int i, float cost) { + if ((threadIdx.x & 0xF) == 0) costs(x>>2, y>>2 + (costs.height()/16)*i) = __float2half(cost); +} + +__device__ inline float readCost(TextureObject<half> &costs, int x, int y, int i) { + return __half2float(costs(x>>2, y>>2 + (costs.height()/16)*i)); +} +template <int SIZE> +__device__ inline void writeCost(float (&costs)[SIZE][WARP_SIZE*2], int i, float cost) { + costs[i][threadIdx.x + threadIdx.y*WARP_SIZE] = cost; +} + +template <int SIZE> +__device__ inline float readCost(const float (&costs)[SIZE][WARP_SIZE*2], int i) { + return costs[i][threadIdx.x + threadIdx.y*WARP_SIZE]; +} + +__device__ inline void writeCost(float (&costs)[WARP_SIZE*2], float cost) { + costs[threadIdx.x + threadIdx.y*WARP_SIZE] = cost; +} -template<int COR_STEPS, int FUNCTION> -__global__ void corresponding_point_kernel( +__device__ inline float readCost(const float (&costs)[WARP_SIZE*2]) { + return costs[threadIdx.x + threadIdx.y*WARP_SIZE]; +} + +template<int COR_STEPS> +__global__ void corresponding_colour_epi_kernel( + TextureObject<uchar4> colour1, + TextureObject<uchar4> colour2, TextureObject<float> d1, TextureObject<float> d2, - TextureObject<uchar4> c1, - TextureObject<uchar4> c2, TextureObject<short2> screenOut, TextureObject<float> conf, - TextureObject<uint8_t> mask, + //TextureObject<uint8_t> mask, + //TextureObject<half> costs, float4x4 pose, Camera cam1, Camera cam2, ftl::cuda::MvMLSParams params) { + + __shared__ float match_costs[COR_STEPS][WARP_SIZE*2]; + __shared__ float aggr_costs_f[COR_STEPS][WARP_SIZE*2]; + __shared__ float aggr_costs_b[COR_STEPS][WARP_SIZE*2]; + __shared__ float min_costs[WARP_SIZE*2]; + + const int2 pt = block4x4<2>(); + + // FIXME: Don't return like this due to warp operations + if (pt.x >= 0 && pt.y >=0 && pt.x < colour1.width() && pt.y < colour1.height()) { + screenOut(pt) = make_short2(-1,-1); + conf(pt) = PINF; + + const float depth1 = d1.tex2D(pt); + // TODO: If all depths within a half warp here are bad then can return + // early. + //if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return; + + short2 bestScreen = make_short2(-1,-1); + float bestcost = PINF; + float bestcost2 = PINF; + + const float3 camPosOrigin = pose * cam1.screenToCam(pt.x,pt.y,depth1); + const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin); + const float3 camPosDistant = pose * cam1.screenToCam(pt.x,pt.y,depth1 + 0.4f); + const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant); + const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x); + const float depthM = 0.4f / (lineDistant.x - lineOrigin.x); + //const float sub_pixel = depthM / (params.sub_pixel * depth1); + const float sub_pixel = params.sub_pixel; //depthM / (((depth1*depth1) / (cam1.fx * cam1.baseline)) / float(COR_STEPS/2)); + float2 linePos; + linePos.x = lineOrigin.x - float((COR_STEPS/2)) * sub_pixel; + linePos.y = lineOrigin.y - (float((COR_STEPS/2)) * lineM * sub_pixel); + + // generate census for camera1 + /*float4 c1_px_cy = colour1.tex2D(float(x)-0.5f,float(y)); + float4 c1_nx_cy = colour1.tex2D(float(x)+0.5f,float(y)); + float4 c1_cx_py = colour1.tex2D(float(x),float(y)-0.5f); + float4 c1_cx_ny = colour1.tex2D(float(x),float(y)+0.5f);*/ + float4 c1 = colour1.tex2D(float(pt.x)+0.5f,float(pt.y)+0.5f); + + int bestStep = COR_STEPS/2; + + float avgcost = 0.0f; + + //writeCost(min_costs, PINF); + + // Generate matching costs + #pragma unroll + for (int i=0; i<COR_STEPS; ++i) { + //writeCost(aggr_costs_f, i, 0.0f); + //writeCost(aggr_costs_b, i, 0.0f); + + // generate census at that location+0.5f + float4 c2 = colour2.tex2D(linePos); + /*float d = min( + distance(c1_px_cy-c1_nx_cy, c2-c1_nx_cy), + distance(c1_cx_py-c1_cx_ny, c2-c1_cx_ny) + );*/ + float d = distance(c1,c2); + //d *= d; + //d = (ftl::cuda::halfWarpSum(d*d) / 16.0f); + + if (linePos.x < 0 || linePos.x >= cam2.width || linePos.y < 0 || linePos.y >= cam2.height) d = PINF; + //if (ftl::cuda::halfWarpMax(d) == PINF) d = PINF; + + writeCost(match_costs, i, d); + bestcost = min(bestcost, d); + avgcost += d; + + // Move to next correspondence check point + linePos.x += sub_pixel*1.0f; + linePos.y += sub_pixel*lineM; + } + + writeCost(min_costs, bestcost); + bestcost = PINF; + //avgcost = avgcost / float(COR_STEPS); + + __syncwarp(); + + float confidence = 0.0f; + + // Forward aggregate matching costs + for (int i=0; i<COR_STEPS; ++i) { + float cost = readCost(match_costs,i); + float dcost = cost - ((avgcost-cost)/float(COR_STEPS-1)); + float variance = (dcost < 0.0f) ? square(dcost) : 1.0f; + cost /= sqrt(variance); + cost = (ftl::cuda::halfWarpSum(cost*cost) / 16.0f); + + cost = ftl::cuda::halfWarpSum(variance) / 16.0f; + + //float cost2 = (i>0) ? readCost(match_costs,i-1) + params.P1 : PINF; + //float cost3 = (i<COR_STEPS-1) ? readCost(match_costs,i+1) + params.P1 : PINF; + //float cost4 = readCost(min_costs) + params.P2; + + // Instead of the above... + // Add the min costs of neighbours +1, 0 and -1 steps + + //float mincost = min(cost,min(cost2,cost3)); + + //cost += (ftl::cuda::halfWarpSum(mincost) - mincost) / 15.0f; // - readCost(min_costs); // / 15.0f; + //cost /= 2.0f; + //writeCost(aggr_costs_f, i, cost); + //writeCost(min_costs, min(readCost(min_costs),cost)); + + bestStep = (cost < bestcost) ? i : bestStep; + bestcost = min(cost, bestcost); + //avgcost += cost; + } + + confidence = sqrt(confidence / float(COR_STEPS)); + //confidence = max(0.0f, (fabsf(avgcost-bestcost) / confidence) - 1.0f); // / (3.0f * confidence); + bestcost = sqrt(bestcost); + + // Backward aggregate costs + //for (int i=COR_STEPS-1; i>=0; --i) { + /*float cost = ftl::cuda::halfWarpSum(readCost(match_costs,i)); + if (i<COR_STEPS-1) cost += ftl::cuda::halfWarpSum(readCost(aggr_costs_b,i+1)+15.0f*10.0f) - readCost(aggr_costs_b,i+1); + if (i<COR_STEPS-2) cost += ftl::cuda::halfWarpSum(readCost(aggr_costs_b,i+2)+15.0f*30.0f) - readCost(aggr_costs_b,i+2); + float N = (i<COR_STEPS-2) ? 46.0f : (i<COR_STEPS-1) ? 31.0f : 16.0f; + writeCost(aggr_costs_b, i, cost / N);*/ + //} + + //__syncwarp(); + + // Find best cost and use that + /*for (int i=0; i<COR_STEPS; ++i) { + float cost = readCost(aggr_costs_f,i) + readCost(aggr_costs_b,i); + bestStep = (cost < bestcost) ? i : bestStep; + bestcost = min(cost, bestcost); + }*/ + + float bestadjust = float(bestStep-(COR_STEPS/2))*depthM*sub_pixel; + bestScreen = make_short2(lineOrigin.x + float(bestStep-(COR_STEPS/2))*sub_pixel + 0.5f, + lineOrigin.y + float(bestStep-(COR_STEPS/2))*lineM*sub_pixel + 0.5f); + + // Detect matches to boundaries, and discard those + //uint stepMask = 1 << bestStep; + //if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestcost = PINF; + + // TODO: Validate depth? + + // Check that no boundaries were matched in 4x4 block + //bestcost = ftl::cuda::halfWarpMax(bestcost); + + //float confidence = (avgcost == PINF) ? 1.0f : (bestcost / (avgcost/float(COR_STEPS))); + + // If a match was found then record it + if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth) { + // Delay making the depth change until later. + + //if (y == 200) printf("Cost: %d\n", bestScreen.x); + + /*if (x % 2 == 0 && y % 2 == 0) { + const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x); + if (fabs(d2.tex2D(bestScreen.x, bestScreen.y) - (camPosOrigin.z + float(bestStep-(COR_STEPS/2))*depthM2*sub_pixel)) < 0.1f*camPosOrigin.z) { + float4 c2 = colour2.tex2D(float(bestScreen.x)+0.5f, float(bestScreen.y)+0.5f); + colour1(x,y) = make_uchar4(c2.x, c2.y, c2.z, 0.0f); + } + }*/ + + const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x); + if (fabsf(d2.tex2D(bestScreen) - (camPosOrigin.z + float(bestStep-(COR_STEPS/2))*depthM2*sub_pixel)) < 0.1f*camPosOrigin.z) { + if (bestcost < PINF && bestStep != 0 && bestStep != COR_STEPS-1) { + conf(pt) = bestcost; + //colour1(x,y) = make_uchar4(bestc2.x, bestc2.y, bestc2.z, 0.0f); + //mask(x,y) = mask(x,y) | Mask::kMask_Correspondence; + screenOut(pt) = bestScreen; + } + } + } + + // Doesn't work + //if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost > 0.75f) { + // mask(x,y) = mask(x,y) | Mask::kMask_Bad; + //} + } +} + +template <int COR_STEPS> +__global__ void aggregate_colour_costs_kernel( + TextureObject<float> d1, + TextureObject<float> d2, + TextureObject<half> costs1, + TextureObject<half> costs2, + TextureObject<short2> screenOut, + TextureObject<float> conf, + float4x4 pose, + Camera cam1, + Camera cam2, + ftl::cuda::MvMLSParams params + ) { //const int tid = (threadIdx.x + threadIdx.y * blockDim.x); const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE; const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y; - - if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) { - screenOut(x,y) = make_short2(-1,-1); + // FIXME: Don't return like this due to warp operations + if (x >= 0 && y >=0 && x < d1.width() && y < d1.height()) { + screenOut(x,y) = make_short2(-1,-1); + conf(x,y) = 0.0f; - //const float3 world1 = make_float3(p1.tex2D(x, y)); - const float depth1 = d1.tex2D(x,y); //(pose1_inv * world1).z; // Initial starting depth - if (depth1 < cam1.minDepth || depth1 > cam1.maxDepth) return; + const float depth1 = d1.tex2D(x,y); + // TODO: If all depths within a half warp here are bad then can return + // early. + if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return; - // TODO: Temporary hack to ensure depth1 is present - //const float4 temp = vout.tex2D(x,y); - //vout(x,y) = make_float4(depth1, 0.0f, temp.z, temp.w); - - //const float3 world1 = pose1 * cam1.screenToCam(x,y,depth1); - - const auto colour1 = c1.tex2D((float)x+0.5f, (float)y+0.5f); - - //float bestdepth = 0.0f; short2 bestScreen = make_short2(-1,-1); - //float bestdepth = 0.0f; - //float bestdepth2 = 0.0f; - float bestweight = 0.0f; - float bestcolour = 0.0f; - //float bestdweight = 0.0f; - float totalcolour = 0.0f; - //int count = 0; - //float contrib = 0.0f; + float bestcost = PINF; + float bestcost2 = PINF; const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1); const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin); - const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 10.0f); + const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 0.4f); const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant); - const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x); - const float depthM = 10.0f / (lineDistant.x - lineOrigin.x); + const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x); + const float depthM = 0.4f / (lineDistant.x - lineOrigin.x); const float depthM2 = (camPosDistant.z - camPosOrigin.z) / (lineDistant.x - lineOrigin.x); + const float sub_pixel = depthM / (((depth1*depth1) / (cam1.fx * cam1.baseline)) / float(COR_STEPS/2)); + float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2 * sub_pixel); float2 linePos; - linePos.x = lineOrigin.x - ((COR_STEPS/2)); - linePos.y = lineOrigin.y - (((COR_STEPS/2)) * lineM); - //float depthPos = depth1 - (float((COR_STEPS/2)) * depthM); - float depthPos2 = camPosOrigin.z - (float((COR_STEPS/2)) * depthM2); - //const float depthCoef = cam1.baseline*cam1.fx; - - const float depthCoef = (1.0f / cam1.fx) * 10.0f; - + linePos.x = lineOrigin.x - float((COR_STEPS/2)) * sub_pixel; + linePos.y = lineOrigin.y - (float((COR_STEPS/2)) * lineM * sub_pixel); + uint badMask = 0; int bestStep = COR_STEPS/2; + //float bestSource = 0.0f; + // Scan from -COR_STEPS/2 to +COR_STEPS/2, where COR_STEPS is the number + // of pixels to check in the second camera for the current first camera + // pixel. - // Project to p2 using cam2 - // Each thread takes a possible correspondence and calculates a weighting - //const int lane = tid % WARP_SIZE; + #pragma unroll for (int i=0; i<COR_STEPS; ++i) { - //float weight = 1.0f; //(linePos.x >= cam2.width || linePos.y >= cam2.height) ? 0.0f : 1.0f; - - // Generate a colour correspondence value - const auto colour2 = c2.tex2D(linePos.x, linePos.y); - - // TODO: Check if other colour dissimilarities are better... - const float cweight = ftl::cuda::colourWeighting(colour1, colour2, params.colour_smooth); - - // Generate a depth correspondence value - const float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f)); - - // Record which correspondences are invalid - badMask |= ( - depth2 <= cam2.minDepth || - depth2 >= cam2.maxDepth || - linePos.x < 0.5f || - linePos.y < 0.5f || - linePos.x >= d2.width()-0.5f || - linePos.y >= d2.height()-0.5f - ) ? 1 << i : 0; - - //if (FUNCTION == 1) { - // TODO: Spatial smooth must be disparity discon threshold of largest depth in original camera space. - // ie. if depth1 > depth2 then depth1 has the largest error potential and the resolution at that - // depth is used to determine the spatial smoothing amount. - - const float maxdepth = max(depth1, depth2); - const float smooth = depthCoef * maxdepth; - float weight = ftl::cuda::weighting(fabs(depth2 - depthPos2), cweight*smooth); - //weight = ftl::cuda::halfWarpSum(weight); - //} else { - // const float dweight = ftl::cuda::weighting(fabs(depth2 - depthPos2), params.spatial_smooth); - // weight *= weightFunction<FUNCTION>(params, dweight, cweight); - //} - //const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range); - - //weight *= weightFunction<FUNCTION>(params, dweight, cweight); - - //++count; - - bestcolour = max(cweight, bestcolour); - totalcolour += cweight; - - //bestdepth = (weight > bestweight) ? depthPos : bestdepth; - bestStep = (weight > bestweight) ? i : bestStep; - - bestweight = max(bestweight, weight); + // generate census at that location+0.5f + float depth2 = d2.tex2D(int(linePos.x+0.5f), int(linePos.y+0.5f)); + const float sub_pixel2 = (((depth2*depth2) / (cam2.fx * cam2.baseline)) / float(COR_STEPS/2)); + int index = int((depthPos2-depth2) / sub_pixel2) + COR_STEPS/2; + + float c1 = readCost(costs1, x, y, i); + float c2 = (depthPos2 > cam2.minDepth && depthPos2 < cam2.maxDepth && index >= 0 && index < COR_STEPS && linePos.x >= 0.0f && linePos.y >= 0.0f && linePos.x < d2.width()-1 && linePos.y < d2.height()-1) ? readCost(costs2, int(linePos.x+0.5f), int(linePos.y+0.5f), index) : PINF; + //writeCost(costs1, x, y, i, c1+c2); + + //if (index >= 0 && index < COR_STEPS && y == 200) printf("Index = %f\n", sub_pixel); + float cost = c1+c2; + + // Record the best result + bestStep = (cost < bestcost) ? i : bestStep; + bestScreen = (cost < bestcost) ? make_short2(linePos.x+0.5f,linePos.y+0.5f) : bestScreen; + //bestc2 = (cost < bestcost) ? c2 : bestc2; + bestcost = min(bestcost, cost); - //depthPos += depthM; - depthPos2 += depthM2; - linePos.x += 1.0f; - linePos.y += lineM; + // Move to next correspondence check point + depthPos2 += sub_pixel*depthM2; + linePos.x += sub_pixel*1.0f; + linePos.y += sub_pixel*lineM; } - //const float avgcolour = totalcolour/(float)count; - const float confidence = ((bestcolour / totalcolour) - (1.0f / 16.0f)) * (1.0f + (1.0f/16.0f)); - float bestadjust = float(bestStep-(COR_STEPS/2))*depthM; + float bestadjust = float(bestStep-(COR_STEPS/2))*depthM*sub_pixel; // Detect matches to boundaries, and discard those - uint stepMask = 1 << bestStep; - if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestweight = 0.0f; + //uint stepMask = 1 << bestStep; + //if ((stepMask & badMask) || (stepMask & (badMask << 1)) || (stepMask & (badMask >> 1))) bestcost = PINF; + + // TODO: Validate depth? + + // Check that no boundaries were matched in 4x4 block + //bestcost = ftl::cuda::halfWarpMax(bestcost); + + //float confidence = 1.0f - (bestcost2 / bestcost); - //bestadjust = halfWarpBest(bestadjust, (bestweight > 0.0f) ? confidence : 0.0f); - - //Mask m(mask.tex2D(x,y)); - - //if (bestweight > 0.0f) { - float old = conf.tex2D(x,y); - - if (bestweight > 0.0f) { - d1(x,y) = (0.4f*bestadjust) + depth1; - //d2(bestScreen.x, bestScreen.y) = bestdepth2; - //screenOut(x,y) = bestScreen; - conf(x,y) = max(old,confidence); //bestweight * confidence; - //conf(x,y) = max(old,fabs(bestadjust)); - } - //} - - // If a good enough match is found, mark dodgy depth as solid - //if ((m.isFilled() || m.isDiscontinuity()) && (bestweight > params.match_threshold)) mask(x,y) = 0; + // If a match was found then record it + if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < PINF) { + // Delay making the depth change until later. + + conf(x,y) = bestadjust; + //colour1(x,y) = make_uchar4(bestc2.x, bestc2.y, bestc2.z, 0.0f); + //mask(x,y) = mask(x,y) | Mask::kMask_Correspondence; + screenOut(x,y) = bestScreen; + } + + // Doesn't work + //if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost > 0.75f) { + // mask(x,y) = mask(x,y) | Mask::kMask_Bad; + //} } } +/** + * Non-epipolar census search. + */ +template <int RADIUS> +__global__ void corresponding_colour_kernel( + TextureObject<uchar4> colour1, + TextureObject<uchar4> colour2, + TextureObject<short2> screen, + float4x4 pose, + Camera cam1, + Camera cam2, ftl::cuda::MvMLSParams params) { + + const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); + const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y; + + if (x >= 0 && y >=0 && x < screen.width() && y < screen.height()) { + // generate census for camera1 + float4 c = colour1.tex2D(float(x+0.5f),float(y+0.5f)); + int3 cen1; + cen1.x = halfWarpCensus(c.x); + cen1.y = halfWarpCensus(c.y); + cen1.z = halfWarpCensus(c.z); + + short2 corPos = screen.tex2D(x,y); + if (ftl::cuda::halfWarpMin(int(corPos.x)) < 0) return; + + short2 bestScreen = corPos; + float bestcost = PINF; + + // For a perturbation around current correspondence.. + for (int v=-RADIUS; v<=RADIUS; ++v) { + for (int u=-RADIUS; u<=RADIUS; ++u) { + // generate census at that location + float4 c = colour2.tex2D(float(corPos.x+u), float(corPos.y+v)); + int3 cen2; + cen2.x = halfWarpCensus(c.x); + cen2.y = halfWarpCensus(c.y); + cen2.z = halfWarpCensus(c.z); + + // count bit differences + int count = __popc(cen1.x ^ cen2.x) + __popc(cen1.y ^ cen2.y) + __popc(cen1.z ^ cen2.z); + + // generate cost from bit diffs and amount of movement + float cost = float(count) / 48.0f; + cost += 1.0f - min(1.0f, sqrt(float(v*v) + float(u*u)) / float(3*RADIUS)); + bestScreen = (cost < bestcost) ? make_short2(corPos.x+u, corPos.y+v) : bestScreen; + bestcost = min(cost, bestcost); + } + } + + // Adjust correspondence to min cost location + screen(x,y) = bestScreen; + } +} + + void ftl::cuda::correspondence( - TextureObject<float> &d1, + TextureObject<uchar4> &c1, + TextureObject<uchar4> &c2, + TextureObject<float> &d1, TextureObject<float> &d2, - TextureObject<uchar4> &c1, - TextureObject<uchar4> &c2, - TextureObject<short2> &screen, + TextureObject<short2> &screen, TextureObject<float> &conf, - TextureObject<uint8_t> &mask, - float4x4 &pose2, - const Camera &cam1, - const Camera &cam2, const MvMLSParams ¶ms, int func, - cudaStream_t stream) { + //TextureObject<uint8_t> &mask, + //TextureObject<half> &costs, + float4x4 &pose2, + const Camera &cam1, + const Camera &cam2, const MvMLSParams ¶ms, + int radius, + cudaStream_t stream) { + + //const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + //const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + const dim3 gridSize((c1.width() + 8 - 1)/8, (c1.height() + 8 - 1)/8); + const dim3 blockSize(WARP_SIZE, 2); + + + switch (radius) { + case 32: corresponding_colour_epi_kernel<32><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + case 16: corresponding_colour_epi_kernel<16><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + case 8: corresponding_colour_epi_kernel<8><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + case 4: corresponding_colour_epi_kernel<4><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + case 3: corresponding_colour_epi_kernel<3><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + case 2: corresponding_colour_epi_kernel<2><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, screen, conf, pose2, cam1, cam2, params); break; + //case 1: corresponding_colour_epi_kernel<1><<<gridSize, blockSize, 0, stream>>>(c1, c2, d1, d2, costs, pose2, cam1, cam2, params); break; + } + + cudaSafeCall( cudaGetLastError() ); +} + +void ftl::cuda::aggregate_colour_costs( + TextureObject<float> &d1, + TextureObject<float> &d2, + TextureObject<half> &costs1, + TextureObject<half> &costs2, + TextureObject<short2> &screen, + TextureObject<float> &conf, + float4x4 &pose2, + const Camera &cam1, + const Camera &cam2, const MvMLSParams ¶ms, + int radius, + cudaStream_t stream) { //const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); //const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); @@ -263,14 +534,16 @@ void ftl::cuda::correspondence( const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(WARP_SIZE, 2); - - switch (func) { - case 32: corresponding_point_kernel<32,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break; - case 16: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break; - case 8: corresponding_point_kernel<8,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break; - case 4: corresponding_point_kernel<4,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break; - case 2: corresponding_point_kernel<2,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose2, cam1, cam2, params); break; - } - cudaSafeCall( cudaGetLastError() ); + switch (radius) { + case 32: aggregate_colour_costs_kernel<32><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + case 16: aggregate_colour_costs_kernel<16><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + case 8: aggregate_colour_costs_kernel<8><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + case 4: aggregate_colour_costs_kernel<4><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + case 3: aggregate_colour_costs_kernel<3><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + case 2: aggregate_colour_costs_kernel<2><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + //case 1: aggregate_colour_costs_kernel<1><<<gridSize, blockSize, 0, stream>>>(d1, d2, costs1, costs2, screen, conf, pose2, cam1, cam2, params); break; + } + + cudaSafeCall( cudaGetLastError() ); } diff --git a/components/operators/src/fusion/correspondence_common.hpp b/components/operators/src/fusion/correspondence_common.hpp index 6b9405f47..9f19ef711 100644 --- a/components/operators/src/fusion/correspondence_common.hpp +++ b/components/operators/src/fusion/correspondence_common.hpp @@ -14,10 +14,22 @@ __device__ inline bool isBadCor(float depth, const float2 &pos, const ftl::rgbd: depth >= cam.maxDepth || pos.x < 0.5f || pos.y < 0.5f || - pos.x >= cam.width-0.5f || - pos.y >= cam.height-0.5f; + pos.x >= float(cam.width)-0.5f || + pos.y >= float(cam.height)-0.5f; } __device__ inline float square(float v) { return v*v; } +__device__ inline float length2(const float4 v) { + return v.x*v.x + v.y*v.y + v.z*v.z; +} + +template <int BLOCKS> +__device__ inline int2 block4x4() { + return make_int2( + (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)), + blockIdx.y*4*BLOCKS + (threadIdx.x%16)/4 + 4*threadIdx.y + ); +} + #endif diff --git a/components/operators/src/fusion/correspondence_depth.cu b/components/operators/src/fusion/correspondence_depth.cu index 533152617..9f83fa79f 100644 --- a/components/operators/src/fusion/correspondence_depth.cu +++ b/components/operators/src/fusion/correspondence_depth.cu @@ -23,17 +23,16 @@ __global__ void corresponding_depth_kernel( float4x4 pose, Camera cam1, Camera cam2, ftl::cuda::MvMLSParams params) { - - //const int tid = (threadIdx.x + threadIdx.y * blockDim.x); - const int x = (blockIdx.x*8 + (threadIdx.x%4) + 4*(threadIdx.x / 16)); // / WARP_SIZE; - const int y = blockIdx.y*8 + threadIdx.x/4 + 4*threadIdx.y; + + const int2 pt = block4x4<2>(); + // FIXME: Don't return like this due to warp operations - if (x >= 0 && y >=0 && x < screenOut.width() && y < screenOut.height()) { - screenOut(x,y) = make_short2(-1,-1); - conf(x,y) = 0.0f; + if (pt.x >= 0 && pt.y >=0 && pt.x < screenOut.width() && pt.y < screenOut.height()) { + screenOut(pt) = make_short2(-1,-1); + conf(pt) = 0.0f; - const float depth1 = d1.tex2D(x,y); + const float depth1 = d1.tex2D(pt); // TODO: If all depths within a half warp here are bad then can return // early. //if (__ballot_sync(FULL_MASK, depth1 > cam1.minDepth && depth1 < cam1.maxDepth) & ((threadIdx.x/16 == 1) ? 0xFFFF0000 : 0xFFFF) == 0) return; @@ -41,9 +40,9 @@ __global__ void corresponding_depth_kernel( short2 bestScreen = make_short2(-1,-1); float bestcost = PINF; - const float3 camPosOrigin = pose * cam1.screenToCam(x,y,depth1); + const float3 camPosOrigin = pose * cam1.screenToCam(pt.x,pt.y,depth1); const float2 lineOrigin = cam2.camToScreen<float2>(camPosOrigin); - const float3 camPosDistant = pose * cam1.screenToCam(x,y,depth1 + 0.4f); + const float3 camPosDistant = pose * cam1.screenToCam(pt.x,pt.y,depth1 + 0.4f); const float2 lineDistant = cam2.camToScreen<float2>(camPosDistant); const float lineM = (lineDistant.y - lineOrigin.y) / (lineDistant.x - lineOrigin.x); const float depthM = 0.4f / (lineDistant.x - lineOrigin.x); @@ -86,7 +85,8 @@ __global__ void corresponding_depth_kernel( // the greater they are from the warp concensus best cost // Sum all squared distance errors in a 4x4 pixel neighborhood - cost = ftl::cuda::halfWarpSum(cost); + float wcost = ftl::cuda::halfWarpSum(cost) / 16.0f; + cost = (wcost < 1.0f) ? (wcost + cost) / 2.0f : 1.0f; // Enables per pixel adjustments // Record the best result bestStep = (cost < bestcost) ? i : bestStep; @@ -100,7 +100,11 @@ __global__ void corresponding_depth_kernel( linePos.y += lineM; } - float bestadjust = float(bestStep-(COR_STEPS/2))*depthM; + float bestadjust = float(bestStep-(COR_STEPS/2))*depthM; + + //if (bestStep == 0 ||bestStep == COR_STEPS-1) { + // printf("Bad step: %f\n", bestcost); + //} // Detect matches to boundaries, and discard those uint stepMask = 1 << bestStep; @@ -110,11 +114,11 @@ __global__ void corresponding_depth_kernel( bestcost = ftl::cuda::halfWarpMax(bestcost); // If a match was found then record it - if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < PINF) { + if (depth1 > cam1.minDepth && depth1 < cam1.maxDepth && bestcost < 1.0f) { // Delay making the depth change until later. - conf(x,y) = bestadjust; - mask(x,y) = mask(x,y) | Mask::kMask_Correspondence; - screenOut(x,y) = bestScreen; + conf(pt) = bestadjust; + mask(pt) = mask(pt) | Mask::kMask_Correspondence; + screenOut(pt) = bestScreen; } } } @@ -131,7 +135,10 @@ void ftl::cuda::correspondence( cudaStream_t stream) { // FIXME: Check the grid size makes sense - const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + //const dim3 gridSize((d1.width() + 1), (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + //const dim3 blockSize(WARP_SIZE, 2); + + const dim3 gridSize((d1.width() + 8 - 1)/8, (d1.height() + 8 - 1)/8); const dim3 blockSize(WARP_SIZE, 2); switch (func) { diff --git a/components/operators/src/fusion/correspondence_util.cu b/components/operators/src/fusion/correspondence_util.cu index ca1b8a82a..de91eabe6 100644 --- a/components/operators/src/fusion/correspondence_util.cu +++ b/components/operators/src/fusion/correspondence_util.cu @@ -44,7 +44,8 @@ void ftl::cuda::zero_confidence(TextureObject<float> &conf, TextureObject<float> __global__ void show_cor_error_kernel( TextureObject<uchar4> colour, TextureObject<short2> screen1, - TextureObject<short2> screen2) { + TextureObject<short2> screen2, + float thresh) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; @@ -56,18 +57,51 @@ __global__ void show_cor_error_kernel( short2 s2 = screen2.tex2D(s1.x, s1.y); float l = sqrt(square(s2.x-x) + square(s2.y-y)); - float nl = min(1.0f, l/5.0f); + float nl = min(1.0f, l/thresh); //colour(x,y) = (l < 1.0f) ? make_uchar4(0,255,0,0) : (s2.x < 0) ? make_uchar4(255.0f, 0.0f, 0.0f, 0.0f) : make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f); if (nl < 1.0f && s2.x >= 0) colour(x,y) = make_uchar4(0.0f, (1.0f-nl)*255.0f, nl*255.0f, 0.0f); } } } -void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<short2> &screen1, TextureObject<short2> &screen2, cudaStream_t stream) { +void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<short2> &screen1, TextureObject<short2> &screen2, float thresh, cudaStream_t stream) { const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - show_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen1, screen2); + show_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen1, screen2, thresh); + cudaSafeCall( cudaGetLastError() ); +} + +// ==== Remove correspondence errors =========================================== + +__global__ void remove_cor_error_kernel( + TextureObject<float> adjust, + TextureObject<short2> screen1, + TextureObject<short2> screen2, float thresh) { + + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < adjust.width() && y < adjust.height()) { + short2 s1 = screen1.tex2D(x,y); + + if (s1.x >= 0 && s1.x < screen2.width() && s1.y < screen2.height()) { + short2 s2 = screen2.tex2D(s1.x, s1.y); + + float l = sqrt(square(s2.x-x) + square(s2.y-y)); + if (l >= thresh || s2.x < 0) { + adjust(x,y) = PINF; + screen1(x,y) = make_short2(-1,-1); + } + } + } +} + +void ftl::cuda::remove_cor_error(TextureObject<float> &adjust, TextureObject<short2> &screen1, TextureObject<short2> &screen2, float thresh, cudaStream_t stream) { + const dim3 gridSize((adjust.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (adjust.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + remove_cor_error_kernel<<<gridSize, blockSize, 0, stream>>>(adjust, screen1, screen2, thresh); cudaSafeCall( cudaGetLastError() ); } @@ -76,25 +110,79 @@ void ftl::cuda::show_cor_error(TextureObject<uchar4> &colour, TextureObject<shor __global__ void show_depth_adjust_kernel( TextureObject<uchar4> colour, - TextureObject<float> adjust) { + TextureObject<short2> screen, + TextureObject<float> adjust, float scale) { const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; if (x < colour.width() && y < colour.height()) { float a = adjust.tex2D(x,y); + short2 s = screen.tex2D(x,y); - if (a != 0.0f) { - float nc = min(1.0f, fabsf(a)/0.04f); - colour(x,y) = make_uchar4(0.0f, (1.0f-nc)*255.0f, nc*255.0f, 0.0f); + if (s.x >= 0) { + float ncG = min(1.0f, fabsf(a)/scale); + float ncB = -max(-1.0f, min(0.0f, a/scale)); + float ncR = max(0.0f, min(1.0f, a/scale)); + colour(x,y) = make_uchar4(ncB*255.0f, (1.0f-ncG)*255.0f, ncR*255.0f, 0.0f); } } } -void ftl::cuda::show_depth_adjustment(TextureObject<uchar4> &colour, TextureObject<float> &adjust, cudaStream_t stream) { +void ftl::cuda::show_depth_adjustment(TextureObject<uchar4> &colour, TextureObject<short2> &screen, TextureObject<float> &adjust, float scale, cudaStream_t stream) { const dim3 gridSize((colour.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); - show_depth_adjust_kernel<<<gridSize, blockSize, 0, stream>>>(colour, adjust); + show_depth_adjust_kernel<<<gridSize, blockSize, 0, stream>>>(colour, screen, adjust, scale); + cudaSafeCall( cudaGetLastError() ); +} + +// ==== Vis reprojection ======================================================= + +__global__ void viz_reprojection_kernel( + TextureObject<uchar4> colour_out, + TextureObject<uchar4> colour_in, + TextureObject<float> depth_out, + TextureObject<float> depth_in, + float4x4 pose, + Camera cam1, + Camera cam2) { + + const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + if (x < colour_in.width() && y < colour_in.height()) { + float d = depth_in.tex2D(x,y); + const float3 camPosOrigin = pose * cam1.screenToCam(x,y,d); + const uint2 p = cam2.camToScreen<uint2>(camPosOrigin); + + if (p.x < colour_out.width() && p.y < colour_out.height()) { + float dout = depth_out(p.x, p.y); + uchar4 cin = colour_in(x,y); + uchar4 cout = colour_out(p.x, p.y); + + if (fabs(dout-camPosOrigin.z) < 0.1f) { + colour_out(p.x, p.y) = make_uchar4( + (int(cin.x)+int(cout.x)) / 2, + (int(cin.y)+int(cout.y)) / 2, + (int(cin.z)+int(cout.z)) / 2, + 0); + } + } + } +} + +void ftl::cuda::viz_reprojection( + TextureObject<uchar4> &colour_out, + TextureObject<uchar4> &colour_in, + TextureObject<float> &depth_out, + TextureObject<float> &depth_in, + const float4x4 &pose, + const Camera &cam1, + const Camera &cam2, cudaStream_t stream) { + const dim3 gridSize((colour_in.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (colour_in.height() + T_PER_BLOCK - 1)/T_PER_BLOCK); + const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK); + + viz_reprojection_kernel<<<gridSize, blockSize, 0, stream>>>(colour_out, colour_in, depth_out, depth_in, pose, cam1, cam2); cudaSafeCall( cudaGetLastError() ); } diff --git a/components/operators/src/fusion/mvmls.cpp b/components/operators/src/fusion/mvmls.cpp index 7d7ef03d9..55b992957 100644 --- a/components/operators/src/fusion/mvmls.cpp +++ b/components/operators/src/fusion/mvmls.cpp @@ -2,6 +2,7 @@ #include "smoothing_cuda.hpp" #include <ftl/utility/matrix_conversion.hpp> #include "mvmls_cuda.hpp" +#include <ftl/cuda/normals.hpp> #include <opencv2/cudaarithm.hpp> @@ -32,12 +33,16 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda int win = config()->value("window_size",16); bool do_corr = config()->value("merge_corresponding", true); bool do_aggr = config()->value("merge_mls", false); + bool do_colour_adjust = config()->value("apply_colour_adjust", false); bool cull_zero = config()->value("cull_no_confidence", false); //bool show_best_source = config()->value("show_pixel_source", false); ftl::cuda::MvMLSParams params; params.colour_smooth = config()->value("colour_smooth", 150.0f); params.spatial_smooth = config()->value("spatial_smooth", 1.0f); + params.sub_pixel = config()->value("sub_pixel", 0.5f); + params.P1 = config()->value("P1", 0.1f); + params.P2 = config()->value("P2", 0.2f); bool show_consistency = config()->value("show_consistency", false); bool show_adjustment = config()->value("show_adjustment", false); @@ -142,19 +147,143 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda stream ); - if (show_consistency) { - ftl::cuda::show_cor_error(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f2.getTexture<short2>(Channel::Screen), stream); - ftl::cuda::show_cor_error(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f1.getTexture<short2>(Channel::Screen), stream); - } + /*ftl::cuda::remove_cor_error( + f1.getTexture<float>(Channel::Confidence), + f1.getTexture<short2>(Channel::Screen), + f2.getTexture<short2>(Channel::Screen), + 2.0f, + stream); - if (show_adjustment) { - ftl::cuda::show_depth_adjustment(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<float>(Channel::Confidence), stream); - ftl::cuda::show_depth_adjustment(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<float>(Channel::Confidence), stream); - } + ftl::cuda::remove_cor_error( + f2.getTexture<float>(Channel::Confidence), + f2.getTexture<short2>(Channel::Screen), + f1.getTexture<short2>(Channel::Screen), + 2.0f, + stream);*/ // Actually perform the adjustment cv::cuda::add(f1.get<GpuMat>(Channel::Depth), f1.get<GpuMat>(Channel::Confidence), f1.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream); cv::cuda::add(f2.get<GpuMat>(Channel::Depth), f2.get<GpuMat>(Channel::Confidence), f2.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream); + + /*ftl::cuda::viz_reprojection( + f1.getTexture<uchar4>(Channel::Colour), + f2.getTexture<uchar4>(Channel::Colour), + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + pose, + f2.getLeftCamera(), + f1.getLeftCamera(), + stream + ); + ftl::cuda::viz_reprojection( + f2.getTexture<uchar4>(Channel::Colour), + f1.getTexture<uchar4>(Channel::Colour), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<float>(Channel::Depth), + pose2, + f1.getLeftCamera(), + f2.getLeftCamera(), + stream + );*/ + //continue; + + //auto &g1 = f1.get<GpuMat>(Channel::Depth); + //costs_[0].create(g1.cols/4, (g1.rows/4) * 16); + //auto &g2 = f2.get<GpuMat>(Channel::Depth); + //costs_[1].create(g2.cols/4, (g2.rows/4) * 16); + + /*ftl::cuda::correspondence( + f1.getTexture<uchar4>(Channel::Colour), + f2.getTexture<uchar4>(Channel::Colour), + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<short2>(Channel::Screen), + f1.getTexture<float>(Channel::Confidence), + //f1.getTexture<uint8_t>(Channel::Mask), + //costs_[0], + pose2, + f1.getLeftCamera(), + f2.getLeftCamera(), + params, + 16, + stream + ); + ftl::cuda::correspondence( + f2.getTexture<uchar4>(Channel::Colour), + f1.getTexture<uchar4>(Channel::Colour), + f2.getTexture<float>(Channel::Depth), + f1.getTexture<float>(Channel::Depth), + f2.getTexture<short2>(Channel::Screen), + f2.getTexture<float>(Channel::Confidence), + //f2.getTexture<uint8_t>(Channel::Mask), + //costs_[1], + pose, + f2.getLeftCamera(), + f1.getLeftCamera(), + params, + 16, + stream + );*/ + + /*ftl::cuda::aggregate_colour_costs( + f1.getTexture<float>(Channel::Depth), + f2.getTexture<float>(Channel::Depth), + costs_[0], + costs_[1], + f1.getTexture<short2>(Channel::Screen), + f1.getTexture<float>(Channel::Confidence), + pose2, + f1.getLeftCamera(), + f2.getLeftCamera(), + params, + 16, + stream + ); + + ftl::cuda::aggregate_colour_costs( + f2.getTexture<float>(Channel::Depth), + f1.getTexture<float>(Channel::Depth), + costs_[1], + costs_[0], + f2.getTexture<short2>(Channel::Screen), + f2.getTexture<float>(Channel::Confidence), + pose, + f2.getLeftCamera(), + f1.getLeftCamera(), + params, + 16, + stream + );*/ + + if (show_consistency) { + ftl::cuda::show_cor_error(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f2.getTexture<short2>(Channel::Screen), 5.0f, stream); + ftl::cuda::show_cor_error(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f1.getTexture<short2>(Channel::Screen), 5.0f, stream); + } + + /*ftl::cuda::remove_cor_error( + f1.getTexture<float>(Channel::Confidence), + f1.getTexture<short2>(Channel::Screen), + f2.getTexture<short2>(Channel::Screen), + 5.0f, + stream); + + ftl::cuda::remove_cor_error( + f2.getTexture<float>(Channel::Confidence), + f2.getTexture<short2>(Channel::Screen), + f1.getTexture<short2>(Channel::Screen), + 5.0f, + stream);*/ + + // Actually perform the colour adjustment + //if (do_colour_adjust) { + // cv::cuda::add(f1.get<GpuMat>(Channel::Depth), f1.get<GpuMat>(Channel::Confidence), f1.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream); + // cv::cuda::add(f2.get<GpuMat>(Channel::Depth), f2.get<GpuMat>(Channel::Confidence), f2.get<GpuMat>(Channel::Depth), cv::noArray(), -1, cvstream); + //} + + if (show_adjustment) { + ftl::cuda::show_depth_adjustment(f1.getTexture<uchar4>(Channel::Colour), f1.getTexture<short2>(Channel::Screen), f1.getTexture<float>(Channel::Confidence), 0.04f, stream); + ftl::cuda::show_depth_adjustment(f2.getTexture<uchar4>(Channel::Colour), f2.getTexture<short2>(Channel::Screen), f2.getTexture<float>(Channel::Confidence), 0.04f, stream); + } //} //else { /*ftl::cuda::correspondence( f1.getTexture<float>(Channel::Depth), @@ -207,6 +336,16 @@ bool MultiViewMLS::apply(ftl::rgbd::FrameSet &in, ftl::rgbd::FrameSet &out, cuda // Reduce window size for next iteration //win = max(win>>1, 4); + + // Redo normals + for (size_t i=0; i<in.frames.size(); ++i) { + auto &f = in.frames[i]; + ftl::cuda::normals( + f.getTexture<half4>(Channel::Normals), + f.getTexture<float>(Channel::Depth), + f.getLeftCamera(), stream + ); + } } for (int iter=0; iter<iters; ++iter) { diff --git a/components/operators/src/fusion/mvmls_cuda.hpp b/components/operators/src/fusion/mvmls_cuda.hpp index 4ee589e0d..4571e2df4 100644 --- a/components/operators/src/fusion/mvmls_cuda.hpp +++ b/components/operators/src/fusion/mvmls_cuda.hpp @@ -11,6 +11,9 @@ namespace cuda { struct MvMLSParams { float spatial_smooth; float colour_smooth; + float sub_pixel; + float P1; + float P2; }; void correspondence( @@ -26,6 +29,20 @@ void correspondence( const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams ¶ms, int func, cudaStream_t stream); +void correspondence( + ftl::cuda::TextureObject<uchar4> &c1, + ftl::cuda::TextureObject<uchar4> &c2, + ftl::cuda::TextureObject<float> &d1, + ftl::cuda::TextureObject<float> &d2, + ftl::cuda::TextureObject<short2> &screen, + ftl::cuda::TextureObject<float> &conf, + //ftl::cuda::TextureObject<uint8_t> &mask, + //ftl::cuda::TextureObject<half> &costs, + float4x4 &pose, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams ¶ms, int radius, + cudaStream_t stream); + void correspondence( ftl::cuda::TextureObject<float> &d1, ftl::cuda::TextureObject<float> &d2, @@ -37,6 +54,18 @@ void correspondence( const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams ¶ms, int func, cudaStream_t stream); +void aggregate_colour_costs( + ftl::cuda::TextureObject<float> &d1, + ftl::cuda::TextureObject<float> &d2, + ftl::cuda::TextureObject<half> &costs1, + ftl::cuda::TextureObject<half> &costs2, + ftl::cuda::TextureObject<short2> &screen, + ftl::cuda::TextureObject<float> &conf, + float4x4 &pose, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, const ftl::cuda::MvMLSParams ¶ms, int radius, + cudaStream_t stream); + void zero_confidence( ftl::cuda::TextureObject<float> &conf, ftl::cuda::TextureObject<float> &depth, @@ -101,13 +130,32 @@ void show_cor_error( ftl::cuda::TextureObject<uchar4> &colour, ftl::cuda::TextureObject<short2> &screen1, ftl::cuda::TextureObject<short2> &screen2, + float thresh, + cudaStream_t stream); + +void remove_cor_error( + ftl::cuda::TextureObject<float> &adjust, + ftl::cuda::TextureObject<short2> &screen1, + ftl::cuda::TextureObject<short2> &screen2, + float thresh, cudaStream_t stream); void show_depth_adjustment( ftl::cuda::TextureObject<uchar4> &colour, + ftl::cuda::TextureObject<short2> &screen, ftl::cuda::TextureObject<float> &adjust, + float scale, cudaStream_t stream); +void viz_reprojection( + ftl::cuda::TextureObject<uchar4> &colour_out, + ftl::cuda::TextureObject<uchar4> &colour_in, + ftl::cuda::TextureObject<float> &depth_out, + ftl::cuda::TextureObject<float> &depth_in, + const float4x4 &pose, + const ftl::rgbd::Camera &cam1, + const ftl::rgbd::Camera &cam2, cudaStream_t stream); + } } -- GitLab