#include <ftl/operators/cuda/mls/multi_intensity.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>
#include <ftl/cuda/weighting.hpp>

using ftl::cuda::MLSMultiIntensity;
using cv::cuda::GpuMat;

// ==== Multi image MLS ========================================================

__device__ inline float featureWeight(int f1, int f2) {
	const float w = (1.0f - (float(abs(f1 - f2)) / 255.0f));
	return w*w*w;
}

// Distance measure that penalises lateral (x,y) offsets twice as heavily as
// depth (z) offsets.
__device__ inline float biasedLength(const float3 &Xi, const float3 &X) {
	float l = 0.0f;
	const float dx = Xi.x-X.x;
	l += 2.0f*dx*dx;
	const float dy = Xi.y-X.y;
	l += 2.0f*dy*dy;
	const float dz = Xi.z-X.z;
	l += dz*dz;
	return sqrt(l);
}

/*
 * Gather points for Moving Least Squares, from each source image.
 */
template <int SEARCH_RADIUS, typename T>
__global__ void mls_gather_intensity_kernel(
	const half4* __restrict__ normals_in,
	half4* __restrict__ normals_out,
	const float* __restrict__ depth_origin,
	const float* __restrict__ depth_in,
	const T* __restrict__ feature_origin,
	const T* __restrict__ feature_in,
	float4* __restrict__ centroid_out,
	float* __restrict__ contrib_out,
	float smoothing,
	float fsmoothing,
	float4x4 o_2_in,
	float4x4 in_2_o,
	float3x3 in_2_o33,
	ftl::rgbd::Camera camera_origin,
	ftl::rgbd::Camera camera_in,
	int npitch_out,
	int cpitch_out,
	int wpitch_out,
	int dpitch_o,
	int dpitch_i,
	int npitch_in,
	int fpitch_o,
	int fpitch_i
) {
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	if (x < 0 || y < 0 || x >= camera_origin.width || y >= camera_origin.height) return;

	// Load the values accumulated so far for this pixel
	float3 nX = make_float3(normals_out[y*npitch_out+x]);
	float3 aX = make_float3(centroid_out[y*cpitch_out+x]);
	float contrib = contrib_out[y*wpitch_out+x];

	float d0 = depth_origin[x+y*dpitch_o];
	if (d0 <= camera_origin.minDepth || d0 >= camera_origin.maxDepth) return;

	const uchar2 feature1 = feature_origin[x+y*fpitch_o];

	// TODO: Could the origin depth actually be averaged with the depth in the
	// other image, so as not to bias towards the original view?

	float3 X = camera_origin.screenToCam((int)(x),(int)(y),d0);
	const float3 camPos = o_2_in * X;
	const int2 s = camera_in.camToScreen<int2>(camPos);

	// Move point off of original surface
	//X = camera_origin.screenToCam((int)(x),(int)(y),d0-0.005f);

	// TODO: Could dynamically adjust the smoothing factors depending upon the
	// number of matches. Meaning, if there are many good local and feature
	// matches, be less likely to include poorer matches. Conversely, if there
	// are only poor non-local or feature-distant matches, increase the search
	// range. Could also adapt the smoothing parameters using variance or some
	// other local image measure, or simply the distance between the centrally
	// projected points as an indication of misalignment. Both spatial distance
	// and feature distance could be used to adjust parameters.

	// FIXME: For own image, need to do something different than the below.
	// Otherwise smoothing factors become 0.
	float spatial_smoothing = (depth_origin == depth_in) ? 0.005f : 0.03f;  // 3cm default
	float hf_intensity_smoothing = (depth_origin == depth_in) ? 100.0f : 50.0f;
	float mean_smoothing = 100.0f;  // Same value for own and other images
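
	// The bandwidths above are only seeds; when a valid cross-view
	// correspondence exists they are tightened below, roughly as:
	//
	//   spatial_smoothing      <- min(seed, smoothing * |camPos.z - d|)
	//   hf_intensity_smoothing <- fsmoothing * |feature1.x - feature2.x|
	//
	// so well-aligned, photometrically consistent pixels get a narrow kernel
	// that only accepts nearby, similar neighbours.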
	if (depth_origin != depth_in && s.x >= 0 && s.x < camera_in.width && s.y >= 0 && s.y < camera_in.height) {
		// Get depth at exact reprojection point
		const float d = depth_in[s.x+s.y*dpitch_i];
		// Get feature at exact reprojection point
		const uchar2 feature2 = feature_in[s.x+s.y*fpitch_i];

		if (d > camera_in.minDepth && d < camera_in.maxDepth) {
			spatial_smoothing = min(spatial_smoothing, smoothing * fabsf(camPos.z - d));

			// Make the start point the average of the two sources, so the fit
			// is not biased towards the original view.
			const float3 reversePos = in_2_o * camera_in.screenToCam(s.x, s.y, d);
			X = (X + reversePos) / 2.0f;
		}

		hf_intensity_smoothing = fsmoothing * fabsf(float(feature2.x) - float(feature1.x));
		//mean_smoothing = smoothing * fabsf(float(feature2.y) - float(feature1.y));
	}

	// Make sure there is a minimum smoothing value
	spatial_smoothing = max(0.01f, spatial_smoothing);
	hf_intensity_smoothing = max(5.0f, hf_intensity_smoothing);
	//mean_smoothing = max(10.0f, mean_smoothing);

	// Check for neighbourhood symmetry and use to weight overall contribution
	float symx = 0.0f;
	float symy = 0.0f;

	// Neighbourhood
	for (int v=-SEARCH_RADIUS; v<=SEARCH_RADIUS; ++v) {
		for (int u=-SEARCH_RADIUS; u<=SEARCH_RADIUS; ++u) {
			const float d = (s.x+u >= 0 && s.x+u < camera_in.width && s.y+v >= 0 && s.y+v < camera_in.height)
				? depth_in[s.x+u+(s.y+v)*dpitch_i] : 0.0f;
			if (d <= camera_in.minDepth || d >= camera_in.maxDepth) continue;

			// Point and normal of neighbour
			const float3 Xi = in_2_o * camera_in.screenToCam(s.x+u, s.y+v, d);
			const float3 Ni = make_float3(normals_in[s.x+u+(s.y+v)*npitch_in]);
			const uchar2 feature2 = feature_in[s.x+u+(s.y+v)*fpitch_i];

			// Gauss approx weighting functions.
			// Rule: spatially close and feature close is strong; spatially far
			// or feature far is poor. The weights are therefore multiplied, so
			// a neighbour must be close in both space and feature value to
			// contribute strongly.
			const float w_high_int = ftl::cuda::weighting(float(abs(int(feature1.x) - int(feature2.x))), hf_intensity_smoothing);
			const float w_mean_int = ftl::cuda::weighting(float(abs(int(feature1.y) - int(feature2.y))), mean_smoothing);
			const float w_space = ftl::cuda::spatialWeighting(X, Xi, spatial_smoothing);
			//const float w_space = ftl::cuda::weighting(biasedLength(Xi,X), spatial_smoothing);
			// TODO: Distance from cam squared
			// TODO: Angle from cam (dot of normal and ray)
			//const float w_lateral = ftl::cuda::weighting(sqrt(Xi.x*X.x + Xi.y*X.y), float(SEARCH_RADIUS)*camera_origin.fx/Xi.z);

			const float w = (length(Ni) > 0.0f)
				? w_space * w_high_int * w_mean_int  // min(w_space, min(w_high_int, w_mean_int))
				: 0.0f;

			// Mark as a symmetry contribution
			if (w > 0.0f) {
				if (u < 0) symx -= 1.0f;
				else if (u > 0) symx += 1.0f;
				if (v < 0) symy -= 1.0f;
				else if (v > 0) symy += 1.0f;
			}

			aX += Xi*w;
			nX += (in_2_o33 * Ni)*w;
			contrib += w;
		}
	}

	// Perfect symmetry means symx and symy == 0, so their actual magnitude is
	// a measure of asymmetry; inverted, it can be used to weight the result.
	symx = fabsf(symx) / float(SEARCH_RADIUS);
	symy = fabsf(symy) / float(SEARCH_RADIUS);
	float l = max(0.0f, 1.0f - sqrt(symx*symx+symy*symy));  // Clamp so strong asymmetry cannot regain weight when squared
	l = l*l;

	normals_out[y*npitch_out+x] = make_half4(nX*l, 0.0f);
	centroid_out[y*cpitch_out+x] = make_float4(aX*l, 0.0f);
	contrib_out[y*wpitch_out+x] = contrib*l;
}
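
// The reduce step below implements the standard MLS projection: given the
// weighted centroid a = sum(w_i * X_i) / sum(w_i) and the weighted average
// normal n = sum(w_i * N_i) / sum(w_i) accumulated above, the signed distance
// of the current point X from the local plane is
//
//   f(X) = n . (X - a)
//
// and the point is projected back onto the surface estimate as
// X' = X - n * f(X). Only the z component of X' is kept as the refined depth
// for this pixel.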
/**
 * Convert accumulated values into estimate of depth and normals at pixel.
 */
__global__ void mls_reduce_kernel_2(
	const float4* __restrict__ centroid,
	const half4* __restrict__ normals,
	const float* __restrict__ contrib_out,
	half4* __restrict__ normals_out,
	float* __restrict__ depth,
	ftl::rgbd::Camera camera,
	int npitch_in,
	int cpitch_in,
	int wpitch,
	int npitch,
	int dpitch
) {
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	if (x >= 0 && y >= 0 && x < camera.width && y < camera.height) {
		float3 nX = make_float3(normals[y*npitch_in+x]);
		float3 aX = make_float3(centroid[y*cpitch_in+x]);
		float contrib = contrib_out[y*wpitch+x];

		//depth[x+y*dpitch] = X.z;
		//normals_out[x+y*npitch] = make_half4(0.0f, 0.0f, 0.0f, 0.0f);

		float d0 = depth[x+y*dpitch];
		//depth[x+y*dpitch] = 0.0f;
		if (d0 <= camera.minDepth || d0 >= camera.maxDepth || contrib == 0.0f) return;

		float3 X = camera.screenToCam((int)(x),(int)(y),d0);

		nX /= contrib;  // Weighted average normal
		aX /= contrib;  // Weighted average point (centroid)

		// Signed-distance field function
		float fX = nX.x * (X.x - aX.x) + nX.y * (X.y - aX.y) + nX.z * (X.z - aX.z);

		// Calculate new point using SDF function to adjust depth (and position)
		X = X - nX * fX;

		depth[x+y*dpitch] = X.z;
		normals_out[x+y*npitch] = make_half4(nX / length(nX), 0.0f);
	}
}

MLSMultiIntensity::MLSMultiIntensity(int radius)
	: radius_(radius)
{
}

MLSMultiIntensity::~MLSMultiIntensity()
{
}

void MLSMultiIntensity::prime(
	const GpuMat &depth_prime,
	const GpuMat &intensity_prime,
	const ftl::rgbd::Camera &cam_prime,
	const float4x4 &pose_prime,
	cudaStream_t stream)
{
	depth_prime_ = depth_prime;
	intensity_prime_ = intensity_prime;
	cam_prime_ = cam_prime;
	pose_prime_ = pose_prime;

	centroid_accum_.create(depth_prime.size(), CV_32FC4);
	normal_accum_.create(depth_prime.size(), CV_16FC4);
	weight_accum_.create(depth_prime.size(), CV_32F);

	cv::cuda::Stream cvstream = cv::cuda::StreamAccessor::wrapStream(stream);

	// Reset buffers
	centroid_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
	weight_accum_.setTo(cv::Scalar(0), cvstream);
	normal_accum_.setTo(cv::Scalar(0,0,0,0), cvstream);
}

void MLSMultiIntensity::gatherPrime(float smoothing, cudaStream_t stream)
{
	// Can use a simpler kernel without pose transformations
}

void MLSMultiIntensity::gather(
	const GpuMat &depth_src,
	const GpuMat &normals_src,
	const GpuMat &intensity_src,
	const ftl::rgbd::Camera &cam_src,
	const float4x4 &pose_src,
	float smoothing,
	float fsmoothing,
	cudaStream_t stream)
{
	static constexpr int THREADS_X = 8;
	static constexpr int THREADS_Y = 8;

	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
	const dim3 blockSize(THREADS_X, THREADS_Y);

	float4x4 inv_pose_src = pose_src;
	inv_pose_src.invert();
	float4x4 o_2_in = inv_pose_src * pose_prime_;

	float4x4 inv_pose_prime = pose_prime_;
	inv_pose_prime.invert();
	float4x4 in_2_o = inv_pose_prime * pose_src;
	float3x3 in_2_o33 = inv_pose_prime.getFloat3x3() * pose_src.getFloat3x3();

	// TODO: Use radius_ to select the search radius; currently fixed at 3.
	mls_gather_intensity_kernel<3><<<gridSize, blockSize, 0, stream>>>(
		normals_src.ptr<half4>(),
		normal_accum_.ptr<half4>(),
		depth_prime_.ptr<float>(),
		depth_src.ptr<float>(),
		intensity_prime_.ptr<uchar2>(),
		intensity_src.ptr<uchar2>(),
		centroid_accum_.ptr<float4>(),
		weight_accum_.ptr<float>(),
		smoothing,
		fsmoothing,
		o_2_in,
		in_2_o,
		in_2_o33,
		cam_prime_,
		cam_src,
		normal_accum_.step1()/4,
		centroid_accum_.step1()/4,
		weight_accum_.step1(),
		depth_prime_.step1(),
		depth_src.step1(),
		normals_src.step1()/4,
		intensity_prime_.step1()/2,
		intensity_src.step1()/2
	);
	cudaSafeCall( cudaGetLastError() );
}
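
/*
 * Example usage (a minimal sketch; the variable names and smoothing values
 * here are illustrative only):
 *
 *   MLSMultiIntensity mls(3);
 *   mls.prime(depth0, feat0, cam0, pose0, stream);
 *   mls.gather(depth0, normals0, feat0, cam0, pose0, 0.01f, 50.0f, stream);  // own view
 *   mls.gather(depth1, normals1, feat1, cam1, pose1, 0.01f, 50.0f, stream);  // other views...
 *   mls.adjust(refined_depth, refined_normals, stream);
 *
 * Each featN image is the CV_8UC2 (high-frequency, mean) intensity pair
 * produced by ftl::cuda::mean_subtract at the end of this file.
 */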
void MLSMultiIntensity::adjust(
	GpuMat &depth_out,
	GpuMat &normals_out,
	cudaStream_t stream)
{
	static constexpr int THREADS_X = 8;
	static constexpr int THREADS_Y = 8;

	const dim3 gridSize((depth_prime_.cols + THREADS_X - 1)/THREADS_X, (depth_prime_.rows + THREADS_Y - 1)/THREADS_Y);
	const dim3 blockSize(THREADS_X, THREADS_Y);

	normals_out.create(depth_prime_.size(), CV_16FC4);
	depth_out.create(depth_prime_.size(), CV_32F);

	// FIXME: Depth prime assumed to be same as depth out
	mls_reduce_kernel_2<<<gridSize, blockSize, 0, stream>>>(
		centroid_accum_.ptr<float4>(),
		normal_accum_.ptr<half4>(),
		weight_accum_.ptr<float>(),
		normals_out.ptr<half4>(),
		depth_prime_.ptr<float>(),
		cam_prime_,
		normal_accum_.step1()/4,
		centroid_accum_.step1()/4,
		weight_accum_.step1(),
		normals_out.step1()/4,
		depth_prime_.step1()
	);
	cudaSafeCall( cudaGetLastError() );
}

// =============================================================================

template <int RADIUS>
__global__ void mean_subtract_kernel(
	const uchar* __restrict__ intensity,
	uchar2* __restrict__ contrast,
	int pitch,
	int cpitch,
	int width,
	int height
) {
	const int x = blockIdx.x*blockDim.x + threadIdx.x;
	const int y = blockIdx.y*blockDim.y + threadIdx.y;

	// Note: a border of RADIUS pixels is left unwritten.
	if (x >= RADIUS && y >= RADIUS && x < width-RADIUS && y < height-RADIUS) {
		float mean = 0.0f;

		for (int v=-RADIUS; v<=RADIUS; ++v) {
			for (int u=-RADIUS; u<=RADIUS; ++u) {
				mean += float(intensity[x+u+(y+v)*pitch]);
			}
		}

		mean /= float((2*RADIUS+1)*(2*RADIUS+1));

		// Store the mean-subtracted intensity (offset to stay unsigned) and
		// the local mean itself as a uchar2 feature pair.
		float diff = float(intensity[x+y*pitch]) - mean;
		contrast[x+y*cpitch] = make_uchar2(max(0, min(254, int(diff)+127)), int(mean));
	}
}

void ftl::cuda::mean_subtract(
	const cv::cuda::GpuMat &intensity,
	cv::cuda::GpuMat &contrast,
	int radius,
	cudaStream_t stream
) {
	static constexpr int THREADS_X = 8;
	static constexpr int THREADS_Y = 8;

	const dim3 gridSize((intensity.cols + THREADS_X - 1)/THREADS_X, (intensity.rows + THREADS_Y - 1)/THREADS_Y);
	const dim3 blockSize(THREADS_X, THREADS_Y);

	contrast.create(intensity.size(), CV_8UC2);

	// Dispatch on the requested radius; only these sizes are instantiated.
	switch (radius) {
	case 1: mean_subtract_kernel<1><<<gridSize, blockSize, 0, stream>>>(intensity.ptr<uchar>(), contrast.ptr<uchar2>(), intensity.step1(), contrast.step1()/2, intensity.cols, intensity.rows); break;
	case 2: mean_subtract_kernel<2><<<gridSize, blockSize, 0, stream>>>(intensity.ptr<uchar>(), contrast.ptr<uchar2>(), intensity.step1(), contrast.step1()/2, intensity.cols, intensity.rows); break;
	default: mean_subtract_kernel<3><<<gridSize, blockSize, 0, stream>>>(intensity.ptr<uchar>(), contrast.ptr<uchar2>(), intensity.step1(), contrast.step1()/2, intensity.cols, intensity.rows); break;
	}
	cudaSafeCall( cudaGetLastError() );
}
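
/*
 * Example usage (a minimal sketch; `rgb` is assumed to be an 8-bit BGR GpuMat
 * and `cvstream` the cv::cuda::Stream wrapping `stream`):
 *
 *   cv::cuda::GpuMat gray, features;
 *   cv::cuda::cvtColor(rgb, gray, cv::COLOR_BGR2GRAY, 0, cvstream);
 *   ftl::cuda::mean_subtract(gray, features, 3, stream);
 *
 * The resulting CV_8UC2 image packs, per pixel, the mean-subtracted
 * (high-frequency) intensity offset by 127 in channel 0 and the local mean in
 * channel 1, matching the uchar2 features consumed by the gather kernel above.
 */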