diff --git a/lib/libstereo/src/algorithms/clustersf.cu b/lib/libstereo/src/algorithms/clustersf.cu index 454a42b3cb3db9b5f579160b3255c2b638dee260..db488112343fb0c76b0d15af0487b266b1340586 100644 --- a/lib/libstereo/src/algorithms/clustersf.cu +++ b/lib/libstereo/src/algorithms/clustersf.cu @@ -33,10 +33,11 @@ void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disp mat2gray(r, impl_->r); SalientGradient sg = {impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->l.width, impl_->l.height}; - parallel1DWarp(sg, l.rows(), l.cols()); + parallel1DWarpSM(sg, l.rows(), l.cols()); cv::Mat tmp; impl_->gl.toGpuMat().download(tmp); + cv::resize(tmp,tmp, cv::Size(tmp.cols/2, tmp.rows/2)); cv::imshow("Gradients", tmp); cudaSafeCall(cudaDeviceSynchronize()); diff --git a/lib/libstereo/src/filters/salient_gradient.hpp b/lib/libstereo/src/filters/salient_gradient.hpp index fd521d1742d886c2a26ce7552dd9473ea72e8d2d..093319518e75d51d5b220eb48709815be167199d 100644 --- a/lib/libstereo/src/filters/salient_gradient.hpp +++ b/lib/libstereo/src/filters/salient_gradient.hpp @@ -24,28 +24,55 @@ struct SalientGradient { return make_float2(g,a); } - __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) { + struct WarpSharedMemory { + int gradient_histogram[32]; + }; + + inline __device__ int scan(volatile int *s_Data, int thread, int threshold) { + for (uint offset = 1; offset < 32; offset <<= 1) { + __syncwarp(); + uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread]; + __syncwarp(); + s_Data[thread] = t; + } + + uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold); + return __ffs(t); + } + + __device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) { static const float PI = 3.14f; static constexpr float PI2 = PI/2.0f; for (int y=thread.y; y<size.y; y+=stride.y) { + // Reset histogram + //for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0; + wsm.gradient_histogram[thread.x] = 0; + for (int x=thread.x; x<size.x; x+=stride.x) { auto g = calculateGradient(x,y); - output(y,x) = uchar((g.y+PI2) / PI * 255.0f); + output(y,x) = uchar((g.y+PI2) / PI * 63.0f); magnitude(y,x) = uchar(g.x); + + //maxmag = warpMax(max(maxmag,int(g.x))); + atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1); + //atomicAdd(&wsm.gradient_histogram[0], 1); } __syncwarp(); + uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32)); + // Apply threshold for (int x=thread.x; x<size.x; x+=stride.x) { - uchar thresh = 0; - for (int u=-15; u<=15; ++u) { + uchar thresh = gthresh; + for (int u=-5; u<=5; ++u) { thresh = (x+u >= 0 && x+u < width) ? max(thresh, magnitude(y,x+u)-abs(u)) : thresh; } uchar m = magnitude(y,x); - if (m < thresh) output(y,x) = 0; + //if (m < max(thresh, uchar(float(maxmag)*0.2f))) output(y,x) = 0; + output(y,x) = (m < thresh) ? 0 : 255; } // Next step would be to bucket the results diff --git a/lib/libstereo/src/util.hpp b/lib/libstereo/src/util.hpp index 4c50f082b7f44dffd85634e8e6b1101193542fca..92d9c3ee97ba3a714b2eb2adb42d83e5ffc3d6c8 100644 --- a/lib/libstereo/src/util.hpp +++ b/lib/libstereo/src/util.hpp @@ -21,6 +21,15 @@ __device__ inline T warpMin(T e) { return e; } +template <typename T> +__device__ inline T warpMax(T e) { + for (int i = WARP_SIZE/2; i > 0; i /= 2) { + const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE); + e = max(e, other); + } + return e; +} + #ifdef USE_GPU template <typename FUNCTOR> @@ -30,6 +39,16 @@ __global__ void kernel2D(FUNCTOR f, ushort2 size) { f(thread, stride, size); } +template <typename FUNCTOR, int WARPS> +__global__ void kernel2DWarpSM(FUNCTOR f, ushort2 size) { + const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)}; + const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)}; + + __shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS]; + + f(thread, stride, size, sm[threadIdx.y]); +} + template <typename FUNCTOR> __global__ void kernel1D(FUNCTOR f, ushort size) { const ushort thread = threadIdx.x+blockIdx.x*blockDim.x; @@ -99,6 +118,27 @@ void parallel1DWarp(const FUNCTOR &f, int size, int size2) { #endif } +template <typename FUNCTOR> +void parallel1DWarpSM(const FUNCTOR &f, int size, int size2) { +#if __cplusplus >= 201703L + //static_assert(std::is_invocable<FUNCTOR,int,int,int>::value, "Parallel1D requires a valid functor: void()(int,int,int)"); + // Is this static_assert correct? + static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2>::value, "Parallel1D requires a valid functor: void()(ushort2,ushort2,ushort2)"); +#endif + +#ifdef USE_GPU + cudaSafeCall(cudaGetLastError()); + const dim3 gridSize(1, (size+8-1) / 8); + const dim3 blockSize(32, 8); + ushort2 tsize{ushort(size2), ushort(size)}; + kernel2DWarpSM<FUNCTOR, 8><<<gridSize, blockSize, 0, 0>>>(f,tsize); + cudaSafeCall(cudaGetLastError()); + //cudaSafeCall(cudaDeviceSynchronize()); +#else + +#endif +} + /** * Wrapper to initiate a parallel processing job using a given functor. The * width and height parameters are used to determine the number of threads.