Commit 5f2019db authored by Nicolas Pope

Histogram scan gradient filter

parent 7879a994
Merge request !347: Feature buckets experiment
@@ -33,10 +33,11 @@ void StereoCSF::compute(cv::InputArray l, cv::InputArray r, cv::OutputArray disp
	mat2gray(r, impl_->r);
	SalientGradient sg = {impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->l.width, impl_->l.height};
	parallel1DWarp(sg, l.rows(), l.cols());
	parallel1DWarpSM(sg, l.rows(), l.cols());

	cv::Mat tmp;
	impl_->gl.toGpuMat().download(tmp);
	cv::resize(tmp, tmp, cv::Size(tmp.cols/2, tmp.rows/2));
	cv::imshow("Gradients", tmp);
	cudaSafeCall(cudaDeviceSynchronize());
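
For context, a minimal host-side sketch (an assumption for illustration, not part of the commit) of what the debug path above amounts to: the 8-bit gradient image produced by the kernel is downloaded from the GPU buffer, halved in size, and shown in a window. The function name and the cv::cuda::GpuMat parameter are hypothetical.

	#include <opencv2/opencv.hpp>
	#include <opencv2/core/cuda.hpp>

	// Hypothetical standalone equivalent of the debug display above.
	void showGradients(const cv::cuda::GpuMat &gradients) {
		cv::Mat tmp;
		gradients.download(tmp);                                     // device -> host copy
		cv::resize(tmp, tmp, cv::Size(tmp.cols / 2, tmp.rows / 2)); // half resolution for display
		cv::imshow("Gradients", tmp);
	}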
@@ -24,28 +24,55 @@ struct SalientGradient {
		return make_float2(g,a);
	}

	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size) {

	struct WarpSharedMemory {
		int gradient_histogram[32];
	};

	// Warp-wide inclusive scan (Hillis-Steele) of the 32-bin histogram, followed by a
	// ballot to find the first bin whose cumulative count exceeds the threshold;
	// __ffs turns the ballot mask into a 1-based bin index.
	inline __device__ int scan(volatile int *s_Data, int thread, int threshold) {
		for (uint offset = 1; offset < 32; offset <<= 1) {
			__syncwarp();
			uint t = (thread >= offset) ? s_Data[thread] + s_Data[thread - offset] : s_Data[thread];
			__syncwarp();
			s_Data[thread] = t;
		}

		uint t = __ballot_sync(0xFFFFFFFF, s_Data[thread] > threshold);
		return __ffs(t);
	}
	__device__ void operator()(ushort2 thread, ushort2 stride, ushort2 size, WarpSharedMemory &wsm) {
		static const float PI = 3.14f;
		static constexpr float PI2 = PI/2.0f;

		// Each warp walks down the rows (thread.y, stride.y); its 32 lanes stride across the columns.
		for (int y=thread.y; y<size.y; y+=stride.y) {
			// Reset histogram (one bin per lane)
			//for (int i=thread.x; i < 32; i+=32) wsm.gradient_histogram[i] = 0;
			wsm.gradient_histogram[thread.x] = 0;

			for (int x=thread.x; x<size.x; x+=stride.x) {
				auto g = calculateGradient(x,y);
				output(y,x) = uchar((g.y+PI2) / PI * 255.0f);
				output(y,x) = uchar((g.y+PI2) / PI * 63.0f);
				magnitude(y,x) = uchar(g.x);

				//maxmag = warpMax(max(maxmag,int(g.x)));
				// Per-row histogram of gradient magnitudes, quantised to 32 bins
				// (361 appears to be the assumed maximum magnitude).
				atomicAdd(&wsm.gradient_histogram[min(31,int(g.x / 361.0f * 32.0f))], 1);
				//atomicAdd(&wsm.gradient_histogram[0], 1);
			}

			__syncwarp();

			// Row threshold at roughly the 95th percentile of gradient magnitude,
			// rescaled from a bin index back onto the 0-255 magnitude range.
			uchar gthresh = min(255, scan(wsm.gradient_histogram, thread.x, float(width)*0.95f) * (256/32));
			// Apply threshold: start from the row threshold and raise it near stronger
			// neighbouring gradients (magnitude minus distance), then keep only the
			// locally dominant pixels as a binary 0/255 mask.
			for (int x=thread.x; x<size.x; x+=stride.x) {
				uchar thresh = 0;
				for (int u=-15; u<=15; ++u) {
				uchar thresh = gthresh;
				for (int u=-5; u<=5; ++u) {
					thresh = (x+u >= 0 && x+u < width) ? max(thresh, magnitude(y,x+u)-abs(u)) : thresh;
				}

				uchar m = magnitude(y,x);
				if (m < thresh) output(y,x) = 0;
				//if (m < max(thresh, uchar(float(maxmag)*0.2f))) output(y,x) = 0;
				output(y,x) = (m < thresh) ? 0 : 255;
			}

			// Next step would be to bucket the results
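
The per-row logic above amounts to: build a 32-bin histogram of the row's gradient magnitudes, take roughly the 95th percentile as a base threshold, then suppress pixels whose magnitude falls below a locally raised version of that threshold. Below is a hedged CPU reference of the histogram/percentile step, purely illustrative and not part of the commit: the 361 ceiling, 32-bin layout and 95% quantile are taken from the kernel, while the function and variable names are invented.

	#include <algorithm>
	#include <cstdint>
	#include <vector>

	// Computes the row threshold that the scan()/__ballot_sync path produces on the GPU.
	uint8_t rowGradientThreshold(const std::vector<uint8_t> &magnitude_row) {
		int hist[32] = {0};
		for (uint8_t m : magnitude_row) {
			hist[std::min(31, int(float(m) / 361.0f * 32.0f))]++;   // same binning as the atomicAdd
		}

		// Inclusive prefix sum; the first bin whose cumulative count exceeds 95%
		// of the row width corresponds to the __ffs(__ballot_sync(...)) result.
		int cumulative = 0;
		const int target = int(float(magnitude_row.size()) * 0.95f);
		for (int b = 0; b < 32; ++b) {
			cumulative += hist[b];
			if (cumulative > target) return uint8_t(std::min(255, (b + 1) * (256 / 32)));
		}
		return 255;   // not reached for a non-empty row
	}

The subsequent ±5 loop then raises that base threshold near strong neighbours (neighbour magnitude minus distance), so the final 0/255 mask keeps only locally dominant gradients.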
@@ -21,6 +21,15 @@ __device__ inline T warpMin(T e) {
	return e;
}

// Warp-wide maximum via a butterfly (XOR shuffle) reduction: after log2(WARP_SIZE)
// steps every lane holds the maximum over all 32 lanes.
template <typename T>
__device__ inline T warpMax(T e) {
	for (int i = WARP_SIZE/2; i > 0; i /= 2) {
		const T other = __shfl_xor_sync(FULL_MASK, e, i, WARP_SIZE);
		e = max(e, other);
	}
	return e;
}
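
A small usage sketch of the warpMax helper above (an assumption for illustration, not from the repository; the kernel name is invented): because every lane of a full warp ends up with the same warp-wide maximum, any single lane can write it out.

	// Hypothetical kernel, launched with blocks of exactly 32 threads (one warp).
	__global__ void blockRowMax(const int *in, int *out) {
		int v = in[blockIdx.x * 32 + threadIdx.x];
		int m = warpMax(v);                           // butterfly reduction over the warp
		if (threadIdx.x == 0) out[blockIdx.x] = m;    // every lane holds m; lane 0 writes it
	}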
#ifdef USE_GPU
template <typename FUNCTOR>
@@ -30,6 +39,16 @@ __global__ void kernel2D(FUNCTOR f, ushort2 size) {
	f(thread, stride, size);
}

// Variant of kernel2D that hands each warp its own shared-memory scratch area.
// Assumes the block is exactly one warp (32 threads) wide, so threadIdx.y
// identifies the warp within the block.
template <typename FUNCTOR, int WARPS>
__global__ void kernel2DWarpSM(FUNCTOR f, ushort2 size) {
	const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)};
	const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)};

	__shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS];

	f(thread, stride, size, sm[threadIdx.y]);
}
template <typename FUNCTOR>
__global__ void kernel1D(FUNCTOR f, ushort size) {
	const ushort thread = threadIdx.x+blockIdx.x*blockDim.x;
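
kernel2DWarpSM relies on the block being one warp wide so that sm[threadIdx.y] is private to a single warp. A hedged debugging variant (not in the commit; the Checked name and the device-side assert are assumptions) that makes this precondition explicit:

	#include <cassert>   // device-side assert

	template <typename FUNCTOR, int WARPS>
	__global__ void kernel2DWarpSMChecked(FUNCTOR f, ushort2 size) {
		// Each row of the block must be one full warp for the per-warp shared memory to be safe.
		assert(blockDim.x == 32 && blockDim.y == WARPS);
		const ushort2 thread{ushort(threadIdx.x+blockIdx.x*blockDim.x), ushort(threadIdx.y+blockIdx.y*blockDim.y)};
		const ushort2 stride{ushort(blockDim.x * gridDim.x), ushort(blockDim.y * gridDim.y)};
		__shared__ typename FUNCTOR::WarpSharedMemory sm[WARPS];
		f(thread, stride, size, sm[threadIdx.y]);
	}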
@@ -99,6 +118,27 @@ void parallel1DWarp(const FUNCTOR &f, int size, int size2) {
#endif
}
template <typename FUNCTOR>
void parallel1DWarpSM(const FUNCTOR &f, int size, int size2) {
#if __cplusplus >= 201703L
	// The shared-memory variant also receives a per-warp WarpSharedMemory reference.
	static_assert(std::is_invocable<FUNCTOR,ushort2,ushort2,ushort2,typename FUNCTOR::WarpSharedMemory&>::value,
		"Parallel1DWarpSM requires a valid functor: void()(ushort2,ushort2,ushort2,WarpSharedMemory&)");
#endif

#ifdef USE_GPU
	cudaSafeCall(cudaGetLastError());
	// One warp (32 lanes) per row of the block, 8 warps per block.
	const dim3 gridSize(1, (size+8-1) / 8);
	const dim3 blockSize(32, 8);
	ushort2 tsize{ushort(size2), ushort(size)};
	kernel2DWarpSM<FUNCTOR, 8><<<gridSize, blockSize, 0, 0>>>(f,tsize);
	cudaSafeCall(cudaGetLastError());
	//cudaSafeCall(cudaDeviceSynchronize());
#else
#endif
}
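
With the grid fixed at (1, ceil(size/8)) and the block at (32, 8), each image row is handled by one warp: thread.y selects the row, and the 32 lanes stride across the columns in steps of 32, which is why the SalientGradient functor can keep its per-row histogram in WarpSharedMemory without inter-warp synchronisation. A usage sketch under that reading, mirroring the StereoCSF::compute call in the first hunk:

	// One warp per row, 8 warps (rows) per block; lanes cover the columns.
	SalientGradient sg = {impl_->l.data(), impl_->gl.data(), impl_->temp.data(), impl_->l.width, impl_->l.height};
	parallel1DWarpSM(sg, l.rows(), l.cols());   // size = rows, size2 = cols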
/**
* Wrapper to initiate a parallel processing job using a given functor. The
* width and height parameters are used to determine the number of threads.