Skip to content
Snippets Groups Projects
Commit acdbc9c9 authored by Sebastian Hahta's avatar Sebastian Hahta
Browse files

fixstars median filter

parent 435afff4
No related branches found
No related tags found
No related merge requests found
Pipeline #24384 passed
...@@ -38,6 +38,8 @@ if (LIBSTEREO_SHARED) ...@@ -38,6 +38,8 @@ if (LIBSTEREO_SHARED)
src/costs/census.cu src/costs/census.cu
src/costs/gradient.cu src/costs/gradient.cu
src/costs/mutual_information.cu src/costs/mutual_information.cu
src/median_filter.cu
src/median_filter_fixstars.cu
) )
set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp) set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp)
...@@ -52,6 +54,8 @@ else() ...@@ -52,6 +54,8 @@ else()
src/costs/census.cu src/costs/census.cu
src/costs/gradient.cu src/costs/gradient.cu
src/costs/mutual_information.cu src/costs/mutual_information.cu
src/median_filter.cu
src/median_filter_fixstars.cu
) )
endif() endif()
......
#include "median_filter.hpp"
#include "median_filter_fixstars.hpp"
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/cuda/common.hpp>
void median_filter(cv::InputArray in, cv::OutputArray out) {
if (in.isGpuMat()) {
if (in.type() != CV_32F) { throw std::exception(); }
auto gpuin = in.getGpuMat();
auto gpuout = out.getGpuMatRef();
gpuout.create(in.size(), in.type());
// TODO: 32F median filter
cv::cuda::GpuMat tmp1(gpuin.size(), CV_16SC1);
cv::cuda::GpuMat tmp2(gpuin.size(), CV_16SC1);
gpuin.convertTo(tmp1, CV_16SC1, 16.0, 0.0);
median_filter((uint16_t*) tmp1.data, (uint16_t*) tmp2.data, tmp1.cols, tmp1.rows, tmp1.step1(), 0);
cudaStreamSynchronize(0);
tmp2.convertTo(gpuout, CV_32FC1, 1.0/16.0, 0.0);
}
else {
auto hostin = in.getMat();
auto hostout = out.getMatRef();
cv::medianBlur(in, out, 3);
}
}
void median_filter(Array2D<float> &in, cv::OutputArray out) {
if (out.isGpuMat()) {
median_filter(in.toGpuMat(), out);
}
else {
median_filter(in.toMat(), out);
}
}
#pragma once
#include <opencv2/core/core.hpp>
#include "array2d.hpp"
void median_filter(cv::InputArray in, cv::OutputArray out);
void median_filter(Array2D<float> &in, cv::OutputArray out);
/*
Copyright 2016 Fixstars Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http ://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <opencv2/core/cuda/common.hpp>
#include "median_filter.hpp"
namespace {
const int BLOCK_X = 16;
const int BLOCK_Y = 16;
const int KSIZE = 3;
const int RADIUS = KSIZE / 2;
const int KSIZE_SQ = KSIZE * KSIZE;
inline int divup(int total, int grain)
{
return (total + grain - 1) / grain;
}
template <typename T>
__device__ inline void swap(T& x, T& y)
{
T tmp(x);
x = y;
y = tmp;
}
// sort, min, max of 1 element
template <typename T, int V = 1> __device__ inline void dev_sort(T& x, T& y) { if (x > y) swap(x, y); }
template <typename T, int V = 1> __device__ inline void dev_min(T& x, T& y) { x = min(x, y); }
template <typename T, int V = 1> __device__ inline void dev_max(T& x, T& y) { y = max(x, y); }
// sort, min, max of 2 elements
__device__ inline void dev_sort_2(uint32_t& x, uint32_t& y)
{
const uint32_t mask = __vcmpgtu2(x, y);
const uint32_t tmp = (x ^ y) & mask;
x ^= tmp;
y ^= tmp;
}
__device__ inline void dev_min_2(uint32_t& x, uint32_t& y) { x = __vminu2(x, y); }
__device__ inline void dev_max_2(uint32_t& x, uint32_t& y) { y = __vmaxu2(x, y); }
template <> __device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_sort_2(x, y); }
template <> __device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_min_2(x, y); }
template <> __device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_max_2(x, y); }
// sort, min, max of 4 elements
__device__ inline void dev_sort_4(uint32_t& x, uint32_t& y)
{
const uint32_t mask = __vcmpgtu4(x, y);
const uint32_t tmp = (x ^ y) & mask;
x ^= tmp;
y ^= tmp;
}
__device__ inline void dev_min_4(uint32_t& x, uint32_t& y) { x = __vminu4(x, y); }
__device__ inline void dev_max_4(uint32_t& x, uint32_t& y) { y = __vmaxu4(x, y); }
template <> __device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_sort_4(x, y); }
template <> __device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_min_4(x, y); }
template <> __device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_max_4(x, y); }
template <typename T, int V = 1>
__device__ inline void median_selection_network_9(T* buf)
{
#define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j])
#define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j])
#define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j])
SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8);
SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
MAX_OP(0, 3); MAX_OP(3, 6);
SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4);
MIN_OP(5, 8); MIN_OP(2, 5);
SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4);
#undef SWAP_OP
#undef MIN_OP
#undef MAX_OP
}
__global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS)
return;
uint8_t buf[KSIZE_SQ];
for (int i = 0; i < KSIZE_SQ; i++)
buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf);
dst[y * p + x] = buf[KSIZE_SQ / 2];
}
__global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS)
return;
uint16_t buf[KSIZE_SQ];
for (int i = 0; i < KSIZE_SQ; i++)
buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf);
dst[y * p + x] = buf[KSIZE_SQ / 2];
}
__global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch)
{
const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < RADIUS || y >= h - RADIUS)
return;
uint32_t buf[KSIZE_SQ];
if (x_4 >= 4 && x_4 + 7 < w)
{
buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]);
buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]);
buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]);
buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]);
buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]);
buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]);
buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]);
buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]);
buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]);
buf[0] = (buf[1] << 8) | (buf[0] >> 24);
buf[2] = (buf[1] >> 8) | (buf[2] << 24);
buf[3] = (buf[4] << 8) | (buf[3] >> 24);
buf[5] = (buf[4] >> 8) | (buf[5] << 24);
buf[6] = (buf[7] << 8) | (buf[6] >> 24);
buf[8] = (buf[7] >> 8) | (buf[8] << 24);
median_selection_network_9<uint32_t, 4>(buf);
*((uint32_t*)&dst[y * pitch + x_4]) = buf[KSIZE_SQ / 2];
}
else if (x_4 == 0)
{
for (int x = RADIUS; x < 4; x++)
{
uint8_t* buf_u8 = (uint8_t*)buf;
for (int i = 0; i < KSIZE_SQ; i++)
buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf_u8);
dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
}
}
else if (x_4 < w)
{
for (int x = x_4; x < min(x_4 + 4, w - RADIUS); x++)
{
uint8_t* buf_u8 = (uint8_t*)buf;
for (int i = 0; i < KSIZE_SQ; i++)
buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf_u8);
dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
}
}
}
__global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch)
{
const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < RADIUS || y >= h - RADIUS)
return;
uint32_t buf[KSIZE_SQ];
if (x_2 >= 2 && x_2 + 3 < w)
{
buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 2]);
buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]);
buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]);
buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]);
buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]);
buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]);
buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]);
buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]);
buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]);
buf[0] = (buf[1] << 16) | (buf[0] >> 16);
buf[2] = (buf[1] >> 16) | (buf[2] << 16);
buf[3] = (buf[4] << 16) | (buf[3] >> 16);
buf[5] = (buf[4] >> 16) | (buf[5] << 16);
buf[6] = (buf[7] << 16) | (buf[6] >> 16);
buf[8] = (buf[7] >> 16) | (buf[8] << 16);
median_selection_network_9<uint32_t, 2>(buf);
*((uint32_t*)&dst[y * pitch + x_2]) = buf[KSIZE_SQ / 2];
}
else if (x_2 == 0)
{
for (int x = RADIUS; x < 2; x++)
{
uint8_t* buf_u8 = (uint8_t*)buf;
for (int i = 0; i < KSIZE_SQ; i++)
buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf_u8);
dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
}
}
else if (x_2 < w)
{
for (int x = x_2; x < min(x_2 + 2, w - RADIUS); x++)
{
uint8_t* buf_u8 = (uint8_t*)buf;
for (int i = 0; i < KSIZE_SQ; i++)
buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
median_selection_network_9(buf_u8);
dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
}
}
}
}
void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream) {
if (pitch % 4 == 0) {
const dim3 block(BLOCK_X, BLOCK_Y);
const dim3 grid(divup(width / 4, block.x), divup(height, block.y));
median_kernel_3x3_8u_v4<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
}
else {
const dim3 block(BLOCK_X, BLOCK_Y);
const dim3 grid(divup(width, block.x), divup(height, block.y));
median_kernel_3x3_8u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
}
cudaSafeCall(cudaGetLastError());
}
void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream) {
if (pitch % 2 == 0) {
const dim3 block(BLOCK_X, BLOCK_Y);
const dim3 grid(divup(width / 2, block.x), divup(height, block.y));
median_kernel_3x3_16u_v2<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
}
else {
const dim3 block(BLOCK_X, BLOCK_Y);
const dim3 grid(divup(width, block.x), divup(height, block.y));
median_kernel_3x3_16u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch);
}
cudaSafeCall(cudaGetLastError());
}
/*
Copyright 2016 Fixstars Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http ://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream);
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
#include <opencv2/imgproc.hpp> #include <opencv2/imgproc.hpp>
#include <opencv2/core/cuda/common.hpp> #include <opencv2/core/cuda/common.hpp>
#include <nppi.h>
#include "stereo.hpp" #include "stereo.hpp"
#include "util_opencv.hpp" #include "util_opencv.hpp"
...@@ -12,6 +14,8 @@ ...@@ -12,6 +14,8 @@
#include "cost_aggregation.hpp" #include "cost_aggregation.hpp"
#include "aggregations/standard_sgm.hpp" #include "aggregations/standard_sgm.hpp"
#include "median_filter.hpp"
#ifdef __GNUG__ #ifdef __GNUG__
#include <chrono> #include <chrono>
...@@ -108,14 +112,8 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra ...@@ -108,14 +112,8 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
if (params.debug) { timer_print("WTA"); } if (params.debug) { timer_print("WTA"); }
if (disparity.isGpuMat()) { median_filter(impl_->wta.disparity, disparity);
impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef()); if (params.debug) { timer_print("median filter"); }
}
else {
cv::Mat &disparity_ = disparity.getMatRef();
impl_->wta.disparity.toMat(disparity_);
cv::medianBlur(disparity_, disparity_, 3);
}
// confidence estimate // confidence estimate
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment