From acdbc9c9d72515cb5e7341d4ee36f336cd149d8e Mon Sep 17 00:00:00 2001 From: Sebastian Hahta <joseha@utu.fi> Date: Fri, 17 Apr 2020 16:15:58 +0300 Subject: [PATCH] fixstars median filter --- lib/libstereo/CMakeLists.txt | 4 + lib/libstereo/src/median_filter.cu | 41 +++ lib/libstereo/src/median_filter.hpp | 7 + lib/libstereo/src/median_filter_fixstars.cu | 288 +++++++++++++++++++ lib/libstereo/src/median_filter_fixstars.hpp | 26 ++ lib/libstereo/src/stereo_censussgm.cu | 14 +- 6 files changed, 372 insertions(+), 8 deletions(-) create mode 100644 lib/libstereo/src/median_filter.cu create mode 100644 lib/libstereo/src/median_filter.hpp create mode 100644 lib/libstereo/src/median_filter_fixstars.cu create mode 100644 lib/libstereo/src/median_filter_fixstars.hpp diff --git a/lib/libstereo/CMakeLists.txt b/lib/libstereo/CMakeLists.txt index 8ebb62f64..bdd9a91cc 100644 --- a/lib/libstereo/CMakeLists.txt +++ b/lib/libstereo/CMakeLists.txt @@ -38,6 +38,8 @@ if (LIBSTEREO_SHARED) src/costs/census.cu src/costs/gradient.cu src/costs/mutual_information.cu + src/median_filter.cu + src/median_filter_fixstars.cu ) set_target_properties(libstereo PROPERTIES PUBLIC_HEADER include/stereo.hpp) @@ -52,6 +54,8 @@ else() src/costs/census.cu src/costs/gradient.cu src/costs/mutual_information.cu + src/median_filter.cu + src/median_filter_fixstars.cu ) endif() diff --git a/lib/libstereo/src/median_filter.cu b/lib/libstereo/src/median_filter.cu new file mode 100644 index 000000000..99bce97fd --- /dev/null +++ b/lib/libstereo/src/median_filter.cu @@ -0,0 +1,41 @@ +#include "median_filter.hpp" +#include "median_filter_fixstars.hpp" + +#include <opencv2/imgproc.hpp> +#include <opencv2/core.hpp> +#include <opencv2/core/cuda.hpp> +#include <opencv2/core/cuda/common.hpp> + +void median_filter(cv::InputArray in, cv::OutputArray out) { + if (in.isGpuMat()) { + if (in.type() != CV_32F) { throw std::exception(); } + + auto gpuin = in.getGpuMat(); + auto gpuout = out.getGpuMatRef(); + gpuout.create(in.size(), in.type()); + + // TODO: 32F median filter + cv::cuda::GpuMat tmp1(gpuin.size(), CV_16SC1); + cv::cuda::GpuMat tmp2(gpuin.size(), CV_16SC1); + gpuin.convertTo(tmp1, CV_16SC1, 16.0, 0.0); + + median_filter((uint16_t*) tmp1.data, (uint16_t*) tmp2.data, tmp1.cols, tmp1.rows, tmp1.step1(), 0); + cudaStreamSynchronize(0); + + tmp2.convertTo(gpuout, CV_32FC1, 1.0/16.0, 0.0); + } + else { + auto hostin = in.getMat(); + auto hostout = out.getMatRef(); + cv::medianBlur(in, out, 3); + } +} + +void median_filter(Array2D<float> &in, cv::OutputArray out) { + if (out.isGpuMat()) { + median_filter(in.toGpuMat(), out); + } + else { + median_filter(in.toMat(), out); + } +} diff --git a/lib/libstereo/src/median_filter.hpp b/lib/libstereo/src/median_filter.hpp new file mode 100644 index 000000000..896593062 --- /dev/null +++ b/lib/libstereo/src/median_filter.hpp @@ -0,0 +1,7 @@ +#pragma once + +#include <opencv2/core/core.hpp> +#include "array2d.hpp" + +void median_filter(cv::InputArray in, cv::OutputArray out); +void median_filter(Array2D<float> &in, cv::OutputArray out); diff --git a/lib/libstereo/src/median_filter_fixstars.cu b/lib/libstereo/src/median_filter_fixstars.cu new file mode 100644 index 000000000..9149b2275 --- /dev/null +++ b/lib/libstereo/src/median_filter_fixstars.cu @@ -0,0 +1,288 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <opencv2/core/cuda/common.hpp> + +#include "median_filter.hpp" + +namespace { + + const int BLOCK_X = 16; + const int BLOCK_Y = 16; + const int KSIZE = 3; + const int RADIUS = KSIZE / 2; + const int KSIZE_SQ = KSIZE * KSIZE; + + inline int divup(int total, int grain) + { + return (total + grain - 1) / grain; + } + + template <typename T> + __device__ inline void swap(T& x, T& y) + { + T tmp(x); + x = y; + y = tmp; + } + + // sort, min, max of 1 element + template <typename T, int V = 1> __device__ inline void dev_sort(T& x, T& y) { if (x > y) swap(x, y); } + template <typename T, int V = 1> __device__ inline void dev_min(T& x, T& y) { x = min(x, y); } + template <typename T, int V = 1> __device__ inline void dev_max(T& x, T& y) { y = max(x, y); } + + // sort, min, max of 2 elements + __device__ inline void dev_sort_2(uint32_t& x, uint32_t& y) + { + const uint32_t mask = __vcmpgtu2(x, y); + const uint32_t tmp = (x ^ y) & mask; + x ^= tmp; + y ^= tmp; + } + __device__ inline void dev_min_2(uint32_t& x, uint32_t& y) { x = __vminu2(x, y); } + __device__ inline void dev_max_2(uint32_t& x, uint32_t& y) { y = __vmaxu2(x, y); } + + template <> __device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_sort_2(x, y); } + template <> __device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_min_2(x, y); } + template <> __device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_max_2(x, y); } + + // sort, min, max of 4 elements + __device__ inline void dev_sort_4(uint32_t& x, uint32_t& y) + { + const uint32_t mask = __vcmpgtu4(x, y); + const uint32_t tmp = (x ^ y) & mask; + x ^= tmp; + y ^= tmp; + } + __device__ inline void dev_min_4(uint32_t& x, uint32_t& y) { x = __vminu4(x, y); } + __device__ inline void dev_max_4(uint32_t& x, uint32_t& y) { y = __vmaxu4(x, y); } + + template <> __device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_sort_4(x, y); } + template <> __device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_min_4(x, y); } + template <> __device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_max_4(x, y); } + + template <typename T, int V = 1> + __device__ inline void median_selection_network_9(T* buf) + { +#define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j]) +#define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j]) +#define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j]) + + SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7); + SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8); + SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7); + MAX_OP(0, 3); MAX_OP(3, 6); + SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4); + MIN_OP(5, 8); MIN_OP(2, 5); + SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4); + +#undef SWAP_OP +#undef MIN_OP +#undef MAX_OP + } + + __global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS) + return; + + uint8_t buf[KSIZE_SQ]; + for (int i = 0; i < KSIZE_SQ; i++) + buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf); + + dst[y * p + x] = buf[KSIZE_SQ / 2]; + } + + __global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS) + return; + + uint16_t buf[KSIZE_SQ]; + for (int i = 0; i < KSIZE_SQ; i++) + buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf); + + dst[y * p + x] = buf[KSIZE_SQ / 2]; + } + + __global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch) + { + const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (y < RADIUS || y >= h - RADIUS) + return; + + uint32_t buf[KSIZE_SQ]; + if (x_4 >= 4 && x_4 + 7 < w) + { + buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]); + buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]); + buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]); + + buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]); + buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]); + buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]); + + buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]); + buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]); + buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]); + + buf[0] = (buf[1] << 8) | (buf[0] >> 24); + buf[2] = (buf[1] >> 8) | (buf[2] << 24); + + buf[3] = (buf[4] << 8) | (buf[3] >> 24); + buf[5] = (buf[4] >> 8) | (buf[5] << 24); + + buf[6] = (buf[7] << 8) | (buf[6] >> 24); + buf[8] = (buf[7] >> 8) | (buf[8] << 24); + + median_selection_network_9<uint32_t, 4>(buf); + + *((uint32_t*)&dst[y * pitch + x_4]) = buf[KSIZE_SQ / 2]; + } + else if (x_4 == 0) + { + for (int x = RADIUS; x < 4; x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + else if (x_4 < w) + { + for (int x = x_4; x < min(x_4 + 4, w - RADIUS); x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + } + + __global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch) + { + const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x); + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (y < RADIUS || y >= h - RADIUS) + return; + + uint32_t buf[KSIZE_SQ]; + if (x_2 >= 2 && x_2 + 3 < w) + { + buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 2]); + buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]); + buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]); + + buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]); + buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]); + buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]); + + buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]); + buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]); + buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]); + + buf[0] = (buf[1] << 16) | (buf[0] >> 16); + buf[2] = (buf[1] >> 16) | (buf[2] << 16); + + buf[3] = (buf[4] << 16) | (buf[3] >> 16); + buf[5] = (buf[4] >> 16) | (buf[5] << 16); + + buf[6] = (buf[7] << 16) | (buf[6] >> 16); + buf[8] = (buf[7] >> 16) | (buf[8] << 16); + + median_selection_network_9<uint32_t, 2>(buf); + + *((uint32_t*)&dst[y * pitch + x_2]) = buf[KSIZE_SQ / 2]; + } + else if (x_2 == 0) + { + for (int x = RADIUS; x < 2; x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + else if (x_2 < w) + { + for (int x = x_2; x < min(x_2 + 2, w - RADIUS); x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + } +} + +void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream) { + + if (pitch % 4 == 0) { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width / 4, block.x), divup(height, block.y)); + median_kernel_3x3_8u_v4<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); + } + else { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width, block.x), divup(height, block.y)); + median_kernel_3x3_8u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); + } + + cudaSafeCall(cudaGetLastError()); +} + +void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream) { + + if (pitch % 2 == 0) { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width / 2, block.x), divup(height, block.y)); + median_kernel_3x3_16u_v2<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); + } + else { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width, block.x), divup(height, block.y)); + median_kernel_3x3_16u<<<grid, block, 0, stream>>>(d_src, d_dst, width, height, pitch); + } + + cudaSafeCall(cudaGetLastError()); +} diff --git a/lib/libstereo/src/median_filter_fixstars.hpp b/lib/libstereo/src/median_filter_fixstars.hpp new file mode 100644 index 000000000..3cacf4f19 --- /dev/null +++ b/lib/libstereo/src/median_filter_fixstars.hpp @@ -0,0 +1,26 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#pragma once + +#include <cstdint> + +#include <cuda_runtime.h> +#include <cuda_runtime_api.h> + + +void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch, cudaStream_t stream); +void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch, cudaStream_t stream); diff --git a/lib/libstereo/src/stereo_censussgm.cu b/lib/libstereo/src/stereo_censussgm.cu index f3969a5f3..6e2a4c36b 100644 --- a/lib/libstereo/src/stereo_censussgm.cu +++ b/lib/libstereo/src/stereo_censussgm.cu @@ -2,6 +2,8 @@ #include <opencv2/imgproc.hpp> #include <opencv2/core/cuda/common.hpp> +#include <nppi.h> + #include "stereo.hpp" #include "util_opencv.hpp" @@ -12,6 +14,8 @@ #include "cost_aggregation.hpp" #include "aggregations/standard_sgm.hpp" +#include "median_filter.hpp" + #ifdef __GNUG__ #include <chrono> @@ -108,14 +112,8 @@ void StereoCensusSgm::compute(cv::InputArray l, cv::InputArray r, cv::OutputArra cudaSafeCall(cudaDeviceSynchronize()); if (params.debug) { timer_print("WTA"); } - if (disparity.isGpuMat()) { - impl_->wta.disparity.toGpuMat(disparity.getGpuMatRef()); - } - else { - cv::Mat &disparity_ = disparity.getMatRef(); - impl_->wta.disparity.toMat(disparity_); - cv::medianBlur(disparity_, disparity_, 3); - } + median_filter(impl_->wta.disparity, disparity); + if (params.debug) { timer_print("median filter"); } // confidence estimate -- GitLab