diff --git a/CMakeLists.txt b/CMakeLists.txt index 376b06b36dcaa0450f21629f4ac17046fa1b3f56..d1a93b36729e0051a00f7d8c1131399c166a25a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,10 +116,12 @@ endif() # ============================================================================== if (WITH_FIXSTARS) - find_package(LibSGM REQUIRED) - if (LibSGM_FOUND) - set(HAVE_LIBSGM true) - endif() + set(HAVE_LIBSGM true) + add_subdirectory(lib/libsgm) + include_directories(lib/libsgm/include) + set_property(TARGET sgm PROPERTY FOLDER "dependencies") +else() + add_library(sgm INTERFACE) endif() if (WITH_CERES) @@ -400,7 +402,7 @@ if (BUILD_CALIBRATION) if (NOT HAVE_CERES) message(ERROR "Ceres is required") endif() - + add_subdirectory(applications/calibration-ceres) add_subdirectory(applications/calibration-multi) endif() diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt index d0d786884525f45bbf3b4dcb41efda089d0bce89..0fd5466aed057c3d1034501fee00518de5a44bf9 100644 --- a/components/operators/CMakeLists.txt +++ b/components/operators/CMakeLists.txt @@ -35,9 +35,9 @@ set(OPERSRC src/gt_analysis.cu ) -if (LIBSGM_FOUND) +if (HAVE_LIBSGM) list(APPEND OPERSRC src/disparity/fixstars_sgm.cpp) -endif (LIBSGM_FOUND) +endif (HAVE_LIBSGM) if (HAVE_OPTFLOW) list(APPEND OPERSRC @@ -52,6 +52,7 @@ target_include_directories(ftloperators PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include> PRIVATE src) -target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads ${OpenCV_LIBS}) + +target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon sgm Eigen3::Eigen Threads::Threads ${OpenCV_LIBS}) #ADD_SUBDIRECTORY(test) diff --git a/lib/libsgm/CMakeLists.txt b/lib/libsgm/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..497411f85b8f667e0bbb3516fa3e61782f3a34ec --- /dev/null +++ b/lib/libsgm/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(CUDA_ARCH "-arch=sm_50" CACHE STRING "Value of the NVCC -arch option.") + +option(ENABLE_ZED_DEMO "Build a Demo using ZED Camera" OFF) +option(ENABLE_SAMPLES "Build samples" OFF) +option(ENABLE_TESTS "Test library" OFF) +option(LIBSGM_SHARED "Build a shared library" OFF) +option(BUILD_OPENCV_WRAPPER "Make library compatible with cv::Mat and cv::cuda::GpuMat of OpenCV" OFF) + +if(WIN32) + set(ZED_SDK_LIB "C:\\Program Files (x86)\\ZED SDK\\lib\\sl_zed64.lib" CACHE STRING "ZED SDK library(sl_zed**.llb) path.") + set(ZED_SDK_INCLUDE_DIR "C:\\Program Files (x86)\\ZED SDK\\include" CACHE STRING "ZED SDK include path.") +else() + set(ZED_SDK_LIB "/usr/local/zed/lib/libsl_zed.so" CACHE STRING "ZED SDK library(sl_zed**.llb) path.") + set(ZED_SDK_INCLUDE_DIR "/usr/local/zed/include" CACHE STRING "ZED SDK include path.") +endif() + +project(libSGM VERSION 2.4.0) + +if(BUILD_OPENCV_WRAPPER) + find_package(OpenCV REQUIRED core) + include_directories(${OpenCV_INCLUDE_DIRS}) +endif() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/libsgm_config.h.in + ${CMAKE_CURRENT_SOURCE_DIR}/include/libsgm_config.h +) + +add_subdirectory(src) + +if(ENABLE_SAMPLES) + add_subdirectory(sample/image) + add_subdirectory(sample/movie) +# add_subdirectory(sample/reprojection) + add_subdirectory(sample/benchmark) + if(BUILD_OPENCV_WRAPPER) + add_subdirectory(sample/image_cv_gpumat) + endif() +endif() + +if(ENABLE_TESTS) + add_subdirectory(test) +endif() + +if(ENABLE_ZED_DEMO) + 
add_subdirectory(sample/zed) +endif() diff --git a/cmake/FindLibSGM.cmake b/lib/libsgm/FindLibSGM.cmake similarity index 73% rename from cmake/FindLibSGM.cmake rename to lib/libsgm/FindLibSGM.cmake index 37fe6ca922084499b3ee7e964ea38b5e6fbbbfb6..ac950c25175cc43df740f5c3ee76e09bce4545e3 100644 --- a/cmake/FindLibSGM.cmake +++ b/lib/libsgm/FindLibSGM.cmake @@ -6,36 +6,28 @@ # LIBSGM_INCLUDE_DIRS - Directories containing the LIBSGM include files. # LIBSGM_LIBRARY - Libraries needed to use LIBSGM. -if(WIN32) -find_path(libSGM_DIR libSGM PATHS "C:/Program Files" "C:/Program Files (x86)") -set(libSGM_DIR ${libSGM_DIR}/libSGM) -else() -set(glog_DIR "") -endif() - # Find lib set(LIBSGM_FOUND FALSE CACHE BOOL "" FORCE) find_library(LIBSGM_LIBRARY NAMES sgm libsgm - PATHS ${libSGM_DIR} PATH_SUFFIXES lib/ ) # Find include find_path(LIBSGM_INCLUDE_DIRS NAMES libsgm.h - PATHS ${libSGM_DIR} PATH_SUFFIXES include/ ) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(LibSGM DEFAULT_MSG LIBSGM_LIBRARY LIBSGM_INCLUDE_DIRS) +message(STATUS "(LIBSGM_FOUND : ${LIBSGM_FOUND} include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})") + mark_as_advanced(LIBSGM_FOUND) if(LIBSGM_FOUND) - include_directories(${LIBSGM_INCLUDE_DIRS}) set(LIBSGM_FOUND TRUE CACHE BOOL "" FORCE) set(LIBSGM_LIBRARIES ${LIBSGM_LIBRARY}) - message(STATUS "Found libSGM") + message(STATUS "LibSGM found ( include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})") endif() diff --git a/lib/libsgm/LICENSE b/lib/libsgm/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7 --- /dev/null +++ b/lib/libsgm/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/lib/libsgm/README.jp.md b/lib/libsgm/README.jp.md
new file mode 100644
index 0000000000000000000000000000000000000000..5811b94dc5f357e99f2aa801ea906f9a876d67d9
--- /dev/null
+++ b/lib/libsgm/README.jp.md
@@ -0,0 +1,88 @@
+# libSGM
+---
+A CUDA implementation based on Semi-Global Matching.
+
+## Overview
+---
+
+libSGM is a CUDA implementation of the Semi-Global Matching algorithm.
+From two appropriately calibrated input images, it can obtain a disparity image.
+
+## Features
+---
+Uses CUDA to compute disparity images at high speed.
+
+## Performance
+Processing times measured with the benchmark sample are shown below.
+### Settings
+- image size : 1024 x 440
+- disparity size : 128
+- sgm path : 8 path
+
+### Results
+|Device|Processing Time[Milliseconds]|FPS|
+|---|---|---|
+|Tegra X2|52.4|19.1|
+|GTX 1080 Ti|3.4|296|
+
+## Requirements
+libSGM requires CUDA (compute capabilities >= 3.0).
+In addition, the following libraries are needed to build the samples:
+- OpenCV 3.0 or later
+- CMake 3.10 or later
+
+## build
+```
+$ git clone https://github.com/fixstars/libSGM.git
+$ cd libSGM
+$ mkdir build
+$ cd build
+$ cmake ../
+$ make
+```
+
+## Sample Execution
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cd sample/movie/
+$ ./stereo_movie <left image path format> <right image path format> <disparity> <frame count>
+left image path format: the format used for the file paths to the left input images
+right image path format: the format used for the file paths to the right input images
+disparity: the maximum number of disparities (optional)
+frame count: the total number of images (optional)
+```
+
+disparity and frame count may be omitted. By default, they are 64 and 100, respectively.
+
+Here, left image path format and right image path format are the path formats used when reading the files.
+When a sequence of numbered files such as the following is given, the path formats to pass are shown below.
+```
+left_image_0000.pgm
+left_image_0001.pgm
+left_image_0002.pgm
+left_image_0003.pgm
+...
+
+right_image_0000.pgm
+right_image_0001.pgm
+right_image_0002.pgm
+right_image_0003.pgm
+```
+
+```
+$ ./stereo_movie left_image_%04d.pgm right_image_%04d.pgm
+```
+
+The movie and image samples have been tested with the
+Daimler Urban Scene Segmentation Benchmark Dataset provided at
+http://www.6d-vision.com/scene-labeling
+
+## Authors
+The "SGM Team": Samuel Audet, Yoriyuki Kitta, Yuta Noto, Ryo Sakamoto, Akihiro Takagi
+[Fixstars Corporation](http://www.fixstars.com/)
+
+## License
+Apache License 2.0
diff --git a/lib/libsgm/README.md b/lib/libsgm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e80ade9fc24868b8efe203ab20e54274d13ef72c
--- /dev/null
+++ b/lib/libsgm/README.md
@@ -0,0 +1,88 @@
+# libSGM
+---
+A CUDA implementation performing Semi-Global Matching.
+
+## Introduction
+---
+
+libSGM is a library that implements the Semi-Global Matching algorithm in CUDA.
+From a pair of appropriately calibrated input images, we can obtain the disparity map.
+
+## Features
+---
+Because it uses CUDA, we can compute the disparity map at high speed.
+
+## Performance
+The libSGM performance obtained from the benchmark sample
+### Settings
+- image size : 1024 x 440
+- disparity size : 128
+- sgm path : 8 path
+
+### Results
+|Device|Processing Time[Milliseconds]|FPS|
+|---|---|---|
+|Tegra X2|52.4|19.1|
+|GTX 1080 Ti|3.4|296|
+
+## Requirements
+libSGM needs CUDA (compute capabilities >= 3.0) to be installed.
+Moreover, to build the sample, we need the following libraries: +- OpenCV 3.0 or later +- CMake 3.10 or later + +## Build Instructions +``` +$ git clone https://github.com/fixstars/libSGM.git +$ cd libSGM +$ mkdir build +$ cd build +$ cmake ../ +$ make +``` + +## Sample Execution +``` +$ pwd +.../libSGM +$ cd build +$ cd sample/movie/ +$ ./stereo_movie <left image path format> <right image path format> <disparity> <frame count> +left image path format: the format used for the file paths to the left input images +right image path format: the format used for the file paths to the right input images +disparity: the maximum number of disparities (optional) +frame count: the total number of images (optional) +``` + +"disparity" and "frame count" are optional. By default, they are 64 and 100, respectively. + +Next, we explain the meaning of the "left image path format" and "right image path format". +When provided with the following set of files, we should pass the "path formats" given below. +``` +left_image_0000.pgm +left_image_0001.pgm +left_image_0002.pgm +left_image_0003.pgm +... + +right_image_0000.pgm +right_image_0001.pgm +right_image_0002.pgm +right_image_0003.pgm +``` + +``` +$ ./stereo_movie left_image_%04d.pgm right_image_%04d.pgm +``` + +The sample movie images available at +http://www.6d-vision.com/scene-labeling +under "Daimler Urban Scene Segmentation Benchmark Dataset" +are used to test the software. + +## Authors +The "SGM Team": Samuel Audet, Yoriyuki Kitta, Yuta Noto, Ryo Sakamoto, Akihiro Takagi +[Fixstars Corporation](http://www.fixstars.com/) + +## License +Apache License 2.0 diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h new file mode 100644 index 0000000000000000000000000000000000000000..7e69ed8aef137ebf90878e49616a3e87b2bc3bf7 --- /dev/null +++ b/lib/libsgm/include/libsgm.h @@ -0,0 +1,155 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#pragma once + +/** +* @mainpage stereo-sgm +* See sgm::StereoSGM +*/ + +/** +* @file libsgm.h +* stereo-sgm main header +*/ + +#include <stdint.h> +#include "libsgm_config.h" + +#if defined(LIBSGM_SHARED) + #if defined(WIN32) || defined(_WIN32) + #if defined sgm_EXPORTS + #define LIBSGM_API __declspec(dllexport) + #else + #define LIBSGM_API __declspec(dllimport) + #endif + #else + #define LIBSGM_API __attribute__((visibility("default"))) + #endif +#else + #define LIBSGM_API +#endif + +namespace sgm { + struct CudaStereoSGMResources; + + /** + * @enum DST_TYPE + * Indicates input/output pointer type. 
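+	 * The encoding, as implied by the constant values below: bit 0 set means the
+	 * input pointers are CUDA device pointers, and bit 1 set means the output
+	 * pointer is a CUDA device pointer.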
+ */ + enum EXECUTE_INOUT { + EXECUTE_INOUT_HOST2HOST = (0 << 1) | 0, + EXECUTE_INOUT_HOST2CUDA = (1 << 1) | 0, + EXECUTE_INOUT_CUDA2HOST = (0 << 1) | 1, + EXECUTE_INOUT_CUDA2CUDA = (1 << 1) | 1, + }; + + /** + * @brief StereoSGM class + */ + class StereoSGM { + public: + static const int SUBPIXEL_SHIFT = 4; + static const int SUBPIXEL_SCALE = (1 << SUBPIXEL_SHIFT); + struct Parameters + { + int P1; + int P2; + float uniqueness; + bool subpixel; + Parameters(int P1 = 10, int P2 = 120, float uniqueness = 0.95f, bool subpixel = false) : P1(P1), P2(P2), uniqueness(uniqueness), subpixel(subpixel) {} + }; + + /** + * @param width Processed image's width. + * @param height Processed image's height. + * @param disparity_size It must be 64 or 128. + * @param input_depth_bits Processed image's bits per pixel. It must be 8 or 16. + * @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16. + * @param inout_type Specify input/output pointer type. See sgm::EXECUTE_TYPE. + * @attention + * output_depth_bits must be set to 16 when subpixel is enabled. + */ + LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, + EXECUTE_INOUT inout_type, const Parameters& param = Parameters()); + + /** + * @param width Processed image's width. + * @param height Processed image's height. + * @param disparity_size It must be 64 or 128. + * @param input_depth_bits Processed image's bits per pixel. It must be 8 or 16. + * @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16. + * @param src_pitch Source image's pitch (pixels). + * @param dst_pitch Destination image's pitch (pixels). + * @param inout_type Specify input/output pointer type. See sgm::EXECUTE_TYPE. + * @attention + * output_depth_bits must be set to 16 when subpixel is enabled. + */ + LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch, + EXECUTE_INOUT inout_type, const Parameters& param = Parameters()); + + LIBSGM_API virtual ~StereoSGM(); + + /** + * Execute stereo semi global matching. + * @param left_pixels A pointer stored input left image. + * @param right_pixels A pointer stored input right image. + * @param dst Output pointer. User must allocate enough memory. + * @attention + * You need to allocate dst memory at least width x height x sizeof(element_type) bytes. + * The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16. + * Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled. + */ + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst); + + /** + * Same as execute(left_pixels, right_pixels, dst) with image size parameters. + * Dimensions must be smaller or equal to dimensions provided in constructor. + */ + LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch); + + /** + * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values + * will be masked in final disparity image. + */ + LIBSGM_API void setMask(uint8_t* mask, int pitch); + + /** + * Update parameters. Returns true if successful. 
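+	 * A minimal illustrative sketch (`ssgm` is a placeholder for an already
+	 * constructed sgm::StereoSGM instance):
+	 * @code
+	 * sgm::StereoSGM::Parameters param(10, 120, 0.95f, false); // P1, P2, uniqueness, subpixel
+	 * bool ok = ssgm.updateParameters(param);
+	 * @endcode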
+ */ + LIBSGM_API bool updateParameters(const Parameters ¶ms); + + private: + StereoSGM(const StereoSGM&); + StereoSGM& operator=(const StereoSGM&); + + void cuda_resource_allocate(); + + CudaStereoSGMResources* cu_res_; + + int width_; + int height_; + int disparity_size_; + int input_depth_bits_; + int output_depth_bits_; + int src_pitch_; + int dst_pitch_; + EXECUTE_INOUT inout_type_; + Parameters param_; + }; +} + +#include "libsgm_wrapper.h" diff --git a/lib/libsgm/include/libsgm_config.h b/lib/libsgm/include/libsgm_config.h new file mode 100644 index 0000000000000000000000000000000000000000..67444c41f80c77412b25b2109bb764d2e8f57497 --- /dev/null +++ b/lib/libsgm/include/libsgm_config.h @@ -0,0 +1,13 @@ +#ifndef __LIBSGM_CONFIG_H__ +#define __LIBSGM_CONFIG_H__ + +/* #undef LIBSGM_SHARED */ + +#define LIBSGM_VERSION 2.4.0 +#define LIBSGM_VERSION_MAJOR 2 +#define LIBSGM_VERSION_MINOR 4 +#define LIBSGM_VERSION_PATCH 0 + +/* #undef BUILD_OPENCV_WRAPPER */ + +#endif // __LIBSGM_CONFIG_H__ diff --git a/lib/libsgm/include/libsgm_config.h.in b/lib/libsgm/include/libsgm_config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..26ac62442965d31dc53e319d9910197e475dc2ad --- /dev/null +++ b/lib/libsgm/include/libsgm_config.h.in @@ -0,0 +1,13 @@ +#ifndef __LIBSGM_CONFIG_H__ +#define __LIBSGM_CONFIG_H__ + +#cmakedefine LIBSGM_SHARED + +#define LIBSGM_VERSION @libSGM_VERSION@ +#define LIBSGM_VERSION_MAJOR @libSGM_VERSION_MAJOR@ +#define LIBSGM_VERSION_MINOR @libSGM_VERSION_MINOR@ +#define LIBSGM_VERSION_PATCH @libSGM_VERSION_PATCH@ + +#cmakedefine BUILD_OPENCV_WRAPPER + +#endif // __LIBSGM_CONFIG_H__ diff --git a/lib/libsgm/include/libsgm_wrapper.h b/lib/libsgm/include/libsgm_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..376be5ff458a308915ef4a61daf7c76545fa69f1 --- /dev/null +++ b/lib/libsgm/include/libsgm_wrapper.h @@ -0,0 +1,64 @@ +#ifndef __LIBSGM_WRAPPER_H__ +#define __LIBSGM_WRAPPER_H__ + +#include "libsgm.h" +#include <memory> +#ifdef BUILD_OPENCV_WRAPPER +#include <opencv2/core/cuda.hpp> +#endif + +namespace sgm { + /** + * @brief LibSGMWrapper class which is wrapper for sgm::StereoSGM. + */ + class LibSGMWrapper { + public: + /** + * @param numDisparity Maximum possible disparity value + * @param P1 Penalty on the disparity change by plus or minus 1 between nieghbor pixels + * @param P2 Penalty on the disparity change by more than 1 between neighbor pixels + * @param uniquenessRatio Margin in ratio by which the best cost function value should be at least second one + * @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled + */ + LIBSGM_API LibSGMWrapper(int numDisparity = 128, int P1 = 10, int P2 = 120, float uniquenessRatio = 0.95f, bool subpixel = false); + LIBSGM_API ~LibSGMWrapper(); + + LIBSGM_API int getNumDisparities() const; + LIBSGM_API int getP1() const; + LIBSGM_API int getP2() const; + LIBSGM_API float getUniquenessRatio() const; + LIBSGM_API bool hasSubpixel() const; + +#ifdef BUILD_OPENCV_WRAPPER + /** + * Execute stereo semi global matching via wrapper class + * @param I1 Input left image. Image's type is must be CV_8U or CV_16U + * @param I2 Input right image. Image's size and type must be same with I1. + * @param disparity Output image. Its memory will be allocated automatically dependent on input image size. + * @attention + * type of output image `disparity` is CV_16U. + * Note that disparity element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled. 
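+	 * A minimal illustrative sketch (`d_left` and `d_right` are placeholders for
+	 * rectified CV_8U or CV_16U images already uploaded to cv::cuda::GpuMat):
+	 * @code
+	 * sgm::LibSGMWrapper sgmw(128);               // 128 disparities, default penalties
+	 * cv::cuda::GpuMat d_disparity;
+	 * sgmw.execute(d_left, d_right, d_disparity); // d_disparity is allocated as CV_16U
+	 * @endcode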
+ */ + LIBSGM_API void execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity); + /** + * Execute stereo semi global matching via wrapper class + * @param I1 Input left image. Image's type is must be CV_8U or CV_16U + * @param I2 Input right image. Image's size and type must be same with I1. + * @param disparity Output image. Its memory will be allocated automatically dependent on input image size. + * @attention + * type of output image `disparity` is CV_16U. + * Note that disparity element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled. + */ + LIBSGM_API void execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity); +#endif // BUILD_OPRENCV_WRAPPER + + private: + struct Creator; + std::unique_ptr<sgm::StereoSGM> sgm_; + int numDisparity_; + sgm::StereoSGM::Parameters param_; + std::unique_ptr<Creator> prev_; + }; +} + +#endif // __LIBSGM_WRAPPER_H__ diff --git a/lib/libsgm/sample/benchmark/CMakeLists.txt b/lib/libsgm/sample/benchmark/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c20233bd31b028f103652dbdc098f6807a956de --- /dev/null +++ b/lib/libsgm/sample/benchmark/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") +endif() + +find_package(CUDA REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(../../include) + +cuda_add_executable(stereo_benchmark stereosgm_benchmark.cpp) +target_link_libraries(stereo_benchmark sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS}) diff --git a/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp b/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..28947bad4b320c494bd800a179d8c9fe043737ff --- /dev/null +++ b/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp @@ -0,0 +1,131 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include <iostream> +#include <iomanip> +#include <chrono> + +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <cuda_runtime.h> + +#include <libsgm.h> + +#define ASSERT_MSG(expr, msg) \ + if (!(expr)) { \ + std::cerr << msg << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + +struct device_buffer +{ + device_buffer() : data(nullptr) {} + device_buffer(size_t count) { allocate(count); } + void allocate(size_t count) { cudaMalloc(&data, count); } + ~device_buffer() { cudaFree(data); } + void* data; +}; + +int main(int argc, char* argv[]) +{ + if (argc < 3) { + std::cout << "usage: " << argv[0] << " left_img right_img [disp_size] [out_depth] [subpixel_enable(0: false, 1:true)] [iterations]" << std::endl; + std::exit(EXIT_FAILURE); + } + + cv::Mat I1 = cv::imread(argv[1], -1); + cv::Mat I2 = cv::imread(argv[2], -1); + + ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed."); + ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type."); + ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U."); + + const int disp_size = argc > 3 ? std::stoi(argv[3]) : 128; + const int out_depth = argc > 4 ? std::stoi(argv[4]) : 8; + const bool subpixel = argc > 5 ? std::stoi(argv[5]) != 0 : false; + + ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128."); + if (subpixel) { + ASSERT_MSG(out_depth == 16, "output depth bits must be 16 if subpixel option is enabled."); + } else { + ASSERT_MSG(out_depth == 8 || out_depth == 16, "output depth bits must be 8 or 16"); + } + const int iterations = argc > 6 ? std::stoi(argv[6]) : 100; + + const int width = I1.cols; + const int height = I1.rows; + + const int input_depth = I1.type() == CV_8U ? 8 : 16; + const int input_bytes = input_depth * width * height / 8; + const int output_bytes = out_depth * width * height / 8; + + const sgm::StereoSGM::Parameters params{10, 120, 0.95f, subpixel}; + + sgm::StereoSGM sgm(width, height, disp_size, input_depth, out_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, params); + + device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes); + cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice); + + cudaDeviceProp prop; + int version; + cudaGetDeviceProperties(&prop, 0); + cudaRuntimeGetVersion(&version); + + // show settings + std::cout << "# Settings" << std::endl; + std::cout << "device name : " << prop.name << std::endl; + std::cout << "CUDA runtime version: " << version << std::endl; + std::cout << "image size : " << I1.size() << std::endl; + std::cout << "disparity size : " << disp_size << std::endl; + std::cout << "output depth : " << out_depth << std::endl; + std::cout << "subpixel option : " << (subpixel ? "true" : "false") << std::endl; + std::cout << "sgm path : " << "8 path" << std::endl; + std::cout << "iterations : " << iterations << std::endl; + std::cout << std::endl; + + // run benchmark + std::cout << "Running benchmark..." << std::endl; + uint64_t sum = 0; + for (int i = 0; i <= iterations; i++) { + const auto t1 = std::chrono::system_clock::now(); + + sgm.execute(d_I1.data, d_I2.data, d_disparity.data); + cudaDeviceSynchronize(); + + const auto t2 = std::chrono::system_clock::now(); + if (i > 0) + sum += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); + } + std::cout << "Done." 
<< std::endl << std::endl; + + // show results + const double time_millisec = 1e-3 * sum / iterations; + const double fps = 1e3 / time_millisec; + std::cout << "# Results" << std::endl; + std::cout.setf(std::ios::fixed); + std::cout << std::setprecision(1) << "Processing Time[Milliseconds]: " << time_millisec << std::endl; + std::cout << std::setprecision(1) << "FPS : " << fps << std::endl; + std::cout << std::endl; + + // save disparity image + cv::Mat disparity(height, width, out_depth == 8 ? CV_8U : CV_16U); + cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost); + disparity *= 255. / disp_size; + cv::imwrite("disparity.png", disparity); + + return 0; +} diff --git a/lib/libsgm/sample/image/CMakeLists.txt b/lib/libsgm/sample/image/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..80e404936d9aa0f17e0f7b431707e6ee38879572 --- /dev/null +++ b/lib/libsgm/sample/image/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") +endif() + +find_package(CUDA REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(../../include) + +cuda_add_executable(stereo_test stereosgm_image.cpp) +target_link_libraries(stereo_test sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS}) diff --git a/lib/libsgm/sample/image/stereosgm_image.cpp b/lib/libsgm/sample/image/stereosgm_image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a735de5d997fef9914e3a331a481338b969af8f --- /dev/null +++ b/lib/libsgm/sample/image/stereosgm_image.cpp @@ -0,0 +1,108 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include <stdlib.h> +#include <iostream> + +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#include <opencv2/core/version.hpp> + +#include <libsgm.h> + +#define ASSERT_MSG(expr, msg) \ + if (!(expr)) { \ + std::cerr << msg << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cerr << "usage: stereosgm left_img right_img [disp_size]" << std::endl; + std::exit(EXIT_FAILURE); + } + + cv::Mat left = cv::imread(argv[1], -1); + cv::Mat right = cv::imread(argv[2], -1); + + int disp_size = 64; + if (argc >= 4) { + disp_size = atoi(argv[3]); + } + + ASSERT_MSG(left.size() == right.size() && left.type() == right.type(), "input images must be same size and type."); + //ASSERT_MSG(left.type() == CV_8U || left.type() == CV_16U, "input image format must be CV_8U or CV_16U."); + ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256."); + + cv::resize(left, left, cv::Size(left.cols/2, left.rows/2)); + cv::resize(right, right, cv::Size(right.cols/2, right.rows/2)); + + cv::cvtColor(left, left, cv::COLOR_BGR2GRAY); + cv::cvtColor(right, right, cv::COLOR_BGR2GRAY); + int bits = 0; + + switch (left.type()) { + case CV_8UC1: bits = 8; break; + case CV_16UC1: bits = 16; break; + default: + std::cerr << "invalid input image color format" << left.type() << std::endl; + std::exit(EXIT_FAILURE); + } + + sgm::StereoSGM ssgm(left.cols, left.rows, disp_size, bits, 8, sgm::EXECUTE_INOUT_HOST2HOST); + + cv::Mat output(cv::Size(left.cols, left.rows), CV_8UC1); + + ssgm.execute(left.data, right.data, output.data); + // show image + cv::imshow("image", output * 256 / disp_size); + + int key = cv::waitKey(); + int mode = 0; + while (key != 27) { + if (key == 's') { + mode += 1; + if (mode >= 3) mode = 0; + + switch (mode) { + case 0: + { + cv::setWindowTitle("image", "disparity"); + cv::imshow("image", output * 256 / disp_size); + break; + } + case 1: + { + cv::Mat m; + cv::applyColorMap(output * 256 / disp_size, m, cv::COLORMAP_JET); + cv::setWindowTitle("image", "disparity color"); + cv::imshow("image", m); + break; + } + case 2: + { + cv::setWindowTitle("image", "input"); + cv::imshow("image", left); + break; + } + } + } + key = cv::waitKey(); + } + + return 0; +} diff --git a/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt b/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8486103999ec02a94207515c0ba0db36c0b0533c --- /dev/null +++ b/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") +endif() + +find_package(CUDA REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(../../include) + +cuda_add_executable(stereo_image_cv_gpumat stereosgm_image_cv_gpumat.cpp ${CUDA_SRC}) +target_link_libraries(stereo_image_cv_gpumat sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS}) diff --git a/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp b/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b3bb7201789e1366e2c2d68831d0bcb386139b55 --- /dev/null +++ 
b/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp @@ -0,0 +1,111 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <stdlib.h> +#include <iostream> + +#include <opencv2/core/core.hpp> +#include <opencv2/core/cuda.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#include <opencv2/core/version.hpp> + +#include <libsgm.h> + +#define ASSERT_MSG(expr, msg) \ + if (!(expr)) { \ + std::cerr << msg << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + +static void execute(sgm::LibSGMWrapper& sgmw, const cv::Mat& h_left, const cv::Mat& h_right, cv::Mat& h_disparity) noexcept(false) +{ + cv::cuda::GpuMat d_left, d_right; + d_left.upload(h_left); + d_right.upload(h_right); + + cv::cuda::GpuMat d_disparity; + + sgmw.execute(d_left, d_right, d_disparity); + + // normalize result + cv::cuda::GpuMat d_normalized_disparity; + d_disparity.convertTo(d_normalized_disparity, CV_8UC1, 256. / sgmw.getNumDisparities()); + d_normalized_disparity.download(h_disparity); +} + +int main(int argc, char* argv[]) { + ASSERT_MSG(argc >= 3, "usage: stereosgm left_img right_img [disp_size]"); + + const cv::Mat left = cv::imread(argv[1], -1); + const cv::Mat right = cv::imread(argv[2], -1); + const int disp_size = argc > 3 ? 
std::atoi(argv[3]) : 128; + + ASSERT_MSG(left.size() == right.size() && left.type() == right.type(), "input images must be same size and type."); + ASSERT_MSG(left.type() == CV_8U || left.type() == CV_16U, "input image format must be CV_8U or CV_16U."); + ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128."); + + sgm::LibSGMWrapper sgmw{disp_size}; + cv::Mat disparity; + try { + execute(sgmw, left, right, disparity); + } catch (const cv::Exception& e) { + std::cerr << e.what() << std::endl; + if (e.code == cv::Error::GpuNotSupported) { + return 1; + } else { + return -1; + } + } + + // post-process for showing image + cv::Mat colored; + cv::applyColorMap(disparity, colored, cv::COLORMAP_JET); + cv::imshow("image", disparity); + + int key = cv::waitKey(); + int mode = 0; + while (key != 27) { + if (key == 's') { + mode += 1; + if (mode >= 3) mode = 0; + + switch (mode) { + case 0: + { + cv::setWindowTitle("image", "disparity"); + cv::imshow("image", disparity); + break; + } + case 1: + { + cv::setWindowTitle("image", "disparity color"); + cv::imshow("image", colored); + break; + } + case 2: + { + cv::setWindowTitle("image", "input"); + cv::imshow("image", left); + break; + } + } + } + key = cv::waitKey(); + } + + return 0; +} diff --git a/lib/libsgm/sample/movie/CMakeLists.txt b/lib/libsgm/sample/movie/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6cddac16acdacd3a9a9c4a69f458d0d56d0be0 --- /dev/null +++ b/lib/libsgm/sample/movie/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") +endif() + +find_package(CUDA REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(../../include) + +cuda_add_executable(stereo_movie stereosgm_movie.cpp) +target_link_libraries(stereo_movie sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS}) diff --git a/lib/libsgm/sample/movie/stereosgm_movie.cpp b/lib/libsgm/sample/movie/stereosgm_movie.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8082478e7d243a6cccda347eef949bae27e787e --- /dev/null +++ b/lib/libsgm/sample/movie/stereosgm_movie.cpp @@ -0,0 +1,129 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include <iostream> +#include <iomanip> +#include <string> +#include <chrono> + +#include <cuda_runtime.h> +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#include <opencv2/core/version.hpp> + +#include <libsgm.h> + +#define ASSERT_MSG(expr, msg) \ + if (!(expr)) { \ + std::cerr << msg << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + +struct device_buffer +{ + device_buffer() : data(nullptr) {} + device_buffer(size_t count) { allocate(count); } + void allocate(size_t count) { cudaMalloc(&data, count); } + ~device_buffer() { cudaFree(data); } + void* data; +}; + +template <class... Args> +static std::string format_string(const char* fmt, Args... args) +{ + const int BUF_SIZE = 1024; + char buf[BUF_SIZE]; + std::snprintf(buf, BUF_SIZE, fmt, args...); + return std::string(buf); +} + +int main(int argc, char* argv[]) +{ + if (argc < 3) { + std::cout << "usage: " << argv[0] << " left-image-format right-image-format [disp_size]" << std::endl; + std::exit(EXIT_FAILURE); + } + + const int first_frame = 1; + + cv::Mat I1 = cv::imread(format_string(argv[1], first_frame), -1); + cv::Mat I2 = cv::imread(format_string(argv[2], first_frame), -1); + const int disp_size = argc >= 4 ? std::stoi(argv[3]) : 128; + + ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed."); + ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type."); + ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U."); + ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128."); + + const int width = I1.cols; + const int height = I1.rows; + + const int input_depth = I1.type() == CV_8U ? 8 : 16; + const int input_bytes = input_depth * width * height / 8; + const int output_depth = 8; + const int output_bytes = output_depth * width * height / 8; + + sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, sgm::EXECUTE_INOUT_CUDA2CUDA); + + cv::Mat disparity(height, width, output_depth == 8 ? CV_8U : CV_16U); + cv::Mat disparity_8u, disparity_color; + + device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes); + + for (int frame_no = first_frame;; frame_no++) { + + I1 = cv::imread(format_string(argv[1], frame_no), -1); + I2 = cv::imread(format_string(argv[2], frame_no), -1); + if (I1.empty() || I2.empty()) { + frame_no = first_frame; + continue; + } + + cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice); + + const auto t1 = std::chrono::system_clock::now(); + + sgm.execute(d_I1.data, d_I2.data, d_disparity.data); + cudaDeviceSynchronize(); + + const auto t2 = std::chrono::system_clock::now(); + const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); + const double fps = 1e6 / duration; + + cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost); + + // draw results + if (I1.type() != CV_8U) { + cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX); + I1.convertTo(I1, CV_8U); + } + + disparity.convertTo(disparity_8u, CV_8U, 255. 
/ disp_size); + cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET); + cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps), + cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255)); + + cv::imshow("left image", I1); + cv::imshow("disparity", disparity_color); + const char c = cv::waitKey(1); + if (c == 27) // ESC + break; + } + + return 0; +} diff --git a/lib/libsgm/sample/reprojection/CMakeLists.txt b/lib/libsgm/sample/reprojection/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..43b584ab9b0ff1785198db7d581a20de03492d40 --- /dev/null +++ b/lib/libsgm/sample/reprojection/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") +endif() + +find_package(CUDA REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(../../include) + +cuda_add_executable(stereo_reprojection stereosgm_reprojection.cpp) +target_link_libraries(stereo_reprojection sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS}) diff --git a/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml b/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml new file mode 100644 index 0000000000000000000000000000000000000000..002fbada79b183172b29f618089d6c69f94e6a52 --- /dev/null +++ b/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml @@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<opencv_storage> + +<!-- Intrinsic parameters --> +<FocalLengthX>1267.485352</FocalLengthX> <!-- focal length x (pixel) --> +<FocalLengthY>1224.548950</FocalLengthY> <!-- focal length y (pixel) --> +<CenterX>472.735474</CenterX> <!-- principal point x (pixel) --> +<CenterY>175.787781</CenterY> <!-- principal point y (pixel) --> + +<!-- Extrinsic parameters --> +<BaseLine>0.214382</BaseLine> <!-- baseline (meter) --> +<Height>1.170000</Height> <!-- height position (meter) --> +<Tilt>0.081276</Tilt> <!-- tilt angle (radian) --> + +</opencv_storage> diff --git a/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml b/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml new file mode 100644 index 0000000000000000000000000000000000000000..0a0cc094a27cc5a10127a7f77ff78fbe6e2e833e --- /dev/null +++ b/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml @@ -0,0 +1,10 @@ +<?xml version="1.0"?> +<opencv_storage> +<FocalLengthX>1249.7700195</FocalLengthX> +<FocalLengthY>1249.7700195</FocalLengthY> +<CenterX>480.8460083</CenterX> +<CenterY>237.4100037</CenterY> +<BaseLine>0.2339240</BaseLine> +<Height>1.2000000</Height> +<Tilt>0.07</Tilt> +</opencv_storage> diff --git a/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp b/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36ade078a8a2f73f7818424ec6d3921a63e54241 --- /dev/null +++ b/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp @@ -0,0 +1,270 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <iostream> +#include <iomanip> +#include <string> +#include <chrono> + +#include <opencv2/core/core.hpp> +#include <opencv2/highgui/highgui.hpp> +#include <opencv2/imgproc/imgproc.hpp> +#include <cuda_runtime.h> + +#include <libsgm.h> + +#define ASSERT_MSG(expr, msg) \ + if (!(expr)) { \ + std::cerr << msg << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + +struct device_buffer +{ + device_buffer() : data(nullptr) {} + device_buffer(size_t count) { allocate(count); } + void allocate(size_t count) { cudaMalloc(&data, count); } + ~device_buffer() { cudaFree(data); } + void* data; +}; + +// Camera Parameters +struct CameraParameters +{ + float fu; //!< focal length x (pixel) + float fv; //!< focal length y (pixel) + float u0; //!< principal point x (pixel) + float v0; //!< principal point y (pixel) + float baseline; //!< baseline (meter) + float height; //!< height position (meter), ignored when ROAD_ESTIMATION_AUTO + float tilt; //!< tilt angle (radian), ignored when ROAD_ESTIMATION_AUTO +}; + +// Transformation between pixel coordinate and world coordinate +struct CoordinateTransform +{ + CoordinateTransform(const CameraParameters& camera) : camera(camera) + { + sinTilt = (sinf(camera.tilt)); + cosTilt = (cosf(camera.tilt)); + bf = camera.baseline * camera.fu; + invfu = 1.f / camera.fu; + invfv = 1.f / camera.fv; + } + + inline cv::Point3f imageToWorld(const cv::Point2f& pt, float d) const + { + const float u = pt.x; + const float v = pt.y; + + const float Zc = bf / d; + const float Xc = invfu * (u - camera.u0) * Zc; + const float Yc = invfv * (v - camera.v0) * Zc; + + const float Xw = Xc; + const float Yw = Yc * cosTilt + Zc * sinTilt; + const float Zw = Zc * cosTilt - Yc * sinTilt; + + return cv::Point3f(Xw, Yw, Zw); + } + + CameraParameters camera; + float sinTilt, cosTilt, bf, invfu, invfv; +}; + +template <class... Args> +static std::string format_string(const char* fmt, Args... 
args) +{ + const int BUF_SIZE = 1024; + char buf[BUF_SIZE]; + std::snprintf(buf, BUF_SIZE, fmt, args...); + return std::string(buf); +} + +static cv::Scalar computeColor(float val) +{ + const float hscale = 6.f; + float h = 0.6f * (1.f - val), s = 1.f, v = 1.f; + float r, g, b; + + static const int sector_data[][3] = + { { 1,3,0 },{ 1,0,2 },{ 3,0,1 },{ 0,2,1 },{ 0,1,3 },{ 2,1,0 } }; + float tab[4]; + int sector; + h *= hscale; + if (h < 0) + do h += 6; while (h < 0); + else if (h >= 6) + do h -= 6; while (h >= 6); + sector = cvFloor(h); + h -= sector; + if ((unsigned)sector >= 6u) + { + sector = 0; + h = 0.f; + } + + tab[0] = v; + tab[1] = v*(1.f - s); + tab[2] = v*(1.f - s*h); + tab[3] = v*(1.f - s*(1.f - h)); + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + return 255 * cv::Scalar(b, g, r); +} + +void reprojectPointsTo3D(const cv::Mat& disparity, const CameraParameters& camera, std::vector<cv::Point3f>& points, bool subpixeled) +{ + CV_Assert(disparity.type() == CV_32F); + + CoordinateTransform tf(camera); + + points.clear(); + points.reserve(disparity.rows * disparity.cols); + + for (int y = 0; y < disparity.rows; y++) + { + for (int x = 0; x < disparity.cols; x++) + { + const float d = disparity.at<float>(y, x); + if (d > 0) + points.push_back(tf.imageToWorld(cv::Point(x, y), d)); + } + } +} + +void drawPoints3D(const std::vector<cv::Point3f>& points, cv::Mat& draw) +{ + const int SIZE_X = 512; + const int SIZE_Z = 1024; + const int maxz = 20; // [meter] + const double pixelsPerMeter = 1. * SIZE_Z / maxz; + + draw = cv::Mat::zeros(SIZE_Z, SIZE_X, CV_8UC3); + + for (const cv::Point3f& pt : points) + { + const float X = pt.x; + const float Z = pt.z; + + const int u = cvRound(pixelsPerMeter * X) + SIZE_X / 2; + const int v = SIZE_Z - cvRound(pixelsPerMeter * Z); + + const cv::Scalar color = computeColor(std::min(Z, 1.f * maxz) / maxz); + cv::circle(draw, cv::Point(u, v), 1, color); + } +} + +int main(int argc, char* argv[]) +{ + if (argc < 4) { + std::cout << "usage: " << argv[0] << " left-image-format right-image-format camera.xml [disp_size] [subpixel_enable(0: false, 1:true)]" << std::endl; + std::exit(EXIT_FAILURE); + } + + const int first_frame = 1; + + cv::Mat I1 = cv::imread(format_string(argv[1], first_frame), -1); + cv::Mat I2 = cv::imread(format_string(argv[2], first_frame), -1); + const cv::FileStorage cvfs(argv[3], cv::FileStorage::READ); + const int disp_size = argc >= 5 ? std::stoi(argv[4]) : 128; + const bool subpixel = argc >= 6 ? std::stoi(argv[5]) != 0 : true; + const int output_depth = 16; + + ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed."); + ASSERT_MSG(cvfs.isOpened(), "camera.xml read failed."); + ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type."); + ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U."); + ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128."); + + // read camera parameters + const cv::FileNode node(cvfs.fs, NULL); + CameraParameters camera; + camera.fu = node["FocalLengthX"]; + camera.fv = node["FocalLengthY"]; + camera.u0 = node["CenterX"]; + camera.v0 = node["CenterY"]; + camera.baseline = node["BaseLine"]; + camera.tilt = node["Tilt"]; + + const int width = I1.cols; + const int height = I1.rows; + + const int input_depth = I1.type() == CV_8U ? 
8 : 16; + const int input_bytes = input_depth * width * height / 8; + const int output_bytes = output_depth * width * height / 8; + + const sgm::StereoSGM::Parameters params{10, 120, 0.95f, subpixel}; + + sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, params); + + cv::Mat disparity(height, width, output_depth == 8 ? CV_8U : CV_16U); + cv::Mat disparity_8u, disparity_32f, disparity_color, draw; + std::vector<cv::Point3f> points; + + device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes); + + for (int frame_no = first_frame;; frame_no++) { + + I1 = cv::imread(format_string(argv[1], frame_no), -1); + I2 = cv::imread(format_string(argv[2], frame_no), -1); + if (I1.empty() || I2.empty()) { + frame_no = first_frame; + continue; + } + + cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice); + + const auto t1 = std::chrono::system_clock::now(); + + sgm.execute(d_I1.data, d_I2.data, d_disparity.data); + cudaDeviceSynchronize(); + + const auto t2 = std::chrono::system_clock::now(); + const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); + const double fps = 1e6 / duration; + + cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost); + + // draw results + if (I1.type() != CV_8U) { + cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX); + I1.convertTo(I1, CV_8U); + } + + disparity.convertTo(disparity_32f, CV_32F, subpixel ? 1. / sgm::StereoSGM::SUBPIXEL_SCALE : 1); + reprojectPointsTo3D(disparity_32f, camera, points, subpixel); + drawPoints3D(points, draw); + + disparity_32f.convertTo(disparity_8u, CV_8U, 255. / disp_size); + cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET); + cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps), + cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255)); + + cv::imshow("left image", I1); + cv::imshow("disparity", disparity_color); + cv::imshow("points", draw); + + const char c = cv::waitKey(1); + if (c == 27) // ESC + break; + } + + return 0; +} diff --git a/lib/libsgm/sample/zed/CMakeLists.txt b/lib/libsgm/sample/zed/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..30659d31473ddd50522ee1cdab04853d5a88be46 --- /dev/null +++ b/lib/libsgm/sample/zed/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../movie/cmake) + +message(STATUS ${CMAKE_MODULE_PATH}) + +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) + +find_package(ZED 2 REQUIRED) +find_package(CUDA ${ZED_CUDA_VERSION} REQUIRED) + +find_package(OpenCV REQUIRED) +if (OpenCV_VERSION VERSION_LESS 3.0) + message(FATAL_ERROR "Error: OpenCV version requires at least 3.0") +endif() + +include_directories(../../include) +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${OpenCV_INCLUDE_DIRS}) +include_directories(${ZED_INCLUDE_DIRS}) + +link_directories(${ZED_LIBRARY_DIR}) +link_directories(${CUDA_LIBRARY_DIRS}) + +if(NOT WIN32) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") +endif() + + +CUDA_ADD_EXECUTABLE(zed_demo + zed_demo.cpp +) + + +TARGET_LINK_LIBRARIES(zed_demo + sgm + ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES_ZED} + ${OpenCV_LIBS} + ${ZED_LIBRARIES} +) diff --git a/lib/libsgm/sample/zed/zed_demo.cpp b/lib/libsgm/sample/zed/zed_demo.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..9139dad8719792241b41a1dac4d4070675ac595c --- /dev/null +++ b/lib/libsgm/sample/zed/zed_demo.cpp @@ -0,0 +1,108 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <stdlib.h> +#include <iostream> +#include <iomanip> +#include <string> +#include <chrono> + +#include <cuda_runtime.h> +#include <opencv2/opencv.hpp> +#include <opencv2/core/core.hpp> +#include <opencv2/imgproc/imgproc.hpp> + +#include <sl/Camera.hpp> + +#include <libsgm.h> + +template <class... Args> +static std::string format_string(const char* fmt, Args... args) +{ + const int BUF_SIZE = 1024; + char buf[BUF_SIZE]; + std::snprintf(buf, BUF_SIZE, fmt, args...); + return std::string(buf); +} + +struct device_buffer { + device_buffer() : data(nullptr) {} + device_buffer(size_t count) { allocate(count); } + void allocate(size_t count) { cudaMalloc(&data, count); } + ~device_buffer() { cudaFree(data); } + void* data; +}; + +int main(int argc, char* argv[]) { + + const int disp_size = 128; + + sl::Camera zed; + sl::InitParameters initParameters; + initParameters.camera_resolution = sl::RESOLUTION_VGA; + sl::ERROR_CODE err = zed.open(initParameters); + if (err != sl::SUCCESS) { + std::cout << toString(err) << std::endl; + zed.close(); + return 1; + } + const int width = static_cast<int>(zed.getResolution().width); + const int height = static_cast<int>(zed.getResolution().height); + + sl::Mat d_zed_image_l(zed.getResolution(), sl::MAT_TYPE_8U_C1, sl::MEM_GPU); + sl::Mat d_zed_image_r(zed.getResolution(), sl::MAT_TYPE_8U_C1, sl::MEM_GPU); + + const int input_depth = 8; + const int output_depth = 8; + const int output_bytes = output_depth * width * height / 8; + + CV_Assert(d_zed_image_l.getStep(sl::MEM_GPU) == d_zed_image_r.getStep(sl::MEM_GPU)); + sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, static_cast<int>(d_zed_image_l.getStep(sl::MEM_GPU)), width, sgm::EXECUTE_INOUT_CUDA2CUDA); + + cv::Mat disparity(height, width, CV_8U); + cv::Mat disparity_8u, disparity_color; + + device_buffer d_disparity(output_bytes); + while (1) { + if (zed.grab() == sl::SUCCESS) { + zed.retrieveImage(d_zed_image_l, sl::VIEW_LEFT_GRAY, sl::MEM_GPU); + zed.retrieveImage(d_zed_image_r, sl::VIEW_RIGHT_GRAY, sl::MEM_GPU); + } else continue; + + const auto t1 = std::chrono::system_clock::now(); + + sgm.execute(d_zed_image_l.getPtr<uchar>(sl::MEM_GPU), d_zed_image_r.getPtr<uchar>(sl::MEM_GPU), d_disparity.data); + cudaDeviceSynchronize(); + + const auto t2 = std::chrono::system_clock::now(); + const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); + const double fps = 1e6 / duration; + + cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost); + + disparity.convertTo(disparity_8u, CV_8U, 255. 
/ disp_size); + cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET); + cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps), + cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255)); + + cv::imshow("disparity", disparity_color); + const char c = cv::waitKey(1); + if (c == 27) // ESC + break; + } + zed.close(); + return 0; +} diff --git a/lib/libsgm/src/CMakeLists.txt b/lib/libsgm/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e5035877010c146dd4d0d988b263a9ccddf9df --- /dev/null +++ b/lib/libsgm/src/CMakeLists.txt @@ -0,0 +1,42 @@ +cmake_minimum_required(VERSION 3.1) + +find_package(CUDA REQUIRED) + +include_directories(../include) + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall -fPIC") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") +endif() + +SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}") + +file(GLOB STEREOSRCS "*.cu" "*.cpp") + +if(LIBSGM_SHARED) + CUDA_ADD_LIBRARY(sgm stereo_sgm.cpp ${STEREOSRCS} SHARED) + target_link_libraries(sgm ${CUDA_LIBRARIES}) + if(BUILD_OPENCV_WRAPPER) + target_link_libraries(sgm ${OpenCV_LIBS}) + endif() +else() + CUDA_ADD_LIBRARY(sgm stereo_sgm.cpp ${STEREOSRCS} STATIC) +endif() + +install( + TARGETS sgm + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin +) + +install( + DIRECTORY ${CMAKE_SOURCE_DIR}/include + DESTINATION ${CMAKE_INSTALL_PREFIX} + FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp" +) + +install( + FILES ${CMAKE_SOURCE_DIR}/FindLibSGM.cmake + DESTINATION ${CMAKE_INSTALL_PREFIX} +) diff --git a/lib/libsgm/src/census_transform.cu b/lib/libsgm/src/census_transform.cu new file mode 100644 index 0000000000000000000000000000000000000000..70f67bcffd1eb2540c1f6c64da0ced5483bca2c9 --- /dev/null +++ b/lib/libsgm/src/census_transform.cu @@ -0,0 +1,150 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
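The two GPU-input samples above both construct sgm::StereoSGM for EXECUTE_INOUT_CUDA2CUDA and hand it raw device pointers. For readers who only want the API shape, here is a minimal host-memory sketch built from the same constructor and Parameters initializer used by the movie sample, together with the EXECUTE_INOUT_HOST2HOST mode that the OpenCV wrapper further down relies on; the file handling and the parameter values are illustrative only, not part of this patch.

    #include <opencv2/opencv.hpp>
    #include <libsgm.h>

    int main(int argc, char* argv[])
    {
        if (argc < 3) return 1;
        cv::Mat left  = cv::imread(argv[1], cv::IMREAD_GRAYSCALE);
        cv::Mat right = cv::imread(argv[2], cv::IMREAD_GRAYSCALE);
        if (left.empty() || right.empty() || left.size() != right.size()) return 1;

        const int disp_size = 128;                  // 64 or 128, as in the samples
        cv::Mat disparity(left.size(), CV_16U);     // 16-bit output, no subpixel

        // P1, P2, uniqueness ratio, subpixel -- same initializer style as the movie sample
        const sgm::StereoSGM::Parameters params{10, 120, 0.95f, false};

        // width, height, max disparity, input bits, output bits, where the buffers live
        sgm::StereoSGM ssgm(left.cols, left.rows, disp_size, 8, 16,
                            sgm::EXECUTE_INOUT_HOST2HOST, params);

        // With HOST2HOST the library performs the host<->device copies internally.
        ssgm.execute(left.data, right.data, disparity.data);

        cv::Mat disparity_8u;
        disparity.convertTo(disparity_8u, CV_8U, 255.0 / disp_size);
        cv::imwrite("disparity.png", disparity_8u);
        return 0;
    }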
+*/ + +#include <cstdio> +#include "census_transform.hpp" + +namespace sgm { + +namespace { + +static constexpr int WINDOW_WIDTH = 9; +static constexpr int WINDOW_HEIGHT = 7; + +static constexpr int BLOCK_SIZE = 128; +static constexpr int LINES_PER_BLOCK = 16; + +template <typename T> +__global__ void census_transform_kernel( + feature_type *dest, + const T *src, + int width, + int height, + int pitch) +{ + using pixel_type = T; + static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1; + + const int half_kw = WINDOW_WIDTH / 2; + const int half_kh = WINDOW_HEIGHT / 2; + + __shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE]; + + const int tid = threadIdx.x; + const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw; + const int y0 = blockIdx.y * LINES_PER_BLOCK; + + for(int i = 0; i < WINDOW_HEIGHT; ++i){ + const int x = x0 + tid, y = y0 - half_kh + i; + pixel_type value = 0; + if(0 <= x && x < width && 0 <= y && y < height){ + value = src[x + y * pitch]; + } + smem_lines[i][tid] = value; + } + __syncthreads(); + +#pragma unroll + for(int i = 0; i < LINES_PER_BLOCK; ++i){ + if(i + 1 < LINES_PER_BLOCK){ + // Load to smem + const int x = x0 + tid, y = y0 + half_kh + i + 1; + pixel_type value = 0; + if(0 <= x && x < width && 0 <= y && y < height){ + value = src[x + y * pitch]; + } + const int smem_x = tid; + const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE; + smem_lines[smem_y][smem_x] = value; + } + + if(half_kw <= tid && tid < BLOCK_SIZE - half_kw){ + // Compute and store + const int x = x0 + tid, y = y0 + i; + if(half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh){ + const int smem_x = tid; + const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE; + feature_type f = 0; + for(int dy = -half_kh; dy < 0; ++dy){ + const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE; + const int smem_y2 = (smem_y - dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE; + for(int dx = -half_kw; dx <= half_kw; ++dx){ + const int smem_x1 = smem_x + dx; + const int smem_x2 = smem_x - dx; + const auto a = smem_lines[smem_y1][smem_x1]; + const auto b = smem_lines[smem_y2][smem_x2]; + f = (f << 1) | (a > b); + } + } + for(int dx = -half_kw; dx < 0; ++dx){ + const int smem_x1 = smem_x + dx; + const int smem_x2 = smem_x - dx; + const auto a = smem_lines[smem_y][smem_x1]; + const auto b = smem_lines[smem_y][smem_x2]; + f = (f << 1) | (a > b); + } + dest[x + y * width] = f; + } + } + __syncthreads(); + } +} + +template <typename T> +void enqueue_census_transform( + feature_type *dest, + const T *src, + int width, + int height, + int pitch, + cudaStream_t stream) +{ + const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1; + const int height_per_block = LINES_PER_BLOCK; + const dim3 gdim( + (width + width_per_block - 1) / width_per_block, + (height + height_per_block - 1) / height_per_block); + const dim3 bdim(BLOCK_SIZE); + census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch); +} + +} + + +template <typename T> +CensusTransform<T>::CensusTransform() + : m_feature_buffer() +{ } + +template <typename T> +void CensusTransform<T>::enqueue( + const input_type *src, + int width, + int height, + int pitch, + cudaStream_t stream) +{ + if(m_feature_buffer.size() < static_cast<size_t>(width * height)){ + m_feature_buffer = DeviceBuffer<feature_type>(width * height); + } + enqueue_census_transform( + m_feature_buffer.data(), src, width, height, pitch, stream); +} + +template class CensusTransform<uint8_t>; +template class 
CensusTransform<uint16_t>; + +} diff --git a/lib/libsgm/src/census_transform.hpp b/lib/libsgm/src/census_transform.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8a80b903b4b20152c0c8fad7fada8032d60a1565 --- /dev/null +++ b/lib/libsgm/src/census_transform.hpp @@ -0,0 +1,52 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_CENSUS_TRANSFORM_HPP +#define SGM_CENSUS_TRANSFORM_HPP + +#include "device_buffer.hpp" +#include "types.hpp" + +namespace sgm { + +template <typename T> +class CensusTransform { + +public: + using input_type = T; + +private: + DeviceBuffer<feature_type> m_feature_buffer; + +public: + CensusTransform(); + + const feature_type *get_output() const { + return m_feature_buffer.data(); + } + + void enqueue( + const input_type *src, + int width, + int height, + int pitch, + cudaStream_t stream); + +}; + +} + +#endif diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu new file mode 100644 index 0000000000000000000000000000000000000000..f0a58247b56d6062ae47bedbf5c4e7554c73c82e --- /dev/null +++ b/lib/libsgm/src/check_consistency.cu @@ -0,0 +1,77 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
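The kernel above implements a symmetric census transform over a 9x7 window: every feature bit is the comparison of a pixel pair mirrored about the centre, giving 3*9 + 4 = 31 bits per pixel, which fits the 32-bit feature_type the rest of the pipeline appears to assume (types.hpp is not part of this hunk). As a reading aid, the sketch below reproduces the same bit order on the CPU without the shared-memory staging; zero-filling the border is an illustrative choice here, since the kernel simply never writes those pixels.

    #include <algorithm>
    #include <cstdint>

    // Unoptimised reference for census_transform_kernel: same window,
    // same symmetric pair ordering, same bit packing.
    template <typename T>
    void census_transform_reference(const T* src, uint32_t* dst,
                                    int width, int height, int pitch)
    {
        const int half_kw = 9 / 2;   // WINDOW_WIDTH  = 9
        const int half_kh = 7 / 2;   // WINDOW_HEIGHT = 7
        std::fill(dst, dst + width * height, 0u);
        for (int y = half_kh; y < height - half_kh; ++y) {
            for (int x = half_kw; x < width - half_kw; ++x) {
                uint32_t f = 0;
                // rows above the centre, paired with their mirror below: 3 * 9 bits
                for (int dy = -half_kh; dy < 0; ++dy)
                    for (int dx = -half_kw; dx <= half_kw; ++dx) {
                        const T a = src[(y + dy) * pitch + (x + dx)];
                        const T b = src[(y - dy) * pitch + (x - dx)];
                        f = (f << 1) | (a > b);
                    }
                // centre row, left half paired with right half: 4 bits
                for (int dx = -half_kw; dx < 0; ++dx) {
                    const T a = src[y * pitch + (x + dx)];
                    const T b = src[y * pitch + (x - dx)];
                    f = (f << 1) | (a > b);
                }
                dst[y * width + x] = f;   // output is tightly packed, as in the kernel
            }
        }
    }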
+*/ + +#include <libsgm.h> +#include "internal.h" + +namespace { + template<typename SRC_T, typename DST_T> + __global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel) { + + const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int i = blockIdx.y * blockDim.y + threadIdx.y; + + // left-right consistency check, only on leftDisp, but could be done for rightDisp too + + uint8_t mask = d_mask[i * src_pitch + j]; + int d = d_leftDisp[i * dst_pitch + j]; + if (subpixel) { + d >>= sgm::StereoSGM::SUBPIXEL_SHIFT; + } + int k = j - d; + if (mask != 0 || d <= 0 || (k >= 0 && k < width)) { + int diff = abs(d_rightDisp[i * dst_pitch + k] - d); + if (mask != 0 || diff > 1) { + // masked or left-right inconsistent pixel -> invalid + d_leftDisp[i * dst_pitch + j] = (256 << (sgm::StereoSGM::SUBPIXEL_SHIFT+1)); + } + } + } +} + +namespace sgm { + namespace details { + + void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) { + + const dim3 blocks(width / 16, height / 16); + const dim3 threads(16, 16); + if (depth_bits == 16) { + check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + } + else if (depth_bits == 8) { + check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + } + + CudaKernelCheck(); + } + + void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) { + + const dim3 blocks(width / 16, height / 16); + const dim3 threads(16, 16); + if (depth_bits == 16) { + check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + } + else if (depth_bits == 8) { + check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel); + } + + CudaKernelCheck(); + } + + } +} diff --git a/lib/libsgm/src/cuda_utils.cu b/lib/libsgm/src/cuda_utils.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c6ddf1b8b3e915cfcf4aa1ee15a09fdc41344cc --- /dev/null +++ b/lib/libsgm/src/cuda_utils.cu @@ -0,0 +1,55 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
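check_consistency_kernel above enforces the usual left-right rule: a left disparity d at column x survives only when the pixel is unmasked, d is positive, and the right disparity at column x - d agrees to within one disparity step; everything else is overwritten with a large invalid marker. The scalar sketch below states that rule without the pitch bookkeeping, the subpixel pre-shift, or the exact sentinel value, so treat it as the intended semantics rather than a drop-in replacement.

    #include <cstdint>
    #include <cstdlib>

    // Reference left-right consistency check on a tightly packed disparity map.
    template <typename disp_t>
    void check_consistency_reference(disp_t* left_disp, const disp_t* right_disp,
                                     const uint8_t* mask, int width, int height,
                                     disp_t invalid_value)
    {
        for (int y = 0; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                const int d = static_cast<int>(left_disp[y * width + x]);
                const int k = x - d;   // corresponding column in the right image
                bool bad = mask[y * width + x] != 0 || d <= 0;
                if (!bad && k >= 0 && k < width)
                    bad = std::abs(static_cast<int>(right_disp[y * width + k]) - d) > 1;
                if (bad)
                    left_disp[y * width + x] = invalid_value;   // masked or inconsistent
            }
        }
    }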
+*/ + +#include "internal.h" + +namespace { + + __global__ void cast_16bit_8bit_array_kernel(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + arr8bits[i] = (uint8_t)arr16bits[i]; + } + + __global__ void cast_8bit_16bit_array_kernel(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + arr16bits[i] = (uint16_t)arr8bits[i]; + } + +} + +namespace sgm { + namespace details { + + void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements) { + for (int mod = 1024; mod != 0; mod >>= 1) { + if (num_elements % mod == 0) { + cast_16bit_8bit_array_kernel << <num_elements / mod, mod >> >(arr16bits, arr8bits, num_elements); + break; + } + } + } + + void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements) { + for (int mod = 1024; mod != 0; mod >>= 1) { + if (num_elements % mod == 0) { + cast_8bit_16bit_array_kernel << <num_elements / mod, mod >> >(arr8bits, arr16bits, num_elements); + break; + } + } + } + + } +} diff --git a/lib/libsgm/src/device_buffer.hpp b/lib/libsgm/src/device_buffer.hpp new file mode 100644 index 0000000000000000000000000000000000000000..82b4829d1cf6ec4eead42837759d2830e286753d --- /dev/null +++ b/lib/libsgm/src/device_buffer.hpp @@ -0,0 +1,90 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_DEVICE_BUFFER_HPP +#define SGM_DEVICE_BUFFER_HPP + +#include <cstddef> + +namespace sgm { + +template <typename T> +class DeviceBuffer { + +public: + using value_type = T; + +private: + T *m_data; + size_t m_size; + +public: + DeviceBuffer() + : m_data(nullptr) + , m_size(0) + { } + + explicit DeviceBuffer(size_t n) + : m_data(nullptr) + , m_size(n) + { + cudaMalloc(reinterpret_cast<void **>(&m_data), sizeof(T) * n); + } + + DeviceBuffer(const DeviceBuffer&) = delete; + + DeviceBuffer(DeviceBuffer&& obj) + : m_data(obj.m_data) + , m_size(obj.m_size) + { + obj.m_data = nullptr; + obj.m_size = 0; + } + + ~DeviceBuffer(){ + cudaFree(m_data); + } + + + DeviceBuffer& operator=(const DeviceBuffer&) = delete; + + DeviceBuffer& operator=(DeviceBuffer&& obj){ + if (m_data != nullptr) cudaFree(m_data); + m_data = obj.m_data; + m_size = obj.m_size; + obj.m_data = nullptr; + obj.m_size = 0; + return *this; + } + + + size_t size() const { + return m_size; + } + + const value_type *data() const { + return m_data; + } + + value_type *data(){ + return m_data; + } + +}; + +} + +#endif diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu new file mode 100644 index 0000000000000000000000000000000000000000..2a9f1a87f7e95e0212c1dd7d37f7f4e79edc2440 --- /dev/null +++ b/lib/libsgm/src/horizontal_path_aggregation.cu @@ -0,0 +1,262 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
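DeviceBuffer<T>, introduced just above, is a small move-only RAII wrapper around cudaMalloc/cudaFree that is sized in elements rather than bytes and performs no error checking of its own. A short usage sketch (not part of the patch; the uint32_t element type and the cudaMemsetAsync call are only illustrative):

    #include <cstdint>
    #include <cuda_runtime.h>
    #include "device_buffer.hpp"

    // Grow-on-demand reuse, the same pattern CensusTransform::enqueue applies
    // to m_feature_buffer: move-assigning a larger buffer frees the old one.
    void prepare_features(sgm::DeviceBuffer<uint32_t>& features,
                          int width, int height, cudaStream_t stream)
    {
        const size_t required = static_cast<size_t>(width) * height;
        if (features.size() < required)
            features = sgm::DeviceBuffer<uint32_t>(required);

        // data() exposes the raw device pointer for kernels and async API calls.
        cudaMemsetAsync(features.data(), 0, required * sizeof(uint32_t), stream);
    }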
+You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <cstdio> +#include "horizontal_path_aggregation.hpp" +#include "path_aggregation_common.hpp" + +namespace sgm { +namespace path_aggregation { + +static constexpr unsigned int DP_BLOCK_SIZE = 8u; +static constexpr unsigned int DP_BLOCKS_PER_THREAD = 1u; + +static constexpr unsigned int WARPS_PER_BLOCK = 4u; +static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK; + + +template <int DIRECTION, unsigned int MAX_DISPARITY> +__global__ void aggregate_horizontal_path_kernel( + uint8_t *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; + static const unsigned int PATHS_PER_WARP = + WARP_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE; + static const unsigned int PATHS_PER_BLOCK = + BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE; + + static_assert(DIRECTION == 1 || DIRECTION == -1, ""); + if(width == 0 || height == 0){ + return; + } + + feature_type right_buffer[DP_BLOCKS_PER_THREAD][DP_BLOCK_SIZE]; + DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp[DP_BLOCKS_PER_THREAD]; + + const unsigned int warp_id = threadIdx.x / WARP_SIZE; + const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE; + const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE; + const unsigned int shfl_mask = + ((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE); + + const unsigned int y0 = + PATHS_PER_BLOCK * blockIdx.x + + PATHS_PER_WARP * warp_id + + group_id; + const unsigned int feature_step = SUBGROUPS_PER_WARP * width; + const unsigned int dest_step = SUBGROUPS_PER_WARP * MAX_DISPARITY * width; + const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE; + left += y0 * width; + right += y0 * width; + dest += y0 * MAX_DISPARITY * width; + + if(y0 >= height){ + return; + } + + if(DIRECTION > 0){ + for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){ + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + right_buffer[i][j] = 0; + } + } + }else{ + for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){ + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + const int x = static_cast<int>(width - (j + dp_offset)); + if(0 <= x && x < static_cast<int>(width)){ + right_buffer[i][j] = __ldg(&right[i * feature_step + x]); + }else{ + right_buffer[i][j] = 0; + } + } + } + } + + int x0 = (DIRECTION > 0) ? 0 : static_cast<int>((width - 1) & ~(DP_BLOCK_SIZE - 1)); + for(unsigned int iter = 0; iter < width; iter += DP_BLOCK_SIZE){ + for(unsigned int i = 0; i < DP_BLOCK_SIZE; ++i){ + const unsigned int x = x0 + (DIRECTION > 0 ? 
i : (DP_BLOCK_SIZE - 1 - i)); + if(x >= width){ + continue; + } + for(unsigned int j = 0; j < DP_BLOCKS_PER_THREAD; ++j){ + const unsigned int y = y0 + j * SUBGROUPS_PER_WARP; + if(y >= height){ + continue; + } + const feature_type left_value = __ldg(&left[j * feature_step + x]); + if(DIRECTION > 0){ + const feature_type t = right_buffer[j][DP_BLOCK_SIZE - 1]; + for(unsigned int k = DP_BLOCK_SIZE - 1; k > 0; --k){ + right_buffer[j][k] = right_buffer[j][k - 1]; + } +#if CUDA_VERSION >= 9000 + right_buffer[j][0] = __shfl_up_sync(shfl_mask, t, 1, SUBGROUP_SIZE); +#else + right_buffer[j][0] = __shfl_up(t, 1, SUBGROUP_SIZE); +#endif + if(lane_id == 0){ + right_buffer[j][0] = + __ldg(&right[j * feature_step + x - dp_offset]); + } + }else{ + const feature_type t = right_buffer[j][0]; + for(unsigned int k = 1; k < DP_BLOCK_SIZE; ++k){ + right_buffer[j][k - 1] = right_buffer[j][k]; + } +#if CUDA_VERSION >= 9000 + right_buffer[j][DP_BLOCK_SIZE - 1] = + __shfl_down_sync(shfl_mask, t, 1, SUBGROUP_SIZE); +#else + right_buffer[j][DP_BLOCK_SIZE - 1] = __shfl_down(t, 1, SUBGROUP_SIZE); +#endif + if(lane_id + 1 == SUBGROUP_SIZE){ + if(x >= dp_offset + DP_BLOCK_SIZE - 1){ + right_buffer[j][DP_BLOCK_SIZE - 1] = + __ldg(&right[j * feature_step + x - (dp_offset + DP_BLOCK_SIZE - 1)]); + }else{ + right_buffer[j][DP_BLOCK_SIZE - 1] = 0; + } + } + } + uint32_t local_costs[DP_BLOCK_SIZE]; + for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){ + local_costs[k] = __popc(left_value ^ right_buffer[j][k]); + } + dp[j].update(local_costs, p1, p2, shfl_mask); + store_uint8_vector<DP_BLOCK_SIZE>( + &dest[j * dest_step + x * MAX_DISPARITY + dp_offset], + dp[j].dp); + } + } + x0 += static_cast<int>(DP_BLOCK_SIZE) * DIRECTION; + } +} + + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_left2right_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = + BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE; + + const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_right2left_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = + BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE; + + const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + + +template void enqueue_aggregate_left2right_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_left2right_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_left2right_path<256u>( + cost_type 
*dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_right2left_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_right2left_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_right2left_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ce825e5fa1c56d9daa904285677f1257b9b172d0 --- /dev/null +++ b/lib/libsgm/src/horizontal_path_aggregation.hpp @@ -0,0 +1,50 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_HORIZONTAL_PATH_AGGREGATION_HPP +#define SGM_HORIZONTAL_PATH_AGGREGATION_HPP + +#include "types.hpp" + +namespace sgm { +namespace path_aggregation { + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_left2right_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_right2left_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} + +#endif diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h new file mode 100644 index 0000000000000000000000000000000000000000..577101728e33b528a9def320d21f5785b4319ac2 --- /dev/null +++ b/lib/libsgm/src/internal.h @@ -0,0 +1,54 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
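The horizontal kernels above organise the work so that each warp sub-group holds a DP_BLOCK_SIZE-wide slice of the disparity axis in registers and exchanges boundary values with shuffles; the actual cost recurrence lives in the DynamicProgramming helper from path_aggregation_common.hpp, which is not part of this hunk. As a reading aid, this is the classic SGM per-step update those kernels are built around, written out in scalar form:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // One step of SGM path aggregation along a single direction r:
    //   L(p, d) = C(p, d) + min( L(p-r, d),
    //                            L(p-r, d-1) + p1, L(p-r, d+1) + p1,
    //                            min_k L(p-r, k) + p2 ) - min_k L(p-r, k)
    void sgm_path_step(const std::vector<uint32_t>& local_cost,  // C(p, d), census Hamming distances
                       const std::vector<uint32_t>& prev,        // L(p - r, d)
                       std::vector<uint32_t>& curr,              // L(p, d), output
                       uint32_t p1, uint32_t p2)
    {
        const uint32_t prev_min = *std::min_element(prev.begin(), prev.end());
        curr.resize(local_cost.size());
        for (std::size_t d = 0; d < local_cost.size(); ++d) {
            uint32_t best = prev[d];                                         // keep disparity
            if (d > 0)               best = std::min(best, prev[d - 1] + p1);  // small change
            if (d + 1 < prev.size()) best = std::min(best, prev[d + 1] + p1);
            best = std::min(best, prev_min + p2);                            // large jump
            curr[d] = local_cost[d] + best - prev_min;                       // normalise
        }
    }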
+*/ + +#pragma once + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> + +#include <stdexcept> + +#include <cuda_runtime.h> +#include <cuda_runtime_api.h> + +#define CudaSafeCall(error) sgm::details::cuda_safe_call(error, __FILE__, __LINE__) + +#define CudaKernelCheck() CudaSafeCall(cudaGetLastError()) + +namespace sgm { + namespace details { + + void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch); + void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch); + + void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel); + void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel); + + void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements); + void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements); + + inline void cuda_safe_call(cudaError error, const char *file, const int line) + { + if (error != cudaSuccess) { + fprintf(stderr, "cuda error %s : %d %s\n", file, line, cudaGetErrorString(error)); + exit(-1); + } + } + + } +} diff --git a/lib/libsgm/src/libsgm_wrapper.cpp b/lib/libsgm/src/libsgm_wrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b78f3c6ad5c5bac4619690eab3f366ff81214de7 --- /dev/null +++ b/lib/libsgm/src/libsgm_wrapper.cpp @@ -0,0 +1,119 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
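internal.h centralises the error-handling convention for the .cu files that follow: every runtime API call goes through CudaSafeCall, and kernel launches are followed by CudaKernelCheck(), which forwards cudaGetLastError() to the same fail-fast handler (it prints and exits rather than throwing). A small sketch of the intended pattern; the copy and the commented-out launch are placeholders, not code from this patch:

    #include <cstddef>
    #include "internal.h"

    void copy_input_checked(void* d_dst, const void* h_src, std::size_t bytes)
    {
        CudaSafeCall(cudaMemcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice));

        // some_kernel<<<grid, block>>>(...);   // hypothetical launch
        CudaKernelCheck();                      // reports the launch error, if any
    }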
+*/ + +#include <libsgm_wrapper.h> + +namespace sgm { + LibSGMWrapper::LibSGMWrapper(int numDisparity, int P1, int P2, float uniquenessRatio, bool subpixel) + : sgm_(nullptr), numDisparity_(numDisparity), param_(P1, P2, uniquenessRatio, subpixel), prev_(nullptr) {} + LibSGMWrapper::~LibSGMWrapper() = default; + + int LibSGMWrapper::getNumDisparities() const { return numDisparity_; } + float LibSGMWrapper::getUniquenessRatio() const { return param_.uniqueness; } + int LibSGMWrapper::getP1() const { return param_.P1; } + int LibSGMWrapper::getP2() const { return param_.P2; } + bool LibSGMWrapper::hasSubpixel() const { return param_.subpixel; } + + struct LibSGMWrapper::Creator { + int width; + int height; + int src_pitch; + int dst_pitch; + int input_depth_bits; + int output_depth_bits; + sgm::EXECUTE_INOUT inout_type; + + bool operator==(const Creator& rhs) const { + return + width == rhs.width + && height == rhs.height + && src_pitch == rhs.src_pitch + && dst_pitch == rhs.dst_pitch + && input_depth_bits == rhs.input_depth_bits + && output_depth_bits == rhs.output_depth_bits + && inout_type == rhs.inout_type; + } + bool operator!=(const Creator& rhs) const { + return !(*this == rhs); + } + + StereoSGM* createStereoSGM(int disparity_size, const StereoSGM::Parameters& param) { + return new StereoSGM(width, height, disparity_size, input_depth_bits, output_depth_bits, src_pitch, dst_pitch, inout_type, param); + } + +#ifdef BUILD_OPENCV_WRAPPER + Creator(const cv::cuda::GpuMat& src, const cv::cuda::GpuMat& dst) { + const int depth = src.depth(); + CV_Assert(depth == CV_8U || depth == CV_16U); + width = src.cols; + height = src.rows; + src_pitch = static_cast<int>(src.step1()); + dst_pitch = static_cast<int>(dst.step1()); + input_depth_bits = static_cast<int>(src.elemSize1()) * 8; + output_depth_bits = static_cast<int>(dst.elemSize1()) * 8; + inout_type = sgm::EXECUTE_INOUT_CUDA2CUDA; + } + Creator(const cv::Mat& src, const cv::Mat& dst) { + const int depth = src.depth(); + CV_Assert(depth == CV_8U || depth == CV_16U); + width = src.cols; + height = src.rows; + src_pitch = static_cast<int>(src.step1()); + dst_pitch = static_cast<int>(dst.step1()); + input_depth_bits = static_cast<int>(src.elemSize1()) * 8; + output_depth_bits = static_cast<int>(dst.elemSize1()) * 8; + inout_type = sgm::EXECUTE_INOUT_HOST2HOST; + } +#endif // BUILD_OPRENCV_WRAPPER + }; + +#ifdef BUILD_OPENCV_WRAPPER + void LibSGMWrapper::execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity) { + const cv::Size size = I1.size(); + CV_Assert(size == I2.size()); + CV_Assert(I1.type() == I2.type()); + const int depth = I1.depth(); + CV_Assert(depth == CV_8U || depth == CV_16U); + if (disparity.size() != size || disparity.depth() != CV_16U) { + disparity.create(size, CV_16U); + } + std::unique_ptr<Creator> creator(new Creator(I1, disparity)); + if (!sgm_ || !prev_ || *creator != *prev_) { + sgm_.reset(creator->createStereoSGM(numDisparity_, param_)); + } + prev_ = std::move(creator); + + sgm_->execute(I1.data, I2.data, disparity.data); + } + void LibSGMWrapper::execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity) { + const cv::Size size = I1.size(); + CV_Assert(size == I2.size()); + CV_Assert(I1.type() == I2.type()); + const int depth = I1.depth(); + CV_Assert(depth == CV_8U || depth == CV_16U); + if (disparity.size() != size || disparity.depth() != CV_16U) { + disparity.create(size, CV_16U); + } + std::unique_ptr<Creator> creator(new Creator(I1, disparity)); + if (!sgm_ || !prev_ || 
*creator != *prev_) { + sgm_.reset(creator->createStereoSGM(numDisparity_, param_)); + } + prev_ = std::move(creator); + + sgm_->execute(I1.data, I2.data, disparity.data); + } +#endif // BUILD_OPENCV_WRAPPER +} diff --git a/lib/libsgm/src/median_filter.cu b/lib/libsgm/src/median_filter.cu new file mode 100644 index 0000000000000000000000000000000000000000..fffe24f0422a984ebb919bf188431c7e9994f148 --- /dev/null +++ b/lib/libsgm/src/median_filter.cu @@ -0,0 +1,292 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "internal.h" + +namespace { + + const int BLOCK_X = 16; + const int BLOCK_Y = 16; + const int KSIZE = 3; + const int RADIUS = KSIZE / 2; + const int KSIZE_SQ = KSIZE * KSIZE; + + inline int divup(int total, int grain) + { + return (total + grain - 1) / grain; + } + + template <typename T> + __device__ inline void swap(T& x, T& y) + { + T tmp(x); + x = y; + y = tmp; + } + + // sort, min, max of 1 element + template <typename T, int V = 1> __device__ inline void dev_sort(T& x, T& y) { if (x > y) swap(x, y); } + template <typename T, int V = 1> __device__ inline void dev_min(T& x, T& y) { x = min(x, y); } + template <typename T, int V = 1> __device__ inline void dev_max(T& x, T& y) { y = max(x, y); } + + // sort, min, max of 2 elements + __device__ inline void dev_sort_2(uint32_t& x, uint32_t& y) + { + const uint32_t mask = __vcmpgtu2(x, y); + const uint32_t tmp = (x ^ y) & mask; + x ^= tmp; + y ^= tmp; + } + __device__ inline void dev_min_2(uint32_t& x, uint32_t& y) { x = __vminu2(x, y); } + __device__ inline void dev_max_2(uint32_t& x, uint32_t& y) { y = __vmaxu2(x, y); } + + template <> __device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_sort_2(x, y); } + template <> __device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_min_2(x, y); } + template <> __device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_max_2(x, y); } + + // sort, min, max of 4 elements + __device__ inline void dev_sort_4(uint32_t& x, uint32_t& y) + { + const uint32_t mask = __vcmpgtu4(x, y); + const uint32_t tmp = (x ^ y) & mask; + x ^= tmp; + y ^= tmp; + } + __device__ inline void dev_min_4(uint32_t& x, uint32_t& y) { x = __vminu4(x, y); } + __device__ inline void dev_max_4(uint32_t& x, uint32_t& y) { y = __vmaxu4(x, y); } + + template <> __device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_sort_4(x, y); } + template <> __device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_min_4(x, y); } + template <> __device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_max_4(x, y); } + + template <typename T, int V = 1> + __device__ inline void median_selection_network_9(T* buf) + { +#define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j]) +#define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j]) +#define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j]) + + SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7); + SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8); + SWAP_OP(0, 1); SWAP_OP(3, 4); 
SWAP_OP(6, 7); + MAX_OP(0, 3); MAX_OP(3, 6); + SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4); + MIN_OP(5, 8); MIN_OP(2, 5); + SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4); + +#undef SWAP_OP +#undef MIN_OP +#undef MAX_OP + } + + __global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS) + return; + + uint8_t buf[KSIZE_SQ]; + for (int i = 0; i < KSIZE_SQ; i++) + buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf); + + dst[y * p + x] = buf[KSIZE_SQ / 2]; + } + + __global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS) + return; + + uint16_t buf[KSIZE_SQ]; + for (int i = 0; i < KSIZE_SQ; i++) + buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf); + + dst[y * p + x] = buf[KSIZE_SQ / 2]; + } + + __global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch) + { + const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (y < RADIUS || y >= h - RADIUS) + return; + + uint32_t buf[KSIZE_SQ]; + if (x_4 >= 4 && x_4 + 7 < w) + { + buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]); + buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]); + buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]); + + buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]); + buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]); + buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]); + + buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]); + buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]); + buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]); + + buf[0] = (buf[1] << 8) | (buf[0] >> 24); + buf[2] = (buf[1] >> 8) | (buf[2] << 24); + + buf[3] = (buf[4] << 8) | (buf[3] >> 24); + buf[5] = (buf[4] >> 8) | (buf[5] << 24); + + buf[6] = (buf[7] << 8) | (buf[6] >> 24); + buf[8] = (buf[7] >> 8) | (buf[8] << 24); + + median_selection_network_9<uint32_t, 4>(buf); + + *((uint32_t*)&dst[y * pitch + x_4]) = buf[KSIZE_SQ / 2]; + } + else if (x_4 == 0) + { + for (int x = RADIUS; x < 4; x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + else if (x_4 < w) + { + for (int x = x_4; x < min(x_4 + 4, w - RADIUS); x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + } + + __global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch) + { + const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x); + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (y < RADIUS || y >= h - RADIUS) + return; + + uint32_t buf[KSIZE_SQ]; + if (x_2 >= 2 && x_2 + 3 < w) + { + buf[0] = *((const 
uint32_t*)&src[(y - 1) * pitch + x_2 - 2]); + buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]); + buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]); + + buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]); + buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]); + buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]); + + buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]); + buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]); + buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]); + + buf[0] = (buf[1] << 16) | (buf[0] >> 16); + buf[2] = (buf[1] >> 16) | (buf[2] << 16); + + buf[3] = (buf[4] << 16) | (buf[3] >> 16); + buf[5] = (buf[4] >> 16) | (buf[5] << 16); + + buf[6] = (buf[7] << 16) | (buf[6] >> 16); + buf[8] = (buf[7] >> 16) | (buf[8] << 16); + + median_selection_network_9<uint32_t, 2>(buf); + + *((uint32_t*)&dst[y * pitch + x_2]) = buf[KSIZE_SQ / 2]; + } + else if (x_2 == 0) + { + for (int x = RADIUS; x < 2; x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + else if (x_2 < w) + { + for (int x = x_2; x < min(x_2 + 2, w - RADIUS); x++) + { + uint8_t* buf_u8 = (uint8_t*)buf; + for (int i = 0; i < KSIZE_SQ; i++) + buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)]; + + median_selection_network_9(buf_u8); + + dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2]; + } + } + } +} + +namespace sgm { + namespace details { + + void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch) { + + if (pitch % 4 == 0) { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width / 4, block.x), divup(height, block.y)); + median_kernel_3x3_8u_v4<<<grid, block>>>(d_src, d_dst, width, height, pitch); + } + else { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width, block.x), divup(height, block.y)); + median_kernel_3x3_8u<<<grid, block>>>(d_src, d_dst, width, height, pitch); + } + + CudaSafeCall(cudaGetLastError()); + } + + void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch) { + + if (pitch % 2 == 0) { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width / 2, block.x), divup(height, block.y)); + median_kernel_3x3_16u_v2<<<grid, block>>>(d_src, d_dst, width, height, pitch); + } + else { + const dim3 block(BLOCK_X, BLOCK_Y); + const dim3 grid(divup(width, block.x), divup(height, block.y)); + median_kernel_3x3_16u<<<grid, block>>>(d_src, d_dst, width, height, pitch); + } + + CudaSafeCall(cudaGetLastError()); + } + + } +} diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu new file mode 100644 index 0000000000000000000000000000000000000000..21785c947e4e0ed91273eecb92cfb5ffa8bb767b --- /dev/null +++ b/lib/libsgm/src/oblique_path_aggregation.cu @@ -0,0 +1,319 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
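The kernels above compute a 3x3 median with a fixed compare/exchange ("selection") network so the same nine steps can also be applied to two 16-bit or four 8-bit pixels packed into one 32-bit word via the __vcmp/__vmin/__vmax intrinsics. For an interior pixel the result is simply the middle of the nine neighbours, as in this unvectorised reference (border handling and the packed fast paths are left out):

    #include <algorithm>

    // Reference 3x3 median for one interior pixel (1 <= x < w-1, 1 <= y < h-1).
    template <typename T>
    T median3x3_reference(const T* src, int x, int y, int pitch)
    {
        T window[9];
        int n = 0;
        for (int dy = -1; dy <= 1; ++dy)
            for (int dx = -1; dx <= 1; ++dx)
                window[n++] = src[(y + dy) * pitch + (x + dx)];
        std::nth_element(window, window + 4, window + 9);  // middle of nine values
        return window[4];
    }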
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <cstdio> +#include "oblique_path_aggregation.hpp" +#include "path_aggregation_common.hpp" + +namespace sgm { +namespace path_aggregation { + +static constexpr unsigned int DP_BLOCK_SIZE = 16u; +static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u; + +template <int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY> +__global__ void aggregate_oblique_path_kernel( + uint8_t *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK; + static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE; + + static_assert(X_DIRECTION == 1 || X_DIRECTION == -1, ""); + static_assert(Y_DIRECTION == 1 || Y_DIRECTION == -1, ""); + if(width == 0 || height == 0){ + return; + } + + __shared__ feature_type right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS]; + DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp; + + const unsigned int warp_id = threadIdx.x / WARP_SIZE; + const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE; + const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE; + const unsigned int shfl_mask = + ((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE); + + const int x0 = + blockIdx.x * PATHS_PER_BLOCK + + warp_id * PATHS_PER_WARP + + group_id + + (X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0); + const int right_x00 = + blockIdx.x * PATHS_PER_BLOCK + + (X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0); + const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE; + + const unsigned int right0_addr = + static_cast<unsigned int>(right_x00 + PATHS_PER_BLOCK - 1 - x0) + dp_offset; + const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE; + const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE; + + for(unsigned int iter = 0; iter < height; ++iter){ + const int y = static_cast<int>(Y_DIRECTION > 0 ? 
iter : height - 1 - iter); + const int x = x0 + static_cast<int>(iter) * X_DIRECTION; + const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION; + // Load right to smem + for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){ + const unsigned int i = i0 + threadIdx.x; + if(i < RIGHT_BUFFER_SIZE){ + const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i); + feature_type right_value = 0; + if(0 <= x && x < static_cast<int>(width)){ + right_value = right[x + y * width]; + } + const unsigned int lo = i % DP_BLOCK_SIZE; + const unsigned int hi = i / DP_BLOCK_SIZE; + right_buffer[lo][hi] = right_value; + if(hi > 0){ + right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value; + } + } + } + __syncthreads(); + // Compute + if(0 <= x && x < static_cast<int>(width)){ + const feature_type left_value = __ldg(&left[x + y * width]); + feature_type right_values[DP_BLOCK_SIZE]; + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi]; + } + uint32_t local_costs[DP_BLOCK_SIZE]; + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + local_costs[j] = __popc(left_value ^ right_values[j]); + } + dp.update(local_costs, p1, p2, shfl_mask); + store_uint8_vector<DP_BLOCK_SIZE>( + &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], + dp.dp); + } + __syncthreads(); + } +} + + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_upleft2downright_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_upright2downleft_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_downright2upleft_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_downleft2upright_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t 
stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + + +template void enqueue_aggregate_upleft2downright_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_upleft2downright_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_upleft2downright_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_upright2downleft_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_upright2downleft_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_upright2downleft_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downright2upleft_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downright2upleft_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downright2upleft_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downleft2upright_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downleft2upright_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_downleft2upright_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a4f949b1e3f95fbccd42184f3e27d354ebe4ac58 --- /dev/null +++ b/lib/libsgm/src/oblique_path_aggregation.hpp @@ -0,0 +1,72 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the 
"License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_OBLIQUE_PATH_AGGREGATION_HPP +#define SGM_OBLIQUE_PATH_AGGREGATION_HPP + +#include "types.hpp" + +namespace sgm { +namespace path_aggregation { + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_upleft2downright_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_upright2downleft_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_downright2upleft_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_downleft2upright_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} + +#endif diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu new file mode 100644 index 0000000000000000000000000000000000000000..117cac507b1dded2c42415f968cec9e38d7674fe --- /dev/null +++ b/lib/libsgm/src/path_aggregation.cu @@ -0,0 +1,94 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "path_aggregation.hpp" +#include "vertical_path_aggregation.hpp" +#include "horizontal_path_aggregation.hpp" +#include "oblique_path_aggregation.hpp" + +namespace sgm { + +template <size_t MAX_DISPARITY> +PathAggregation<MAX_DISPARITY>::PathAggregation() + : m_cost_buffer() +{ + for(unsigned int i = 0; i < NUM_PATHS; ++i){ + cudaStreamCreate(&m_streams[i]); + cudaEventCreate(&m_events[i]); + } +} + +template <size_t MAX_DISPARITY> +PathAggregation<MAX_DISPARITY>::~PathAggregation(){ + for(unsigned int i = 0; i < NUM_PATHS; ++i){ + cudaStreamSynchronize(m_streams[i]); + cudaStreamDestroy(m_streams[i]); + cudaEventDestroy(m_events[i]); + } +} + +template <size_t MAX_DISPARITY> +void PathAggregation<MAX_DISPARITY>::enqueue( + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS; + if(m_cost_buffer.size() < buffer_size){ + m_cost_buffer = DeviceBuffer<cost_type>(buffer_size); + } + const size_t buffer_step = width * height * MAX_DISPARITY; + cudaStreamSynchronize(stream); + path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>( + m_cost_buffer.data() + 0 * buffer_step, + left, right, width, height, p1, p2, m_streams[0]); + path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>( + m_cost_buffer.data() + 1 * buffer_step, + left, right, width, height, p1, p2, m_streams[1]); + path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>( + m_cost_buffer.data() + 2 * buffer_step, + left, right, width, height, p1, p2, m_streams[2]); + path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>( + m_cost_buffer.data() + 3 * buffer_step, + left, right, width, height, p1, p2, m_streams[3]); + path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>( + m_cost_buffer.data() + 4 * buffer_step, + left, right, width, height, p1, p2, m_streams[4]); + path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>( + m_cost_buffer.data() + 5 * buffer_step, + left, right, width, height, p1, p2, m_streams[5]); + path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>( + m_cost_buffer.data() + 6 * buffer_step, + left, right, width, height, p1, p2, m_streams[6]); + path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>( + m_cost_buffer.data() + 7 * buffer_step, + left, right, width, height, p1, p2, m_streams[7]); + for(unsigned int i = 0; i < NUM_PATHS; ++i){ + cudaEventRecord(m_events[i], m_streams[i]); + cudaStreamWaitEvent(stream, m_events[i], 0); + } +} + + +template class PathAggregation< 64>; +template class PathAggregation<128>; +template class PathAggregation<256>; + +} diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3d2b35d8e9854587025b3d91ce3f96607aad3f82 --- /dev/null +++ b/lib/libsgm/src/path_aggregation.hpp @@ -0,0 +1,56 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_PATH_AGGREGATION_HPP +#define SGM_PATH_AGGREGATION_HPP + +#include "device_buffer.hpp" +#include "types.hpp" + +namespace sgm { + +template <size_t MAX_DISPARITY> +class PathAggregation { + +private: + static const unsigned int NUM_PATHS = 8; + + DeviceBuffer<cost_type> m_cost_buffer; + cudaStream_t m_streams[NUM_PATHS]; + cudaEvent_t m_events[NUM_PATHS]; + +public: + PathAggregation(); + ~PathAggregation(); + + const cost_type *get_output() const { + return m_cost_buffer.data(); + } + + void enqueue( + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +}; + +} + +#endif diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b560e732ff20e5983186e9137b70ab91e76a5f9f --- /dev/null +++ b/lib/libsgm/src/path_aggregation_common.hpp @@ -0,0 +1,98 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_PATH_AGGREGATION_COMMON_HPP +#define SGM_PATH_AGGREGATION_COMMON_HPP + +#include <cstdint> +#include <cuda.h> +#include "utility.hpp" + +namespace sgm { +namespace path_aggregation { + +template < + unsigned int DP_BLOCK_SIZE, + unsigned int SUBGROUP_SIZE> +struct DynamicProgramming { + static_assert( + DP_BLOCK_SIZE >= 2, + "DP_BLOCK_SIZE must be greater than or equal to 2"); + static_assert( + (SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0, + "SUBGROUP_SIZE must be a power of 2"); + + uint32_t last_min; + uint32_t dp[DP_BLOCK_SIZE]; + + __device__ DynamicProgramming() + : last_min(0) + { + for(unsigned int i = 0; i < DP_BLOCK_SIZE; ++i){ dp[i] = 0; } + } + + __device__ void update( + uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask) + { + const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE; + + const auto dp0 = dp[0]; + uint32_t lazy_out = 0, local_min = 0; + { + const unsigned int k = 0; +#if CUDA_VERSION >= 9000 + const uint32_t prev = + __shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1); +#else + const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1); +#endif + uint32_t out = min(dp[k] - last_min, p2); + if(lane_id != 0){ out = min(out, prev - last_min + p1); } + out = min(out, dp[k + 1] - last_min + p1); + lazy_out = local_min = out + local_costs[k]; + } + for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){ + uint32_t out = min(dp[k] - last_min, p2); + out = min(out, dp[k - 1] - last_min + p1); + out = min(out, dp[k + 1] - last_min + p1); + dp[k - 1] = lazy_out; + lazy_out = out + local_costs[k]; + local_min = min(local_min, lazy_out); + } + { + const unsigned int k = DP_BLOCK_SIZE - 1; +#if CUDA_VERSION >= 9000 + const uint32_t next = __shfl_down_sync(mask, dp0, 1); +#else + const uint32_t next = __shfl_down(dp0, 1); +#endif + uint32_t out = min(dp[k] - last_min, p2); + out = min(out, dp[k - 1] - last_min + p1); + 
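+			// This is the usual SGM recurrence: new cost = local cost + min(prev d, prev d-1 + P1, prev d+1 + P1, prev min + P2) - prev min,
+			// with `last_min` caching the previous pixel's minimum over all disparities along this path.
+			// The top entry of this thread's DP block also needs its d+1 neighbour, which lives in dp[0] of the
+			// next lane; `next` was fetched above by shuffling dp0 down by one. The last lane of the subgroup
+			// holds the highest disparities and has no d+1 neighbour, so it skips that term.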
if(lane_id + 1 != SUBGROUP_SIZE){ + out = min(out, next - last_min + p1); + } + dp[k - 1] = lazy_out; + dp[k] = out + local_costs[k]; + local_min = min(local_min, dp[k]); + } + last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask); + } +}; + +} +} + +#endif diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu new file mode 100644 index 0000000000000000000000000000000000000000..d276c3fa9eb3b7dc76297eec694c02c2ad061559 --- /dev/null +++ b/lib/libsgm/src/sgm.cu @@ -0,0 +1,148 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "sgm.hpp" +#include "census_transform.hpp" +#include "path_aggregation.hpp" +#include "winner_takes_all.hpp" + +namespace sgm { + +template <typename T, size_t MAX_DISPARITY> +class SemiGlobalMatching<T, MAX_DISPARITY>::Impl { + +private: + DeviceBuffer<T> m_input_left; + DeviceBuffer<T> m_input_right; + CensusTransform<T> m_census_left; + CensusTransform<T> m_census_right; + PathAggregation<MAX_DISPARITY> m_path_aggregation; + WinnerTakesAll<MAX_DISPARITY> m_winner_takes_all; + +public: + Impl() + : m_input_left() + , m_input_right() + , m_census_left() + , m_census_right() + , m_path_aggregation() + , m_winner_takes_all() + { } + + void enqueue( + output_type *dest_left, + output_type *dest_right, + const input_type *src_left, + const input_type *src_right, + int width, + int height, + int src_pitch, + int dst_pitch, + unsigned int penalty1, + unsigned int penalty2, + float uniqueness, + bool subpixel, + cudaStream_t stream) + { + m_census_left.enqueue( + src_left, width, height, src_pitch, stream); + m_census_right.enqueue( + src_right, width, height, src_pitch, stream); + m_path_aggregation.enqueue( + m_census_left.get_output(), + m_census_right.get_output(), + width, height, + penalty1, penalty2, + stream); + m_winner_takes_all.enqueue( + dest_left, dest_right, + m_path_aggregation.get_output(), + width, height, dst_pitch, uniqueness, subpixel, + stream); + } + +}; + + +template <typename T, size_t MAX_DISPARITY> +SemiGlobalMatching<T, MAX_DISPARITY>::SemiGlobalMatching() + : m_impl(new Impl()) +{ } + +template <typename T, size_t MAX_DISPARITY> +SemiGlobalMatching<T, MAX_DISPARITY>::~SemiGlobalMatching() = default; + + +template <typename T, size_t MAX_DISPARITY> +void SemiGlobalMatching<T, MAX_DISPARITY>::execute( + output_type *dest_left, + output_type *dest_right, + const input_type *src_left, + const input_type *src_right, + int width, + int height, + int src_pitch, + int dst_pitch, + unsigned int penalty1, + unsigned int penalty2, + float uniqueness, + bool subpixel) +{ + m_impl->enqueue( + dest_left, dest_right, + src_left, src_right, + width, height, + src_pitch, dst_pitch, + penalty1, penalty2, + uniqueness, subpixel, + 0); + cudaStreamSynchronize(0); +} + +template <typename T, size_t MAX_DISPARITY> +void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue( + output_type *dest_left, + output_type *dest_right, + const input_type *src_left, + const input_type *src_right, + int width, + int height, + int src_pitch, + int 
dst_pitch, + unsigned int penalty1, + unsigned int penalty2, + float uniqueness, + bool subpixel, + cudaStream_t stream) +{ + m_impl->enqueue( + dest_left, dest_right, + src_left, src_right, + width, height, + src_pitch, dst_pitch, + penalty1, penalty2, + uniqueness, subpixel, + stream); +} + + +template class SemiGlobalMatching<uint8_t, 64>; +template class SemiGlobalMatching<uint8_t, 128>; +template class SemiGlobalMatching<uint8_t, 256>; +template class SemiGlobalMatching<uint16_t, 64>; +template class SemiGlobalMatching<uint16_t, 128>; + +} diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp new file mode 100644 index 0000000000000000000000000000000000000000..afb308a25a1bf4bfc4366160d813e27f815337ab --- /dev/null +++ b/lib/libsgm/src/sgm.hpp @@ -0,0 +1,74 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_SGM_HPP +#define SGM_SGM_HPP + +#include <memory> +#include <cstdint> +#include "types.hpp" + +namespace sgm { + +template <typename T, size_t MAX_DISPARITY> +class SemiGlobalMatching { + +public: + using input_type = T; + using output_type = sgm::output_type; + +private: + class Impl; + std::unique_ptr<Impl> m_impl; + +public: + SemiGlobalMatching(); + ~SemiGlobalMatching(); + + void execute( + output_type *dest_left, + output_type *dest_right, + const input_type *src_left, + const input_type *src_right, + int width, + int height, + int src_pitch, + int dst_pitch, + unsigned int penalty1, + unsigned int penalty2, + float uniqueness, + bool subpixel); + + void enqueue( + output_type *dest_left, + output_type *dest_right, + const input_type *src_left, + const input_type *src_right, + int width, + int height, + int src_pitch, + int dst_pitch, + unsigned int penalty1, + unsigned int penalty2, + float uniqueness, + bool subpixel, + cudaStream_t stream); + +}; + +} + +#endif diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0219091f91448d4befa39dbfe190024d97816a7 --- /dev/null +++ b/lib/libsgm/src/stereo_sgm.cpp @@ -0,0 +1,228 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include <iostream> + +#include <libsgm.h> + +#include "internal.h" +#include "sgm.hpp" + +namespace sgm { + static bool is_cuda_input(EXECUTE_INOUT type) { return (type & 0x1) > 0; } + static bool is_cuda_output(EXECUTE_INOUT type) { return (type & 0x2) > 0; } + + class SemiGlobalMatchingBase { + public: + using output_type = sgm::output_type; + virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, + int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) = 0; + + virtual ~SemiGlobalMatchingBase() {} + }; + + template <typename input_type, int DISP_SIZE> + class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase { + public: + void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, + int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) override + { + sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel); + } + private: + SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_; + }; + + struct CudaStereoSGMResources { + void* d_src_left; + void* d_src_right; + void* d_left_disp; + void* d_right_disp; + void* d_tmp_left_disp; + void* d_tmp_right_disp; + uint8_t* d_mask; + + SemiGlobalMatchingBase* sgm_engine; + + CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, int src_pitch_, int dst_pitch_, EXECUTE_INOUT inout_type_) { + + if (input_depth_bits_ == 8 && disparity_size_ == 64) + sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>(); + else if (input_depth_bits_ == 8 && disparity_size_ == 128) + sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>(); + else if (input_depth_bits_ == 8 && disparity_size_ == 256) + sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 256>(); + else if (input_depth_bits_ == 16 && disparity_size_ == 64) + sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 64>(); + else if (input_depth_bits_ == 16 && disparity_size_ == 128) + sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 128>(); + else + throw std::logic_error("depth bits must be 8 or 16, and disparity size must be 64, 128 or 256"); + + if (is_cuda_input(inout_type_)) { + this->d_src_left = NULL; + this->d_src_right = NULL; + } + else { + CudaSafeCall(cudaMalloc(&this->d_src_left, input_depth_bits_ / 8 * src_pitch_ * height_)); + CudaSafeCall(cudaMalloc(&this->d_src_right, input_depth_bits_ / 8 * src_pitch_ * height_)); + } + + CudaSafeCall(cudaMalloc(&this->d_left_disp, sizeof(uint16_t) * dst_pitch_ * height_)); + CudaSafeCall(cudaMalloc(&this->d_right_disp, sizeof(uint16_t) * dst_pitch_ * height_)); + + CudaSafeCall(cudaMalloc(&this->d_tmp_left_disp, sizeof(uint16_t) * dst_pitch_ * height_)); + CudaSafeCall(cudaMalloc(&this->d_tmp_right_disp, sizeof(uint16_t) * dst_pitch_ * height_)); + + CudaSafeCall(cudaMemset(this->d_left_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_)); + CudaSafeCall(cudaMemset(this->d_right_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_)); + CudaSafeCall(cudaMemset(this->d_tmp_left_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_)); + CudaSafeCall(cudaMemset(this->d_tmp_right_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_)); + + CudaSafeCall(cudaMalloc(&this->d_mask, src_pitch_ * height_)); + CudaSafeCall(cudaMemset(this->d_mask, 0, src_pitch_ * height_)); + } + + ~CudaStereoSGMResources() { + CudaSafeCall(cudaFree(this->d_src_left)); + 
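+			// (cudaFree on a null pointer is a no-op, so the source buffers may safely be freed even when
+			// CUDA input was selected and they were never allocated.)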
CudaSafeCall(cudaFree(this->d_src_right)); + + CudaSafeCall(cudaFree(this->d_left_disp)); + CudaSafeCall(cudaFree(this->d_right_disp)); + + CudaSafeCall(cudaFree(this->d_tmp_left_disp)); + CudaSafeCall(cudaFree(this->d_tmp_right_disp)); + + CudaSafeCall(cudaFree(this->d_mask)); + + delete sgm_engine; + } + }; + + StereoSGM::StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, + EXECUTE_INOUT inout_type, const Parameters& param) : StereoSGM(width, height, disparity_size, input_depth_bits, output_depth_bits, width, width, inout_type, param) {} + + StereoSGM::StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch, + EXECUTE_INOUT inout_type, const Parameters& param) : + cu_res_(NULL), + width_(width), + height_(height), + disparity_size_(disparity_size), + input_depth_bits_(input_depth_bits), + output_depth_bits_(output_depth_bits), + src_pitch_(src_pitch), + dst_pitch_(dst_pitch), + inout_type_(inout_type), + param_(param) + { + // check values + if (input_depth_bits_ != 8 && input_depth_bits_ != 16 && output_depth_bits_ != 8 && output_depth_bits_ != 16) { + width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0; + throw std::logic_error("depth bits must be 8 or 16"); + } + if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 256) { + width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0; + throw std::logic_error("disparity size must be 64, 128 or 256"); + } + if (param.subpixel && output_depth_bits != 16) { + width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0; + throw std::logic_error("output depth bits must be 16 if sub-pixel option was enabled"); + } + + cu_res_ = new CudaStereoSGMResources(width_, height_, disparity_size_, input_depth_bits_, output_depth_bits_, src_pitch, dst_pitch, inout_type_); + } + + StereoSGM::~StereoSGM() { + if (cu_res_) { delete cu_res_; } + } + + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch) { + + const void *d_input_left, *d_input_right; + + if (is_cuda_input(inout_type_)) { + d_input_left = left_pixels; + d_input_right = right_pixels; + } + else { + CudaSafeCall(cudaMemcpy(cu_res_->d_src_left, left_pixels, input_depth_bits_ / 8 * src_pitch * height, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(cu_res_->d_src_right, right_pixels, input_depth_bits_ / 8 * src_pitch * height, cudaMemcpyHostToDevice)); + d_input_left = cu_res_->d_src_left; + d_input_right = cu_res_->d_src_right; + } + + void* d_tmp_left_disp = cu_res_->d_tmp_left_disp; + void* d_tmp_right_disp = cu_res_->d_tmp_right_disp; + void* d_left_disp = cu_res_->d_left_disp; + void* d_right_disp = cu_res_->d_right_disp; + + if (is_cuda_output(inout_type_) && output_depth_bits_ == 16) + d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer + + cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp, + d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, param_.P2, param_.uniqueness, param_.subpixel); + + sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch); + sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch); + sgm::details::check_consistency((uint16_t*)d_left_disp, 
(uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel); + + if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) { + sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height); + CudaSafeCall(cudaMemcpy(dst, d_tmp_left_disp, sizeof(uint8_t) * dst_pitch * height, cudaMemcpyDeviceToHost)); + } + else if (is_cuda_output(inout_type_) && output_depth_bits_ == 8) { + sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)dst, dst_pitch * height); + } + else if (!is_cuda_output(inout_type_) && output_depth_bits_ == 16) { + CudaSafeCall(cudaMemcpy(dst, d_left_disp, sizeof(uint16_t) * dst_pitch * height, cudaMemcpyDeviceToHost)); + } + else if (is_cuda_output(inout_type_) && output_depth_bits_ == 16) { + // optimize! no-copy! + } + else { + std::cerr << "not impl" << std::endl; + } + } + + void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) { + execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_); + } + + bool StereoSGM::updateParameters(const Parameters ¶ms) { + if (params.P1 > params.P2) { + return false; + } + if ((params.uniqueness < 0.0) || (params.uniqueness > 1.0)) { + return false; + } + + Parameters params_ = params; + std::swap(params_, this->param_); + return true; + } + + void StereoSGM::setMask(uint8_t* mask, int pitch) { + if (pitch != src_pitch_) { + throw std::logic_error("Mask pitch must be the same as source pitch"); + } + + CudaSafeCall(cudaMemcpy(cu_res_->d_mask, mask, src_pitch_ * height_, cudaMemcpyHostToDevice)); + } + + /*void StereoSGM::removeMask() { + CudaSafeCall(cudaMemset(cu_res_->d_mask, 0, src_pitch_ * height_)); + }*/ +} diff --git a/lib/libsgm/src/types.hpp b/lib/libsgm/src/types.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6e7725a2bedb36248c87c75d3ea0c61b533dcd29 --- /dev/null +++ b/lib/libsgm/src/types.hpp @@ -0,0 +1,31 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_TYPES_HPP +#define SGM_TYPES_HPP + +#include <cstdint> + +namespace sgm { + +using feature_type = uint32_t; +using cost_type = uint8_t; +using cost_sum_type = uint16_t; +using output_type = uint16_t; + +} + +#endif diff --git a/lib/libsgm/src/utility.hpp b/lib/libsgm/src/utility.hpp new file mode 100644 index 0000000000000000000000000000000000000000..17b9f45f5375e1756f581a2d7fd3e72e5f588d60 --- /dev/null +++ b/lib/libsgm/src/utility.hpp @@ -0,0 +1,246 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_UTILITY_HPP +#define SGM_UTILITY_HPP + +#include <cuda.h> + +namespace sgm { + +static constexpr unsigned int WARP_SIZE = 32u; + +namespace detail { + template <typename T, unsigned int GROUP_SIZE, unsigned int STEP> + struct subgroup_min_impl { + static __device__ T call(T x, uint32_t mask){ +#if CUDA_VERSION >= 9000 + x = min(x, __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE)); +#else + x = min(x, __shfl_xor(x, STEP / 2, GROUP_SIZE)); +#endif + return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(x, mask); + } + }; + template <typename T, unsigned int GROUP_SIZE> + struct subgroup_min_impl<T, GROUP_SIZE, 1u> { + static __device__ T call(T x, uint32_t){ + return x; + } + }; + template <unsigned int GROUP_SIZE, unsigned int STEP> + struct subgroup_and_impl { + static __device__ bool call(bool x, uint32_t mask){ +#if CUDA_VERSION >= 9000 + x &= __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE); +#else + x &= __shfl_xor(x, STEP / 2, GROUP_SIZE); +#endif + return subgroup_and_impl<GROUP_SIZE, STEP / 2>::call(x, mask); + } + }; + template <unsigned int GROUP_SIZE> + struct subgroup_and_impl<GROUP_SIZE, 1u> { + static __device__ bool call(bool x, uint32_t){ + return x; + } + }; +} + +template <unsigned int GROUP_SIZE, typename T> +__device__ inline T subgroup_min(T x, uint32_t mask){ + return detail::subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE>::call(x, mask); +} + +template <unsigned int GROUP_SIZE> +__device__ inline bool subgroup_and(bool x, uint32_t mask){ + return detail::subgroup_and_impl<GROUP_SIZE, GROUP_SIZE>::call(x, mask); +} + +template <typename T, typename S> +__device__ inline T load_as(const S *p){ + return *reinterpret_cast<const T *>(p); +} + +template <typename T, typename S> +__device__ inline void store_as(S *p, const T& x){ + *reinterpret_cast<T *>(p) = x; +} + + +template <typename T> +__device__ inline uint32_t pack_uint8x4(T x, T y, T z, T w){ + uchar4 uint8x4; + uint8x4.x = static_cast<uint8_t>(x); + uint8x4.y = static_cast<uint8_t>(y); + uint8x4.z = static_cast<uint8_t>(z); + uint8x4.w = static_cast<uint8_t>(w); + return load_as<uint32_t>(&uint8x4); +} + + +template <unsigned int N> +__device__ inline void load_uint8_vector(uint32_t *dest, const uint8_t *ptr); + +template <> +__device__ inline void load_uint8_vector<1u>(uint32_t *dest, const uint8_t *ptr){ + dest[0] = static_cast<uint32_t>(ptr[0]); +} + +template <> +__device__ inline void load_uint8_vector<2u>(uint32_t *dest, const uint8_t *ptr){ + const auto uint8x2 = load_as<uchar2>(ptr); + dest[0] = uint8x2.x; dest[1] = uint8x2.y; +} + +template <> +__device__ inline void load_uint8_vector<4u>(uint32_t *dest, const uint8_t *ptr){ + const auto uint8x4 = load_as<uchar4>(ptr); + dest[0] = uint8x4.x; dest[1] = uint8x4.y; dest[2] = uint8x4.z; dest[3] = uint8x4.w; +} + +template <> +__device__ inline void load_uint8_vector<8u>(uint32_t *dest, const uint8_t *ptr){ + const auto uint32x2 = load_as<uint2>(ptr); + load_uint8_vector<4u>(dest + 0, reinterpret_cast<const uint8_t *>(&uint32x2.x)); + load_uint8_vector<4u>(dest + 4, reinterpret_cast<const uint8_t *>(&uint32x2.y)); +} + +template <> +__device__ inline void load_uint8_vector<16u>(uint32_t *dest, const uint8_t *ptr){ + const auto uint32x4 = load_as<uint4>(ptr); + load_uint8_vector<4u>(dest + 0, reinterpret_cast<const uint8_t *>(&uint32x4.x)); + load_uint8_vector<4u>(dest + 4, reinterpret_cast<const uint8_t *>(&uint32x4.y)); + 
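+	// The 16 source bytes were read above as a single 128-bit uint4 load; the remaining two 32-bit words are unpacked below.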
load_uint8_vector<4u>(dest + 8, reinterpret_cast<const uint8_t *>(&uint32x4.z)); + load_uint8_vector<4u>(dest + 12, reinterpret_cast<const uint8_t *>(&uint32x4.w)); +} + + +template <unsigned int N> +__device__ inline void store_uint8_vector(uint8_t *dest, const uint32_t *ptr); + +template <> +__device__ inline void store_uint8_vector<1u>(uint8_t *dest, const uint32_t *ptr){ + dest[0] = static_cast<uint8_t>(ptr[0]); +} + +template <> +__device__ inline void store_uint8_vector<2u>(uint8_t *dest, const uint32_t *ptr){ + uchar2 uint8x2; + uint8x2.x = static_cast<uint8_t>(ptr[0]); + uint8x2.y = static_cast<uint8_t>(ptr[0]); + store_as<uchar2>(dest, uint8x2); +} + +template <> +__device__ inline void store_uint8_vector<4u>(uint8_t *dest, const uint32_t *ptr){ + store_as<uint32_t>(dest, pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3])); +} + +template <> +__device__ inline void store_uint8_vector<8u>(uint8_t *dest, const uint32_t *ptr){ + uint2 uint32x2; + uint32x2.x = pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]); + uint32x2.y = pack_uint8x4(ptr[4], ptr[5], ptr[6], ptr[7]); + store_as<uint2>(dest, uint32x2); +} + +template <> +__device__ inline void store_uint8_vector<16u>(uint8_t *dest, const uint32_t *ptr){ + uint4 uint32x4; + uint32x4.x = pack_uint8x4(ptr[ 0], ptr[ 1], ptr[ 2], ptr[ 3]); + uint32x4.y = pack_uint8x4(ptr[ 4], ptr[ 5], ptr[ 6], ptr[ 7]); + uint32x4.z = pack_uint8x4(ptr[ 8], ptr[ 9], ptr[10], ptr[11]); + uint32x4.w = pack_uint8x4(ptr[12], ptr[13], ptr[14], ptr[15]); + store_as<uint4>(dest, uint32x4); +} + + +template <unsigned int N> +__device__ inline void load_uint16_vector(uint32_t *dest, const uint16_t *ptr); + +template <> +__device__ inline void load_uint16_vector<1u>(uint32_t *dest, const uint16_t *ptr){ + dest[0] = static_cast<uint32_t>(ptr[0]); +} + +template <> +__device__ inline void load_uint16_vector<2u>(uint32_t *dest, const uint16_t *ptr){ + const auto uint16x2 = load_as<ushort2>(ptr); + dest[0] = uint16x2.x; dest[1] = uint16x2.y; +} + +template <> +__device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *ptr){ + const auto uint16x4 = load_as<ushort4>(ptr); + dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w; +} + +template <> +__device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr){ + const auto uint32x4 = load_as<uint4>(ptr); + load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x4.x)); + load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x4.y)); + load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x4.z)); + load_uint16_vector<2u>(dest + 6, reinterpret_cast<const uint16_t *>(&uint32x4.w)); +} + + +template <unsigned int N> +__device__ inline void store_uint16_vector(uint16_t *dest, const uint32_t *ptr); + +template <> +__device__ inline void store_uint16_vector<1u>(uint16_t *dest, const uint32_t *ptr){ + dest[0] = static_cast<uint16_t>(ptr[0]); +} + +template <> +__device__ inline void store_uint16_vector<2u>(uint16_t *dest, const uint32_t *ptr){ + ushort2 uint16x2; + uint16x2.x = static_cast<uint16_t>(ptr[0]); + uint16x2.y = static_cast<uint16_t>(ptr[1]); + store_as<ushort2>(dest, uint16x2); +} + +template <> +__device__ inline void store_uint16_vector<4u>(uint16_t *dest, const uint32_t *ptr){ + ushort4 uint16x4; + uint16x4.x = static_cast<uint16_t>(ptr[0]); + uint16x4.y = static_cast<uint16_t>(ptr[1]); + uint16x4.z = static_cast<uint16_t>(ptr[2]); + uint16x4.w = static_cast<uint16_t>(ptr[3]); + 
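+	// The four narrowed values are then written back with a single 64-bit ushort4 store.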
store_as<ushort4>(dest, uint16x4); +} + +template <> +__device__ inline void store_uint16_vector<8u>(uint16_t *dest, const uint32_t *ptr){ + uint4 uint32x4; + store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.x), &ptr[0]); + store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.y), &ptr[2]); + store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.z), &ptr[4]); + store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.w), &ptr[6]); + store_as<uint4>(dest, uint32x4); +} + +template <> +__device__ inline void store_uint16_vector<16u>(uint16_t *dest, const uint32_t *ptr){ + store_uint16_vector<8u>(dest + 0, ptr + 0); + store_uint16_vector<8u>(dest + 8, ptr + 8); +} + +} + +#endif diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu new file mode 100644 index 0000000000000000000000000000000000000000..d318f46b465d7c238c3beb8d67d0b858eada38d1 --- /dev/null +++ b/lib/libsgm/src/vertical_path_aggregation.cu @@ -0,0 +1,216 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include <cstdio> +#include "vertical_path_aggregation.hpp" +#include "path_aggregation_common.hpp" + +namespace sgm { +namespace path_aggregation { + +static constexpr unsigned int DP_BLOCK_SIZE = 16u; +static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u; + +template <int DIRECTION, unsigned int MAX_DISPARITY> +__global__ void aggregate_vertical_path_kernel( + uint8_t *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK; + static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE; + + static_assert(DIRECTION == 1 || DIRECTION == -1, ""); + if(width == 0 || height == 0){ + return; + } + + __shared__ feature_type right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS + 1]; + DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp; + + const unsigned int warp_id = threadIdx.x / WARP_SIZE; + const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE; + const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE; + const unsigned int shfl_mask = + ((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE); + + const unsigned int x = + blockIdx.x * PATHS_PER_BLOCK + + warp_id * PATHS_PER_WARP + + group_id; + const unsigned int right_x0 = blockIdx.x * PATHS_PER_BLOCK; + const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE; + + const unsigned int right0_addr = + (right_x0 + PATHS_PER_BLOCK - 1) - x + dp_offset; + const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE; + const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE; + + for(unsigned int iter = 0; iter < height; ++iter){ + const unsigned int y = 
(DIRECTION > 0 ? iter : height - 1 - iter); + // Load left to register + feature_type left_value; + if(x < width){ + left_value = left[x + y * width]; + } + // Load right to smem + for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){ + const unsigned int i = i0 + threadIdx.x; + if(i < RIGHT_BUFFER_SIZE){ + const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i); + feature_type right_value = 0; + if(0 <= x && x < static_cast<int>(width)){ + right_value = right[x + y * width]; + } + const unsigned int lo = i % DP_BLOCK_SIZE; + const unsigned int hi = i / DP_BLOCK_SIZE; + right_buffer[lo][hi] = right_value; + if(hi > 0){ + right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value; + } + } + } + __syncthreads(); + // Compute + if(x < width){ + feature_type right_values[DP_BLOCK_SIZE]; + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi]; + } + uint32_t local_costs[DP_BLOCK_SIZE]; + for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){ + local_costs[j] = __popc(left_value ^ right_values[j]); + } + dp.update(local_costs, p1, p2, shfl_mask); + store_uint8_vector<DP_BLOCK_SIZE>( + &dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width], + dp.dp); + } + __syncthreads(); + } +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_up2down_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_down2up_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream) +{ + static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE; + static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE; + + const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>( + dest, left, right, width, height, p1, p2); +} + + +template void enqueue_aggregate_up2down_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_up2down_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_up2down_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_down2up_path<64u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_down2up_path<128u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + 
int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template void enqueue_aggregate_down2up_path<256u>( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0ae04016a3f412c75a173c9a6c03dbd5d7c4a7c2 --- /dev/null +++ b/lib/libsgm/src/vertical_path_aggregation.hpp @@ -0,0 +1,50 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SGM_VERTICAL_PATH_AGGREGATION_HPP +#define SGM_VERTICAL_PATH_AGGREGATION_HPP + +#include "types.hpp" + +namespace sgm { +namespace path_aggregation { + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_up2down_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +template <unsigned int MAX_DISPARITY> +void enqueue_aggregate_down2up_path( + cost_type *dest, + const feature_type *left, + const feature_type *right, + int width, + int height, + unsigned int p1, + unsigned int p2, + cudaStream_t stream); + +} +} + +#endif diff --git a/lib/libsgm/src/winner_takes_all.cu b/lib/libsgm/src/winner_takes_all.cu new file mode 100644 index 0000000000000000000000000000000000000000..f33d3005289a5041d03160af960032e221055e59 --- /dev/null +++ b/lib/libsgm/src/winner_takes_all.cu @@ -0,0 +1,302 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include <cstdio> +#include <libsgm.h> +#include "winner_takes_all.hpp" +#include "utility.hpp" + +namespace sgm { + +namespace { + +static constexpr unsigned int NUM_PATHS = 8u; + +static constexpr unsigned int WARPS_PER_BLOCK = 8u; +static constexpr unsigned int BLOCK_SIZE = WARPS_PER_BLOCK * WARP_SIZE; + + +__device__ inline uint32_t pack_cost_index(uint32_t cost, uint32_t index){ + union { + uint32_t uint32; + ushort2 uint16x2; + } u; + u.uint16x2.x = static_cast<uint16_t>(index); + u.uint16x2.y = static_cast<uint16_t>(cost); + return u.uint32; +} + +__device__ uint32_t unpack_cost(uint32_t packed){ + return packed >> 16; +} + +__device__ int unpack_index(uint32_t packed){ + return packed & 0xffffu; +} + +using ComputeDisparity = uint32_t(*)(uint32_t, uint32_t, uint16_t*); + +__device__ inline uint32_t compute_disparity_normal(uint32_t disp, uint32_t cost = 0, uint16_t* smem = nullptr) +{ + return disp; +} + +template <size_t MAX_DISPARITY> +__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, uint32_t cost, uint16_t* smem) +{ + int subp = disp; + subp <<= sgm::StereoSGM::SUBPIXEL_SHIFT; + if (disp > 0 && disp < MAX_DISPARITY - 1) { + const int left = smem[disp - 1]; + const int right = smem[disp + 1]; + const int numer = left - right; + const int denom = left - 2 * cost + right; + subp += ((numer << sgm::StereoSGM::SUBPIXEL_SHIFT) + denom) / (2 * denom); + } + return subp; +} + + +template <unsigned int MAX_DISPARITY, ComputeDisparity compute_disparity = compute_disparity_normal> +__global__ void winner_takes_all_kernel( + output_type *left_dest, + output_type *right_dest, + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness) +{ + static const unsigned int ACCUMULATION_PER_THREAD = 16u; + static const unsigned int REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE; + static const unsigned int ACCUMULATION_INTERVAL = ACCUMULATION_PER_THREAD / REDUCTION_PER_THREAD; + static const unsigned int UNROLL_DEPTH = + (REDUCTION_PER_THREAD > ACCUMULATION_INTERVAL) + ? 
REDUCTION_PER_THREAD + : ACCUMULATION_INTERVAL; + + const unsigned int cost_step = MAX_DISPARITY * width * height; + const unsigned int warp_id = threadIdx.x / WARP_SIZE; + const unsigned int lane_id = threadIdx.x % WARP_SIZE; + + const unsigned int y = blockIdx.x * WARPS_PER_BLOCK + warp_id; + src += y * MAX_DISPARITY * width; + left_dest += y * pitch; + right_dest += y * pitch; + + if(y >= height){ + return; + } + + __shared__ uint16_t smem_cost_sum[WARPS_PER_BLOCK][ACCUMULATION_INTERVAL][MAX_DISPARITY]; + + uint32_t right_best[REDUCTION_PER_THREAD]; + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + right_best[i] = 0xffffffffu; + } + + for(unsigned int x0 = 0; x0 < width; x0 += UNROLL_DEPTH){ +#pragma unroll + for(unsigned int x1 = 0; x1 < UNROLL_DEPTH; ++x1){ + if(x1 % ACCUMULATION_INTERVAL == 0){ + const unsigned int k = lane_id * ACCUMULATION_PER_THREAD; + const unsigned int k_hi = k / MAX_DISPARITY; + const unsigned int k_lo = k % MAX_DISPARITY; + const unsigned int x = x0 + x1 + k_hi; + if(x < width){ + const unsigned int offset = x * MAX_DISPARITY + k_lo; + uint32_t sum[ACCUMULATION_PER_THREAD]; + for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){ + sum[i] = 0; + } + for(unsigned int p = 0; p < NUM_PATHS; ++p){ + uint32_t load_buffer[ACCUMULATION_PER_THREAD]; + load_uint8_vector<ACCUMULATION_PER_THREAD>( + load_buffer, &src[p * cost_step + offset]); + for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){ + sum[i] += load_buffer[i]; + } + } + store_uint16_vector<ACCUMULATION_PER_THREAD>( + &smem_cost_sum[warp_id][k_hi][k_lo], sum); + } +#if CUDA_VERSION >= 9000 + __syncwarp(); +#else + __threadfence_block(); +#endif + } + const unsigned int x = x0 + x1; + if(x < width){ + // Load sum of costs + const unsigned int smem_x = x1 % ACCUMULATION_INTERVAL; + const unsigned int k0 = lane_id * REDUCTION_PER_THREAD; + uint32_t local_cost_sum[REDUCTION_PER_THREAD]; + load_uint16_vector<REDUCTION_PER_THREAD>( + local_cost_sum, &smem_cost_sum[warp_id][smem_x][k0]); + // Pack sum of costs and dispairty + uint32_t local_packed_cost[REDUCTION_PER_THREAD]; + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + local_packed_cost[i] = pack_cost_index(local_cost_sum[i], k0 + i); + } + // Update left + uint32_t best = 0xffffffffu; + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + best = min(best, local_packed_cost[i]); + } + best = subgroup_min<WARP_SIZE>(best, 0xffffffffu); + // Update right +#pragma unroll + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + const unsigned int k = lane_id * REDUCTION_PER_THREAD + i; + const int p = static_cast<int>(((x - k) & ~(MAX_DISPARITY - 1)) + k); + const unsigned int d = static_cast<unsigned int>(x - p); +#if CUDA_VERSION >= 9000 + const uint32_t recv = __shfl_sync(0xffffffffu, + local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD], + d / REDUCTION_PER_THREAD, + WARP_SIZE); +#else + const uint32_t recv = __shfl( + local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD], + d / REDUCTION_PER_THREAD, + WARP_SIZE); +#endif + right_best[i] = min(right_best[i], recv); + if(d == MAX_DISPARITY - 1){ + if(0 <= p){ + right_dest[p] = compute_disparity_normal(unpack_index(right_best[i])); + } + right_best[i] = 0xffffffffu; + } + } + // Resume updating left to avoid execution dependency + const uint32_t bestCost = unpack_cost(best); + const int bestDisp = unpack_index(best); + bool uniq = true; + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + const uint32_t x = 
local_packed_cost[i]; + const bool uniq1 = unpack_cost(x) * uniqueness >= bestCost; + const bool uniq2 = abs(unpack_index(x) - bestDisp) <= 1; + uniq &= uniq1 || uniq2; + } + uniq = subgroup_and<WARP_SIZE>(uniq, 0xffffffffu); + if(lane_id == 0){ + left_dest[x] = uniq ? compute_disparity(bestDisp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0; + } + } + } + } + for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){ + const unsigned int k = lane_id * REDUCTION_PER_THREAD + i; + const int p = static_cast<int>(((width - k) & ~(MAX_DISPARITY - 1)) + k); + if(p < width){ + right_dest[p] = compute_disparity_normal(unpack_index(right_best[i])); + } + } +} + +template <size_t MAX_DISPARITY> +void enqueue_winner_takes_all( + output_type *left_dest, + output_type *right_dest, + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness, + bool subpixel, + cudaStream_t stream) +{ + const int gdim = + (height + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK; + const int bdim = BLOCK_SIZE; + if (subpixel) { + winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim, 0, stream>>>( + left_dest, right_dest, src, width, height, pitch, uniqueness); + } else { + winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_normal><<<gdim, bdim, 0, stream>>>( + left_dest, right_dest, src, width, height, pitch, uniqueness); + } +} + +} + + +template <size_t MAX_DISPARITY> +WinnerTakesAll<MAX_DISPARITY>::WinnerTakesAll() + : m_left_buffer() + , m_right_buffer() +{ } + +template <size_t MAX_DISPARITY> +void WinnerTakesAll<MAX_DISPARITY>::enqueue( + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness, + bool subpixel, + cudaStream_t stream) +{ + if(m_left_buffer.size() < static_cast<size_t>(pitch * height)){ + m_left_buffer = DeviceBuffer<output_type>(pitch * height); + } + if(m_right_buffer.size() < static_cast<size_t>(pitch * height)){ + m_right_buffer = DeviceBuffer<output_type>(pitch * height); + } + enqueue_winner_takes_all<MAX_DISPARITY>( + m_left_buffer.data(), + m_right_buffer.data(), + src, + width, + height, + pitch, + uniqueness, + subpixel, + stream); +} + +template <size_t MAX_DISPARITY> +void WinnerTakesAll<MAX_DISPARITY>::enqueue( + output_type* left, + output_type* right, + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness, + bool subpixel, + cudaStream_t stream) +{ + enqueue_winner_takes_all<MAX_DISPARITY>( + left, + right, + src, + width, + height, + pitch, + uniqueness, + subpixel, + stream); +} + + +template class WinnerTakesAll< 64>; +template class WinnerTakesAll<128>; +template class WinnerTakesAll<256>; + +} diff --git a/lib/libsgm/src/winner_takes_all.hpp b/lib/libsgm/src/winner_takes_all.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3dbae82735581c6b1051fd973efd4346e24c2543 --- /dev/null +++ b/lib/libsgm/src/winner_takes_all.hpp @@ -0,0 +1,67 @@ +/* +Copyright 2016 Fixstars Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http ://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef SGM_WINNER_TAKES_ALL_HPP +#define SGM_WINNER_TAKES_ALL_HPP + +#include "device_buffer.hpp" +#include "types.hpp" + +namespace sgm { + +template <size_t MAX_DISPARITY> +class WinnerTakesAll { + +private: + DeviceBuffer<output_type> m_left_buffer; + DeviceBuffer<output_type> m_right_buffer; + +public: + WinnerTakesAll(); + + const output_type *get_left_output() const { + return m_left_buffer.data(); + } + + const output_type *get_right_output() const { + return m_right_buffer.data(); + } + + void enqueue( + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness, + bool subpixel, + cudaStream_t stream); + + void enqueue( + output_type *left, + output_type *right, + const cost_type *src, + int width, + int height, + int pitch, + float uniqueness, + bool subpixel, + cudaStream_t stream); + +}; + +} + +#endif diff --git a/lib/libsgm/test/CMakeLists.txt b/lib/libsgm/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..883b89ab50c29c55a907d52b341e216aa0a8c466 --- /dev/null +++ b/lib/libsgm/test/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.0) +project(sgm-test) + +find_package(CUDA REQUIRED) + +if (MSVC) + option(gtest_force_shared_crt "Force Gmock to use standard compiler flags" ON) +endif() + +add_subdirectory(googletest) + +include_directories(${gtest_SOURCE_DIR}/include) +include_directories(../include) +include_directories(../src) + +file(GLOB TEST_SOURCES "*.cpp" "*.cu") + +if (CMAKE_COMPILER_IS_GNUCXX) + set(CMAKE_CXX_FLAGS "-O3 -Wall") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") +endif() + +SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH} -lineinfo") + +cuda_add_executable(sgm-test ${TEST_SOURCES}) +target_link_libraries(sgm-test sgm gtest) diff --git a/lib/libsgm/test/census_transform_test.cu b/lib/libsgm/test/census_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b369f3927966cc9c21cc9477e3cc720feed1bcf6 --- /dev/null +++ b/lib/libsgm/test/census_transform_test.cu @@ -0,0 +1,71 @@ +#include <gtest/gtest.h> +#include "census_transform.hpp" +#include "generator.hpp" +#include "test_utility.hpp" + +#include "debug.hpp" + +namespace { + +template <typename T> +thrust::host_vector<sgm::feature_type> census_transform( + const thrust::host_vector<T>& src, size_t width, size_t height, size_t pitch) +{ + const int hor = 9 / 2, ver = 7 / 2; + thrust::host_vector<sgm::feature_type> result(width * height, 0); + for(int y = ver; y < static_cast<int>(height) - ver; ++y){ + for(int x = hor; x < static_cast<int>(width) - hor; ++x){ + const auto c = src[x + y * pitch]; + sgm::feature_type value = 0; + for(int dy = -ver; dy <= 0; ++dy){ + for(int dx = -hor; dx <= (dy == 0 ? 
-1 : hor); ++dx){ + const auto a = src[(x + dx) + (y + dy) * pitch]; + const auto b = src[(x - dx) + (y - dy) * pitch]; + value <<= 1; + if(a > b){ value |= 1; } + } + } + result[x + y * width] = value; + } + } + return result; +} + +} + + +TEST(CensusTransformTest, RandomU8){ + using input_type = uint8_t; + const size_t width = 631, height = 479; + const auto input = generate_random_sequence<input_type>(width * height); + const auto expect = census_transform(input, width, height, width); + + sgm::CensusTransform<uint8_t> transform; + const auto d_input = to_device_vector(input); + transform.enqueue(d_input.data().get(), width, height, width, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::feature_type> d_actual( + transform.get_output(), transform.get_output() + (width * height)); + const auto actual = to_host_vector(d_actual); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, 1); +} + +TEST(CensusTransformTest, RandomU8WithPitch){ + using input_type = uint8_t; + const size_t width = 631, height = 479, pitch = 640; + const auto input = generate_random_sequence<input_type>(pitch * height); + const auto expect = census_transform(input, width, height, pitch); + + sgm::CensusTransform<uint8_t> transform; + const auto d_input = to_device_vector(input); + transform.enqueue(d_input.data().get(), width, height, pitch, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::feature_type> d_actual( + transform.get_output(), transform.get_output() + (width * height)); + const auto actual = to_host_vector(d_actual); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, 1); +} diff --git a/lib/libsgm/test/debug.hpp b/lib/libsgm/test/debug.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9ac1ab26f1ee09bc2e2aa5b496009951ff096cc4 --- /dev/null +++ b/lib/libsgm/test/debug.hpp @@ -0,0 +1,31 @@ +#ifndef SGM_TEST_DEBUG_HPP +#define SGM_TEST_DEBUG_HPP + +#include <iostream> +#include <cstdint> + +template <typename T> +inline bool debug_compare( + const T *actual, + const T *expect, + size_t width, + size_t height, + size_t disparity) +{ + bool accept = true; + for(size_t y = 0, i = 0; y < height; ++y){ + for(size_t x = 0; x < width; ++x){ + for(size_t k = 0; k < disparity; ++k, ++i){ + if(actual[i] != expect[i]){ + std::cerr << "(" << y << ", " << x << ", " << k << "): "; + std::cerr << static_cast<uint64_t>(actual[i]) << " / "; + std::cerr << static_cast<uint64_t>(expect[i]) << std::endl; + accept = false; + } + } + } + } + return accept; +} + +#endif diff --git a/lib/libsgm/test/generator.cpp b/lib/libsgm/test/generator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44e48d3f74a09a8aedb45f207727e11211b82d36 --- /dev/null +++ b/lib/libsgm/test/generator.cpp @@ -0,0 +1,3 @@ +#include "generator.hpp" + +std::default_random_engine g_random_engine; diff --git a/lib/libsgm/test/generator.hpp b/lib/libsgm/test/generator.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f8a0516f46eb5ee8d276935b234f4b64df320b3e --- /dev/null +++ b/lib/libsgm/test/generator.hpp @@ -0,0 +1,34 @@ +#ifndef SGM_TEST_GENERATOR_HPP +#define SGM_TEST_GENERATOR_HPP + +#include <random> +#include <limits> +#include <thrust/host_vector.h> + +extern std::default_random_engine g_random_engine; + +template <typename T> +inline thrust::host_vector<T> generate_random_sequence(size_t n){ + std::uniform_int_distribution<T> distribution( + std::numeric_limits<T>::min(), + 
std::numeric_limits<T>::max()); + thrust::host_vector<T> seq(n); + for(size_t i = 0; i < n; ++i){ + seq[i] = distribution(g_random_engine); + } + return seq; +} + +template <> +inline thrust::host_vector<uint8_t> generate_random_sequence<uint8_t>(size_t n) { + std::uniform_int_distribution<unsigned int> distribution( + std::numeric_limits<uint8_t>::min(), + std::numeric_limits<uint8_t>::max()); + thrust::host_vector<uint8_t> seq(n); + for (size_t i = 0; i < n; ++i) { + seq[i] = static_cast<uint8_t>(distribution(g_random_engine)); + } + return seq; +} + +#endif diff --git a/lib/libsgm/test/horizontal_path_aggregation_test.cu b/lib/libsgm/test/horizontal_path_aggregation_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..106cb32ca1de88f478fa880d9d6fe1cf934c0c56 --- /dev/null +++ b/lib/libsgm/test/horizontal_path_aggregation_test.cu @@ -0,0 +1,53 @@ +#include <gtest/gtest.h> +#include "horizontal_path_aggregation.hpp" +#include "path_aggregation_test.hpp" +#include "generator.hpp" +#include "test_utility.hpp" + +#include "debug.hpp" + +TEST(HorizontalPathAggregationTest, RandomLeft2Right){ + static constexpr size_t width = 631, height = 479, disparity = 128; + static constexpr unsigned int p1 = 20, p2 = 100; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, 1, 0); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_left2right_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} + +TEST(HorizontalPathAggregationTest, RandomRight2Left){ + static constexpr size_t width = 640, height = 480, disparity = 64; + static constexpr unsigned int p1 = 20, p2 = 40; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, -1, 0); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_right2left_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} diff --git a/lib/libsgm/test/main.cpp b/lib/libsgm/test/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36c5def97f36666b0589f3be92a34977399a5f9a --- /dev/null +++ b/lib/libsgm/test/main.cpp @@ -0,0 +1,6 @@ +#include <gtest/gtest.h> + +int main(int argc, char **argv){ + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/lib/libsgm/test/oblique_path_aggregation_test.cu b/lib/libsgm/test/oblique_path_aggregation_test.cu new file mode 100644 index 
0000000000000000000000000000000000000000..f96cf25854b595e2c82af483c65664a384580014 --- /dev/null +++ b/lib/libsgm/test/oblique_path_aggregation_test.cu @@ -0,0 +1,99 @@ +#include <gtest/gtest.h> +#include "oblique_path_aggregation.hpp" +#include "path_aggregation_test.hpp" +#include "generator.hpp" +#include "test_utility.hpp" + +#include "debug.hpp" + +TEST(ObliquePathAggregationTest, RandomUpLeft2DownRight){ + static constexpr size_t width = 631, height = 479, disparity = 128; + static constexpr unsigned int p1 = 20, p2 = 100; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, 1, 1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_upleft2downright_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} + +TEST(ObliquePathAggregationTest, RandomUpRight2DownLeft){ + static constexpr size_t width = 640, height = 479, disparity = 64; + static constexpr unsigned int p1 = 20, p2 = 40; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, -1, 1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_upright2downleft_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} + +TEST(ObliquePathAggregationTest, RandomDownRight2UpLeft){ + static constexpr size_t width = 631, height = 479, disparity = 128; + static constexpr unsigned int p1 = 20, p2 = 100; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, -1, -1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_downright2upleft_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} + +TEST(ObliquePathAggregationTest, RandomDownLeft2UpRight){ + static constexpr size_t width = 640, height = 479, disparity = 64; + static constexpr unsigned int p1 = 20, p2 = 40; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = 
generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, 1, -1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_downleft2upright_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} diff --git a/lib/libsgm/test/path_aggregation_test.hpp b/lib/libsgm/test/path_aggregation_test.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8f104ba5b7337617c4476fecfc18c26f775f6018 --- /dev/null +++ b/lib/libsgm/test/path_aggregation_test.hpp @@ -0,0 +1,48 @@ +#ifndef SGM_TEST_PATH_AGGREGATION_TEST_HPP +#define SGM_TEST_PATH_AGGREGATION_TEST_HPP + +#include <limits> +#include <algorithm> +#include <thrust/host_vector.h> + +#ifdef _WIN32 +#define popcnt64 __popcnt64 +#else +#define popcnt64 __builtin_popcountll +#endif + +static thrust::host_vector<sgm::cost_type> path_aggregation( + const thrust::host_vector<sgm::feature_type>& left, + const thrust::host_vector<sgm::feature_type>& right, + int width, int height, int max_disparity, + int p1, int p2, int dx, int dy) +{ + thrust::host_vector<sgm::cost_type> result(width * height * max_disparity); + std::vector<int> before(max_disparity); + for(int i = (dy < 0 ? height - 1: 0); 0 <= i && i < height; i += (dy < 0 ? -1 : 1)){ + for(int j = (dx < 0 ? width - 1 : 0); 0 <= j && j < width; j += (dx < 0 ? -1 : 1)){ + const int i2 = i - dy, j2 = j - dx; + const bool inside = (0 <= i2 && i2 < height && 0 <= j2 && j2 < width); + for(int k = 0; k < max_disparity; ++k){ + before[k] = inside ? result[k + (j2 + i2 * width) * max_disparity] : 0; + } + const int min_cost = *min_element(before.begin(), before.end()); + for(int k = 0; k < max_disparity; ++k){ + const auto l = left[j + i * width]; + const auto r = (k > j ? 
0 : right[(j - k) + i * width]); + int cost = std::min(before[k] - min_cost, p2); + if(k > 0){ + cost = std::min(cost, before[k - 1] - min_cost + p1); + } + if(k + 1 < max_disparity){ + cost = std::min(cost, before[k + 1] - min_cost + p1); + } + cost += static_cast<int>(popcnt64(l ^ r)); + result[k + (j + i * width) * max_disparity] = static_cast<uint8_t>(cost); + } + } + } + return result; +} + +#endif diff --git a/lib/libsgm/test/test_utility.hpp b/lib/libsgm/test/test_utility.hpp new file mode 100644 index 0000000000000000000000000000000000000000..60cf832f53882f74ed74edbbd75ceeadda7a5285 --- /dev/null +++ b/lib/libsgm/test/test_utility.hpp @@ -0,0 +1,29 @@ +#ifndef SGM_TEST_UTILITY_HPP +#define SGM_TEST_UTILITY_HPP + +#include <thrust/host_vector.h> +#include <thrust/device_vector.h> + +template <typename T> +inline thrust::host_vector<T> to_host_vector(const thrust::device_vector<T>& src){ + thrust::host_vector<T> dest(src.size()); + cudaMemcpy( + dest.data(), + src.data().get(), + sizeof(T) * src.size(), + cudaMemcpyDeviceToHost); + return dest; +} + +template <typename T> +inline thrust::device_vector<T> to_device_vector(const thrust::host_vector<T>& src){ + thrust::device_vector<T> dest(src.size()); + cudaMemcpy( + dest.data().get(), + src.data(), + sizeof(T) * src.size(), + cudaMemcpyHostToDevice); + return dest; +} + +#endif diff --git a/lib/libsgm/test/vertical_path_aggregation_test.cu b/lib/libsgm/test/vertical_path_aggregation_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..a0fb7f93da7d3755956e930b28296df991016f10 --- /dev/null +++ b/lib/libsgm/test/vertical_path_aggregation_test.cu @@ -0,0 +1,53 @@ +#include <gtest/gtest.h> +#include "vertical_path_aggregation.hpp" +#include "path_aggregation_test.hpp" +#include "generator.hpp" +#include "test_utility.hpp" + +#include "debug.hpp" + +TEST(VerticalPathAggregationTest, RandomUp2Down){ + static constexpr size_t width = 631, height = 479, disparity = 128; + static constexpr unsigned int p1 = 20, p2 = 100; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, 0, 1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_up2down_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} + +TEST(VerticalPathAggregationTest, RandomDown2Up){ + static constexpr size_t width = 640, height = 479, disparity = 64; + static constexpr unsigned int p1 = 20, p2 = 40; + const auto left = generate_random_sequence<sgm::feature_type>(width * height); + const auto right = generate_random_sequence<sgm::feature_type>(width * height); + const auto expect = path_aggregation( + left, right, width, height, disparity, p1, p2, 0, -1); + + const auto d_left = to_device_vector(left); + const auto d_right = to_device_vector(right); + thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity); + sgm::path_aggregation::enqueue_aggregate_down2up_path<disparity>( + d_cost.data().get(), + d_left.data().get(), + d_right.data().get(), + 
width, height, p1, p2, 0); + cudaStreamSynchronize(0); + + const auto actual = to_host_vector(d_cost); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), width, height, disparity); +} diff --git a/lib/libsgm/test/winner_takes_all_test.cu b/lib/libsgm/test/winner_takes_all_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..c882114e52b107d76c53522b2def189d761a48c1 --- /dev/null +++ b/lib/libsgm/test/winner_takes_all_test.cu @@ -0,0 +1,242 @@ +#include <gtest/gtest.h> +#include <utility> +#include <algorithm> +#include <libsgm.h> +#include "winner_takes_all.hpp" +#include "generator.hpp" +#include "test_utility.hpp" + +#include "debug.hpp" + +namespace { + +static constexpr size_t NUM_PATHS = 8; + +thrust::host_vector<sgm::output_type> winner_takes_all_left( + const thrust::host_vector<sgm::cost_type>& src, + size_t width, size_t height, size_t pitch, size_t disparity, + float uniqueness, bool subpixel) +{ + thrust::host_vector<sgm::output_type> result(pitch * height); + for(size_t i = 0; i < height; ++i){ + for(size_t j = 0; j < width; ++j){ + std::vector<std::pair<int, int>> v; + for(size_t k = 0; k < disparity; ++k){ + int cost_sum = 0; + for(size_t p = 0; p < NUM_PATHS; ++p){ + cost_sum += static_cast<int>(src[ + p * disparity * width * height + + i * disparity * width + + j * disparity + + k]); + } + v.emplace_back(cost_sum, static_cast<int>(k)); + } + const auto ite = std::min_element(v.begin(), v.end()); + assert(ite != v.end()); + const auto best = *ite; + const int best_cost = best.first; + sgm::output_type best_disp = best.second; + sgm::output_type dst = best_disp; + if (subpixel) { + dst <<= sgm::StereoSGM::SUBPIXEL_SHIFT; + if (0 < best_disp && best_disp < static_cast<int>(disparity) - 1) { + const int left = v[best_disp - 1].first; + const int right = v[best_disp + 1].first; + const int numer = left - right; + const int denom = left - 2 * best_cost + right; + dst += ((numer << sgm::StereoSGM::SUBPIXEL_SHIFT) + denom) / (2 * denom); + } + } + for (const auto& p : v) { + const int cost = p.first; + const int disp = p.second; + if (cost * uniqueness < best_cost && abs(disp - best_disp) > 1) { + dst = 0; + break; + } + } + result[i * pitch + j] = dst; + } + } + return result; +} + +thrust::host_vector<sgm::output_type> winner_takes_all_right( + const thrust::host_vector<sgm::cost_type>& src, + size_t width, size_t height, size_t pitch, size_t disparity, + float uniqueness) +{ + thrust::host_vector<sgm::output_type> result(pitch * height); + for(size_t i = 0; i < height; ++i){ + for(size_t j = 0; j < width; ++j){ + std::vector<std::pair<int, int>> v; + for(size_t k = 0; j + k < width && k < disparity; ++k){ + int cost_sum = 0; + for(size_t p = 0; p < NUM_PATHS; ++p){ + cost_sum += static_cast<int>(src[ + p * disparity * width * height + + i * disparity * width + + (j + k) * disparity + + k]); + } + v.emplace_back(cost_sum, static_cast<int>(k)); + } + const auto ite = std::min_element(v.begin(), v.end()); + assert(ite != v.end()); + const auto best = *ite; + result[i * pitch + j] = best.second; + } + } + return result; +} + +} + +static void test_random_left(bool subpixel, size_t padding = 0) +{ + static constexpr size_t width = 313, height = 237, disparity = 128; + static constexpr float uniqueness = 0.95f; + const size_t pitch = width + padding; + const auto input = generate_random_sequence<sgm::cost_type>( + width * height * disparity * NUM_PATHS); + const auto expect = winner_takes_all_left( + input, width, height, pitch, 
disparity, uniqueness, subpixel); + + sgm::WinnerTakesAll<disparity> wta; + const auto d_input = to_device_vector(input); + wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::output_type> d_actual( + wta.get_left_output(), wta.get_left_output() + (pitch * height)); + const auto actual = to_host_vector(d_actual); + + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), pitch, height, 1); +} + +static void test_corner1_left(bool subpixel, size_t padding = 0) +{ + static constexpr size_t width = 1, height = 1, disparity = 64; + static constexpr float uniqueness = 0.95f; + const size_t pitch = width + padding; + static constexpr size_t n = width * height * disparity * NUM_PATHS; + static constexpr size_t step = width * height * disparity; + thrust::host_vector<sgm::cost_type> input(n); + for (auto& v : input) { + v = 1; + } + for (size_t i = 0; i < NUM_PATHS; ++i) { + input[i * step] = 64; + } + const auto expect = winner_takes_all_left( + input, width, height, pitch, disparity, uniqueness, subpixel); + + sgm::WinnerTakesAll<disparity> wta; + const auto d_input = to_device_vector(input); + wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::output_type> d_actual( + wta.get_left_output(), wta.get_left_output() + (pitch * height)); + const auto actual = to_host_vector(d_actual); + + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), pitch, height, 1); +} + +static void test_corner2_left(bool subpixel, size_t padding = 0) +{ + static constexpr size_t width = 1, height = 1, disparity = 64; + static constexpr float uniqueness = 0.95f; + const size_t pitch = width + padding; + static constexpr size_t n = width * height * disparity * NUM_PATHS; + static constexpr size_t step = width * height * disparity; + thrust::host_vector<sgm::cost_type> input(n); + for (auto& v : input) { + v = 64; + } + for (size_t i = 0; i < NUM_PATHS; ++i) { + input[i * step + 16] = 1; + } + for (size_t i = 0; i < NUM_PATHS; ++i) { + input[i * step + 32] = 1; + } + const auto expect = winner_takes_all_left( + input, width, height, pitch, disparity, uniqueness, subpixel); + + sgm::WinnerTakesAll<disparity> wta; + const auto d_input = to_device_vector(input); + wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::output_type> d_actual( + wta.get_left_output(), wta.get_left_output() + (pitch * height)); + const auto actual = to_host_vector(d_actual); + + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), pitch, height, 1); +} + +TEST(WinnerTakesAllTest, RandomLeftNormal){ + test_random_left(false); +} + +TEST(WinnerTakesAllTest, RandomLeftSubpixel){ + test_random_left(true); +} + +TEST(WinnerTakesAllTest, RandomLeftNormalWithPitch){ + test_random_left(false, 27); +} + +TEST(WinnerTakesAllTest, RandomLeftSubpixelWithPitch){ + test_random_left(true, 27); +} + +TEST(WinnerTakesAllTest, Corner1LeftNormal){ + test_corner1_left(false); +} + +TEST(WinnerTakesAllTest, Corner1LeftSubpixel){ + test_corner1_left(true); +} + +TEST(WinnerTakesAllTest, Corner2LeftNormal){ + test_corner2_left(false); +} + +TEST(WinnerTakesAllTest, Corner2LeftSubpixel){ + test_corner2_left(true); +} + +static void test_random_right(size_t padding = 0) +{ + static 
constexpr size_t width = 313, height = 237, disparity = 64; + static constexpr float uniqueness = 0.95f; + const size_t pitch = width + padding; + const auto input = generate_random_sequence<sgm::cost_type>( + width * height * disparity * NUM_PATHS); + const auto expect = winner_takes_all_right( + input, width, height, pitch, disparity, uniqueness); + + sgm::WinnerTakesAll<disparity> wta; + const auto d_input = to_device_vector(input); + wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, false, 0); + cudaStreamSynchronize(0); + + const thrust::device_vector<sgm::output_type> d_actual( + wta.get_right_output(), wta.get_right_output() + (pitch * height)); + const auto actual = to_host_vector(d_actual); + EXPECT_EQ(actual, expect); + debug_compare(actual.data(), expect.data(), pitch, height, 1); +} + +TEST(WinnerTakesAllTest, RandomRight){ + test_random_right(); +} + +TEST(WinnerTakesAllTest, RandomRightWithPitch){ + test_random_right(27); +}
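
Note on the host-side references above (a sketch-level summary; none of the text or code in this note is part of the patch itself): path_aggregation_test.hpp implements the standard SGM cost recurrence along a single scanline direction r = (dx, dy),

    L_r(p, d) = C(p, d)
                + \min\big( L_r(p - r, d),
                            L_r(p - r, d - 1) + P_1,
                            L_r(p - r, d + 1) + P_1,
                            \min_k L_r(p - r, k) + P_2 \big)
                - \min_k L_r(p - r, k)

where C(p, d) is the Hamming distance popcnt64(left[p] ^ right[p - d]) between census features and out-of-range terms are dropped. The directional GPU kernels are expected to reproduce this reference bit-exactly, which is what EXPECT_EQ on the full cost volume asserts; winner_takes_all_test.cu additionally exercises the uniqueness-ratio rejection and the optional subpixel (parabola-fit) refinement against winner_takes_all_left/right.

For readers unfamiliar with the recurrence, a self-contained toy version on one scanline is sketched below. It is illustration only: it uses just the standard library, the matching costs are made-up example values standing in for census Hamming distances, and it is not part of the patch or of libSGM.

    // Toy illustration of the per-direction SGM recurrence mirrored by the host
    // reference in path_aggregation_test.hpp: one scanline, a small disparity
    // range, and arbitrary example matching costs.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        const int width = 5, max_disparity = 4;
        const int p1 = 1, p2 = 3;
        // matching_cost[x][d]: made-up values for illustration only.
        const std::vector<std::vector<int>> matching_cost = {
            {3, 1, 4, 2}, {2, 0, 5, 3}, {4, 2, 1, 2}, {1, 3, 2, 0}, {2, 2, 3, 1}};

        // L[x][d]: cost aggregated along the left-to-right path.
        std::vector<std::vector<int>> L(width, std::vector<int>(max_disparity, 0));
        for (int x = 0; x < width; ++x) {
            for (int d = 0; d < max_disparity; ++d) {
                int penalty = 0;
                if (x > 0) {
                    const auto& prev = L[x - 1];
                    const int prev_min = *std::min_element(prev.begin(), prev.end());
                    penalty = std::min(prev[d] - prev_min, p2);
                    if (d > 0)                 penalty = std::min(penalty, prev[d - 1] - prev_min + p1);
                    if (d + 1 < max_disparity) penalty = std::min(penalty, prev[d + 1] - prev_min + p1);
                }
                L[x][d] = matching_cost[x][d] + penalty;
            }
        }
        for (int x = 0; x < width; ++x) {
            for (int d = 0; d < max_disparity; ++d) std::printf("%3d", L[x][d]);
            std::printf("\n");
        }
        return 0;
    }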