diff --git a/CMakeLists.txt b/CMakeLists.txt
index 376b06b36dcaa0450f21629f4ac17046fa1b3f56..d1a93b36729e0051a00f7d8c1131399c166a25a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,10 +116,12 @@ endif()
 # ==============================================================================
 
 if (WITH_FIXSTARS)
-	find_package(LibSGM REQUIRED)
-	if (LibSGM_FOUND)
-		set(HAVE_LIBSGM true)
-	endif()
+	set(HAVE_LIBSGM true)
+	add_subdirectory(lib/libsgm)
+	include_directories(lib/libsgm/include)
+	set_property(TARGET sgm PROPERTY FOLDER "dependencies")
+else()
+	add_library(sgm INTERFACE)
 endif()
 
 if (WITH_CERES)
@@ -400,7 +402,7 @@ if (BUILD_CALIBRATION)
 	if (NOT HAVE_CERES)
 		message(ERROR "Ceres is required")
 	endif()
-	
+
 	add_subdirectory(applications/calibration-ceres)
 	add_subdirectory(applications/calibration-multi)
 endif()
diff --git a/components/operators/CMakeLists.txt b/components/operators/CMakeLists.txt
index d0d786884525f45bbf3b4dcb41efda089d0bce89..0fd5466aed057c3d1034501fee00518de5a44bf9 100644
--- a/components/operators/CMakeLists.txt
+++ b/components/operators/CMakeLists.txt
@@ -35,9 +35,9 @@ set(OPERSRC
 	src/gt_analysis.cu
 )
 
-if (LIBSGM_FOUND)
+if (HAVE_LIBSGM)
 	list(APPEND OPERSRC src/disparity/fixstars_sgm.cpp)
-endif (LIBSGM_FOUND)
+endif (HAVE_LIBSGM)
 
 if (HAVE_OPTFLOW)
 	list(APPEND OPERSRC
@@ -52,6 +52,7 @@ target_include_directories(ftloperators PUBLIC
 	$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 	$<INSTALL_INTERFACE:include>
 	PRIVATE src)
-target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon Eigen3::Eigen Threads::Threads ${OpenCV_LIBS})
+
+target_link_libraries(ftloperators ftlrender ftlrgbd ftlcommon sgm Eigen3::Eigen Threads::Threads ${OpenCV_LIBS})
 
 #ADD_SUBDIRECTORY(test)
diff --git a/lib/libsgm/CMakeLists.txt b/lib/libsgm/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..497411f85b8f667e0bbb3516fa3e61782f3a34ec
--- /dev/null
+++ b/lib/libsgm/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(CUDA_ARCH "-arch=sm_50" CACHE STRING "Value of the NVCC -arch option.")
+
+option(ENABLE_ZED_DEMO      "Build a Demo using ZED Camera" OFF)
+option(ENABLE_SAMPLES       "Build samples" OFF)
+option(ENABLE_TESTS         "Test library" OFF)
+option(LIBSGM_SHARED        "Build a shared library" OFF)
+option(BUILD_OPENCV_WRAPPER "Make library compatible with cv::Mat and cv::cuda::GpuMat of OpenCV" OFF)
+
+if(WIN32)
+  set(ZED_SDK_LIB "C:\\Program Files (x86)\\ZED SDK\\lib\\sl_zed64.lib" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
+  set(ZED_SDK_INCLUDE_DIR "C:\\Program Files (x86)\\ZED SDK\\include" CACHE STRING "ZED SDK include path.")
+else()
+  set(ZED_SDK_LIB "/usr/local/zed/lib/libsl_zed.so" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
+  set(ZED_SDK_INCLUDE_DIR "/usr/local/zed/include" CACHE STRING "ZED SDK include path.")
+endif()
+
+project(libSGM VERSION 2.4.0)
+
+if(BUILD_OPENCV_WRAPPER)
+	find_package(OpenCV REQUIRED core)
+	include_directories(${OpenCV_INCLUDE_DIRS})
+endif()
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/libsgm_config.h.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/include/libsgm_config.h
+)
+
+add_subdirectory(src)
+
+if(ENABLE_SAMPLES)
+    add_subdirectory(sample/image)
+    add_subdirectory(sample/movie)
+#    add_subdirectory(sample/reprojection)
+    add_subdirectory(sample/benchmark)
+    if(BUILD_OPENCV_WRAPPER)
+        add_subdirectory(sample/image_cv_gpumat)
+    endif()
+endif()
+
+if(ENABLE_TESTS)
+	add_subdirectory(test)
+endif()
+
+if(ENABLE_ZED_DEMO)
+	add_subdirectory(sample/zed)
+endif()
diff --git a/cmake/FindLibSGM.cmake b/lib/libsgm/FindLibSGM.cmake
similarity index 73%
rename from cmake/FindLibSGM.cmake
rename to lib/libsgm/FindLibSGM.cmake
index 37fe6ca922084499b3ee7e964ea38b5e6fbbbfb6..ac950c25175cc43df740f5c3ee76e09bce4545e3 100644
--- a/cmake/FindLibSGM.cmake
+++ b/lib/libsgm/FindLibSGM.cmake
@@ -6,36 +6,28 @@
 # LIBSGM_INCLUDE_DIRS - Directories containing the LIBSGM include files.
 # LIBSGM_LIBRARY - Libraries needed to use LIBSGM.
 
-if(WIN32)
-find_path(libSGM_DIR libSGM PATHS "C:/Program Files" "C:/Program Files (x86)")
-set(libSGM_DIR ${libSGM_DIR}/libSGM)
-else()
-set(glog_DIR "")
-endif()
-
 # Find lib
 set(LIBSGM_FOUND FALSE CACHE BOOL "" FORCE)
 find_library(LIBSGM_LIBRARY
     NAMES sgm libsgm
-    PATHS ${libSGM_DIR}
     PATH_SUFFIXES lib/
 )
 
 # Find include
 find_path(LIBSGM_INCLUDE_DIRS
     NAMES libsgm.h
-    PATHS ${libSGM_DIR}
     PATH_SUFFIXES include/
 )
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(LibSGM DEFAULT_MSG LIBSGM_LIBRARY LIBSGM_INCLUDE_DIRS)
 
+message(STATUS "(LIBSGM_FOUND : ${LIBSGM_FOUND} include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})")
+
 mark_as_advanced(LIBSGM_FOUND)
 
 if(LIBSGM_FOUND)
-	include_directories(${LIBSGM_INCLUDE_DIRS})
     set(LIBSGM_FOUND TRUE CACHE BOOL "" FORCE)
     set(LIBSGM_LIBRARIES ${LIBSGM_LIBRARY})
-    message(STATUS "Found libSGM")
+    message(STATUS "LibSGM found ( include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})")
 endif()
diff --git a/lib/libsgm/LICENSE b/lib/libsgm/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/lib/libsgm/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/lib/libsgm/README.jp.md b/lib/libsgm/README.jp.md
new file mode 100644
index 0000000000000000000000000000000000000000..5811b94dc5f357e99f2aa801ea906f9a876d67d9
--- /dev/null
+++ b/lib/libsgm/README.jp.md
@@ -0,0 +1,88 @@
+# libSGM
+---
+Semi-Global Matchingã‚’ãƒ™ãƒ¼ã‚¹ã¨ã—ãŸCUDAå®Ÿè£…ã§ã™ã€‚
+
+## æ¦‚è¦
+---
+
+libSGMã¯ã€Semi-Global Matchingã‚¢ãƒ«ã‚´ãƒªã‚ºãƒ ã‚’CUDAã§å®Ÿè£…ã—ãŸã‚‚ã®ã§ã™ã€‚  
+é©åˆ‡ã«ã‚ãƒ£ãƒªãƒ–ãƒ¬ãƒ¼ã‚·ãƒ§ãƒ³ã•ã‚ŒãŸ2ã¤ã®å…¥åŠ›ç”»åƒã‹ã‚‰ã€è¦–å·®ç”»åƒã‚’å–å¾—ã™ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚
+
+## ç‰¹å¾´
+---
+CUDAã‚’ä½¿ç”¨ã—ã€é«˜é€Ÿãªè¦–å·®ç”»åƒç®—å‡ºãŒå¯èƒ½
+
+## ãƒ‘ãƒ•ã‚©ãƒ¼ãƒžãƒ³ã‚¹
+benchmarkã‚µãƒ³ãƒ—ãƒ«ã§è¨ˆæ¸¬ã—ãŸå‡¦ç†æ™‚é–“ã‚’ç¤ºã—ã¾ã™
+### Settings
+- image size : 1024 x 440
+- disparity size : 128
+- sgm path : 8 path
+
+### Results
+|Device|Processing Time[Milliseconds]|FPS|
+|---|---|---|
+|Tegra X2|52.4|19.1|
+|GTX 1080 Ti|3.4|296|
+
+## Requirements
+libSGMã¯CUDA (compute capabilities >= 3.0)ã‚’å¿…è¦ã¨ã—ã¾ã™ã€‚  
+ã¾ãŸã€ã‚µãƒ³ãƒ—ãƒ«ã‚’ãƒ“ãƒ«ãƒ‰ã™ã‚‹éš›ã«ã¯ä»¥ä¸‹ã®ãƒ©ã‚¤ãƒ–ãƒ©ãƒªãŒå¿…è¦ã¨ãªã‚Šã¾ã™ã€‚
+- OpenCV 3.0 ä»¥é™
+- CMake 3.10 ä»¥é™
+
+## build
+```
+$ git clone https://github.com/fixstars/libSGM.git
+$ cd libSGM
+$ mkdir build
+$ cd build
+$ cmake ../
+$ make
+```
+
+## ã‚µãƒ³ãƒ—ãƒ«å®Ÿè¡Œ
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cd sample/movie/
+$ ./stereo_movie <left image path format> <right image path format> <disparity> <frame count>
+left image path format: å·¦å´ç”»åƒå…¥åŠ›æ™‚ã«ä½¿ç”¨ã™ã‚‹ãƒ•ã‚¡ã‚¤ãƒ«ãƒ‘ã‚¹ã®ãƒ•ã‚©ãƒ¼ãƒžãƒƒãƒˆ
+right image path format: å³å´ç”»åƒå…¥åŠ›æ™‚ã«ä½¿ç”¨ã™ã‚‹ãƒ•ã‚¡ã‚¤ãƒ«ãƒ‘ã‚¹ã®ãƒ•ã‚©ãƒ¼ãƒžãƒƒãƒˆ
+disparity: è¦–å·®æƒ…å ±ã‚’ä½•æ®µéšŽã§ä¿æŒã™ã‚‹ã‹(çœç•¥å¯)
+frame count: å…¨ç”»åƒãŒä½•æžšå˜åœ¨ã™ã‚‹ã‹(çœç•¥å¯)
+```
+
+disparityã¨frame countã¯çœç•¥ãŒå¯èƒ½ã§ã™ã€‚çœç•¥ã—ãŸæ™‚ã«ã¯ã€ãã‚Œãžã‚Œ64, 100ãŒä»˜ä¸Žã•ã‚Œã¾ã™ã€‚
+
+ã“ã“ã§ã€left image path format, right image path formatã¨ã¯ã€ãƒ•ã‚¡ã‚¤ãƒ«èªã¿è¾¼ã¿æ™‚ã«ä½¿ç”¨ã™ã‚‹ãƒ•ã‚©ãƒ¼ãƒžãƒƒãƒˆã‚’æ„å‘³ã—ã¾ã™ã€‚  
+æ¬¡ã®ã‚ˆã†ãªé€£ç•ªãƒ•ã‚¡ã‚¤ãƒ«ãŒä¸Žãˆã‚‰ã‚Œã¦ã„ãŸã¨ãã€ä¸Žãˆã‚‹ã¹ãpath formatã¯ä»¥ä¸‹ã®ã‚ˆã†ã«ãªã‚Šã¾ã™ã€‚
+```
+left_image_0000.pgm
+left_image_0001.pgm
+left_image_0002.pgm
+left_image_0003.pgm
+...
+
+right_image_0000.pgm
+right_image_0001.pgm
+right_image_0002.pgm
+right_image_0003.pgm
+```
+
+```
+$ ./stereo_movie left_image_%04d.pgm right_image_%04d.pgm
+```
+
+movie, imageã‚µãƒ³ãƒ—ãƒ«ã¯ã€
+http://www.6d-vision.com/scene-labeling
+ã«ã¦æä¾›ã•ã‚Œã¦ã„ã‚‹ã€Daimler Urban Scene Segmentation Benchmark Datasetã«ã¦
+å‹•ä½œç¢ºèªã‚’ã—ã¦ã„ã¾ã™ã€‚
+
+## Authors
+The "SGM Team": Samuel Audet, Yoriyuki Kitta, Yuta Noto, Ryo Sakamoto, Akihiro Takagi  
+[Fixstars Corporation](http://www.fixstars.com/)
+
+## License
+Apache License 2.0
diff --git a/lib/libsgm/README.md b/lib/libsgm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e80ade9fc24868b8efe203ab20e54274d13ef72c
--- /dev/null
+++ b/lib/libsgm/README.md
@@ -0,0 +1,88 @@
+# libSGM
+---
+A CUDA implementation performing Semi-Global Matching.
+
+## Introduction
+---
+
+libSGM is library that implements in CUDA the Semi-Global Matching algorithm.  
+From a pair of appropriately calibrated input images, we can obtain the disparity map.
+
+## Features
+---
+Because it uses CUDA, we can compute the disparity map at high speed.
+
+## Performance
+The libSGM performance obtained from benchmark sample
+### Settings
+- image size : 1024 x 440
+- disparity size : 128
+- sgm path : 8 path
+
+### Results
+|Device|Processing Time[Milliseconds]|FPS|
+|---|---|---|
+|Tegra X2|52.4|19.1|
+|GTX 1080 Ti|3.4|296|
+
+## Requirements
+libSGM needs CUDA (compute capabilities >= 3.0) to be installed.  
+Moreover, to build the sample, we need the following libraries:
+- OpenCV 3.0 or later
+- CMake 3.10 or later
+
+## Build Instructions
+```
+$ git clone https://github.com/fixstars/libSGM.git
+$ cd libSGM
+$ mkdir build
+$ cd build
+$ cmake ../
+$ make
+```
+
+## Sample Execution
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cd sample/movie/
+$ ./stereo_movie <left image path format> <right image path format> <disparity> <frame count>
+left image path format: the format used for the file paths to the left input images
+right image path format: the format used for the file paths to the right input images
+disparity: the maximum number of disparities (optional)
+frame count: the total number of images (optional)
+```
+
+"disparity" and "frame count" are optional. By default, they are 64 and 100, respectively.
+
+Next, we explain the meaning of the "left image path format" and "right image path format".  
+When provided with the following set of files, we should pass the "path formats" given below.
+```
+left_image_0000.pgm
+left_image_0001.pgm
+left_image_0002.pgm
+left_image_0003.pgm
+...
+
+right_image_0000.pgm
+right_image_0001.pgm
+right_image_0002.pgm
+right_image_0003.pgm
+```
+
+```
+$ ./stereo_movie left_image_%04d.pgm right_image_%04d.pgm
+```
+
+The sample movie images available at
+http://www.6d-vision.com/scene-labeling
+under "Daimler Urban Scene Segmentation Benchmark Dataset"
+are used to test the software.
+
+## Authors
+The "SGM Team": Samuel Audet, Yoriyuki Kitta, Yuta Noto, Ryo Sakamoto, Akihiro Takagi  
+[Fixstars Corporation](http://www.fixstars.com/)
+
+## License
+Apache License 2.0
diff --git a/lib/libsgm/include/libsgm.h b/lib/libsgm/include/libsgm.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e69ed8aef137ebf90878e49616a3e87b2bc3bf7
--- /dev/null
+++ b/lib/libsgm/include/libsgm.h
@@ -0,0 +1,155 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#pragma once
+
+/**
+* @mainpage stereo-sgm
+* See sgm::StereoSGM
+*/
+
+/**
+* @file libsgm.h
+* stereo-sgm main header
+*/
+
+#include <stdint.h>
+#include "libsgm_config.h"
+
+#if defined(LIBSGM_SHARED)
+	#if defined(WIN32) || defined(_WIN32)
+		#if defined sgm_EXPORTS
+			#define LIBSGM_API __declspec(dllexport)
+		#else
+			#define LIBSGM_API __declspec(dllimport)
+		#endif
+	#else
+		#define LIBSGM_API __attribute__((visibility("default")))
+	#endif
+#else
+	#define LIBSGM_API
+#endif
+
+namespace sgm {
+	struct CudaStereoSGMResources;
+
+	/**
+	* @enum DST_TYPE
+	* Indicates input/output pointer type.
+	*/
+	enum EXECUTE_INOUT {
+		EXECUTE_INOUT_HOST2HOST = (0 << 1) | 0,
+		EXECUTE_INOUT_HOST2CUDA = (1 << 1) | 0,
+		EXECUTE_INOUT_CUDA2HOST = (0 << 1) | 1,
+		EXECUTE_INOUT_CUDA2CUDA = (1 << 1) | 1,
+	};
+
+	/**
+	* @brief StereoSGM class
+	*/
+	class StereoSGM {
+	public:
+		static const int SUBPIXEL_SHIFT = 4;
+		static const int SUBPIXEL_SCALE = (1 << SUBPIXEL_SHIFT);
+		struct Parameters
+		{
+			int P1;
+			int P2;
+			float uniqueness;
+			bool subpixel;
+			Parameters(int P1 = 10, int P2 = 120, float uniqueness = 0.95f, bool subpixel = false) : P1(P1), P2(P2), uniqueness(uniqueness), subpixel(subpixel) {}
+		};
+
+		/**
+		* @param width Processed image's width.
+		* @param height Processed image's height.
+		* @param disparity_size It must be 64 or 128.
+		* @param input_depth_bits Processed image's bits per pixel. It must be 8 or 16.
+		* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
+		* @param inout_type 	Specify input/output pointer type. See sgm::EXECUTE_TYPE.
+		* @attention
+		* output_depth_bits must be set to 16 when subpixel is enabled.
+		*/
+		LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, 
+			EXECUTE_INOUT inout_type, const Parameters& param = Parameters());
+
+		/**
+		* @param width Processed image's width.
+		* @param height Processed image's height.
+		* @param disparity_size It must be 64 or 128.
+		* @param input_depth_bits Processed image's bits per pixel. It must be 8 or 16.
+		* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
+		* @param src_pitch Source image's pitch (pixels).
+		* @param dst_pitch Destination image's pitch (pixels).
+		* @param inout_type 	Specify input/output pointer type. See sgm::EXECUTE_TYPE.
+		* @attention
+		* output_depth_bits must be set to 16 when subpixel is enabled.
+		*/
+		LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch,
+			EXECUTE_INOUT inout_type, const Parameters& param = Parameters());
+
+		LIBSGM_API virtual ~StereoSGM();
+
+		/**
+		* Execute stereo semi global matching.
+		* @param left_pixels	A pointer stored input left image.
+		* @param right_pixels	A pointer stored input right image.
+		* @param dst	        Output pointer. User must allocate enough memory.
+		* @attention
+		* You need to allocate dst memory at least width x height x sizeof(element_type) bytes.
+		* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
+		* Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
+		*/
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst);
+
+		/**
+		 * Same as execute(left_pixels, right_pixels, dst) with image size parameters.
+		 * Dimensions must be smaller or equal to dimensions provided in constructor.
+		 */
+		LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch);
+
+		/**
+		 * Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
+		 * will be masked in final disparity image.
+		 */
+		LIBSGM_API void setMask(uint8_t* mask, int pitch);
+
+		/**
+		 * Update parameters. Returns true if successful. 
+		 */
+		LIBSGM_API bool updateParameters(const Parameters &params);
+
+	private:
+		StereoSGM(const StereoSGM&);
+		StereoSGM& operator=(const StereoSGM&);
+
+		void cuda_resource_allocate();
+
+		CudaStereoSGMResources* cu_res_;
+
+		int width_;
+		int height_;
+		int disparity_size_;
+		int input_depth_bits_;
+		int output_depth_bits_;
+		int src_pitch_;
+		int dst_pitch_;
+		EXECUTE_INOUT inout_type_;
+		Parameters param_;
+	};
+}
+
+#include "libsgm_wrapper.h"
diff --git a/lib/libsgm/include/libsgm_config.h b/lib/libsgm/include/libsgm_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..67444c41f80c77412b25b2109bb764d2e8f57497
--- /dev/null
+++ b/lib/libsgm/include/libsgm_config.h
@@ -0,0 +1,13 @@
+#ifndef __LIBSGM_CONFIG_H__
+#define __LIBSGM_CONFIG_H__
+
+/* #undef LIBSGM_SHARED */
+
+#define LIBSGM_VERSION 2.4.0
+#define LIBSGM_VERSION_MAJOR 2
+#define LIBSGM_VERSION_MINOR 4
+#define LIBSGM_VERSION_PATCH 0
+
+/* #undef BUILD_OPENCV_WRAPPER */
+
+#endif // __LIBSGM_CONFIG_H__
diff --git a/lib/libsgm/include/libsgm_config.h.in b/lib/libsgm/include/libsgm_config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..26ac62442965d31dc53e319d9910197e475dc2ad
--- /dev/null
+++ b/lib/libsgm/include/libsgm_config.h.in
@@ -0,0 +1,13 @@
+#ifndef __LIBSGM_CONFIG_H__
+#define __LIBSGM_CONFIG_H__
+
+#cmakedefine LIBSGM_SHARED
+
+#define LIBSGM_VERSION @libSGM_VERSION@
+#define LIBSGM_VERSION_MAJOR @libSGM_VERSION_MAJOR@
+#define LIBSGM_VERSION_MINOR @libSGM_VERSION_MINOR@
+#define LIBSGM_VERSION_PATCH @libSGM_VERSION_PATCH@
+
+#cmakedefine BUILD_OPENCV_WRAPPER
+
+#endif // __LIBSGM_CONFIG_H__
diff --git a/lib/libsgm/include/libsgm_wrapper.h b/lib/libsgm/include/libsgm_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..376be5ff458a308915ef4a61daf7c76545fa69f1
--- /dev/null
+++ b/lib/libsgm/include/libsgm_wrapper.h
@@ -0,0 +1,64 @@
+#ifndef __LIBSGM_WRAPPER_H__
+#define __LIBSGM_WRAPPER_H__
+
+#include "libsgm.h"
+#include <memory>
+#ifdef BUILD_OPENCV_WRAPPER
+#include <opencv2/core/cuda.hpp>
+#endif
+
+namespace sgm {
+	/**
+	 * @brief LibSGMWrapper class which is wrapper for sgm::StereoSGM.
+	 */
+	class LibSGMWrapper {
+	public:
+		/**
+		 * @param numDisparity Maximum possible disparity value
+		 * @param P1 Penalty on the disparity change by plus or minus 1 between nieghbor pixels
+		 * @param P2 Penalty on the disparity change by more than 1 between neighbor pixels
+		 * @param uniquenessRatio Margin in ratio by which the best cost function value should be at least second one
+		 * @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled
+		 */
+		LIBSGM_API LibSGMWrapper(int numDisparity = 128, int P1 = 10, int P2 = 120, float uniquenessRatio = 0.95f, bool subpixel = false);
+		LIBSGM_API ~LibSGMWrapper();
+
+		LIBSGM_API int getNumDisparities() const;
+		LIBSGM_API int getP1() const;
+		LIBSGM_API int getP2() const;
+		LIBSGM_API float getUniquenessRatio() const;
+		LIBSGM_API bool hasSubpixel() const;
+
+#ifdef BUILD_OPENCV_WRAPPER
+		/**
+		 * Execute stereo semi global matching via wrapper class
+		 * @param I1        Input left image.  Image's type is must be CV_8U or CV_16U
+		 * @param I2        Input right image.  Image's size and type must be same with I1.
+		 * @param disparity Output image.  Its memory will be allocated automatically dependent on input image size.
+		 * @attention
+		 * type of output image `disparity` is CV_16U.
+		 * Note that disparity element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
+		 */
+		LIBSGM_API void execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity);
+		/**
+		 * Execute stereo semi global matching via wrapper class
+		 * @param I1        Input left image.  Image's type is must be CV_8U or CV_16U
+		 * @param I2        Input right image.  Image's size and type must be same with I1.
+		 * @param disparity Output image.  Its memory will be allocated automatically dependent on input image size.
+		 * @attention
+		 * type of output image `disparity` is CV_16U.
+		 * Note that disparity element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
+		 */
+		LIBSGM_API void execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity);
+#endif // BUILD_OPRENCV_WRAPPER
+
+	private:
+		struct Creator;
+		std::unique_ptr<sgm::StereoSGM> sgm_;
+		int numDisparity_;
+		sgm::StereoSGM::Parameters param_;
+		std::unique_ptr<Creator> prev_;
+	};
+}
+
+#endif // __LIBSGM_WRAPPER_H__
diff --git a/lib/libsgm/sample/benchmark/CMakeLists.txt b/lib/libsgm/sample/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6c20233bd31b028f103652dbdc098f6807a956de
--- /dev/null
+++ b/lib/libsgm/sample/benchmark/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+endif()
+
+find_package(CUDA REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(../../include)
+
+cuda_add_executable(stereo_benchmark stereosgm_benchmark.cpp)
+target_link_libraries(stereo_benchmark sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS})
diff --git a/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp b/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..28947bad4b320c494bd800a179d8c9fe043737ff
--- /dev/null
+++ b/lib/libsgm/sample/benchmark/stereosgm_benchmark.cpp
@@ -0,0 +1,131 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <cuda_runtime.h>
+
+#include <libsgm.h>
+
+#define ASSERT_MSG(expr, msg) \
+	if (!(expr)) { \
+		std::cerr << msg << std::endl; \
+		std::exit(EXIT_FAILURE); \
+	} \
+
+struct device_buffer
+{
+	device_buffer() : data(nullptr) {}
+	device_buffer(size_t count) { allocate(count); }
+	void allocate(size_t count) { cudaMalloc(&data, count); }
+	~device_buffer() { cudaFree(data); }
+	void* data;
+};
+
+int main(int argc, char* argv[])
+{
+	if (argc < 3) {
+		std::cout << "usage: " << argv[0] << " left_img right_img [disp_size] [out_depth] [subpixel_enable(0: false, 1:true)] [iterations]" << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	cv::Mat I1 = cv::imread(argv[1], -1);
+	cv::Mat I2 = cv::imread(argv[2], -1);
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+
+	const int disp_size = argc > 3 ? std::stoi(argv[3]) : 128;
+	const int out_depth = argc > 4 ? std::stoi(argv[4]) : 8;
+	const bool subpixel = argc > 5 ? std::stoi(argv[5]) != 0 : false;
+
+	ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128.");
+	if (subpixel) {
+		ASSERT_MSG(out_depth == 16, "output depth bits must be 16 if subpixel option is enabled.");
+	} else {
+		ASSERT_MSG(out_depth == 8 || out_depth == 16, "output depth bits must be 8 or 16");
+	}
+	const int iterations = argc > 6 ? std::stoi(argv[6]) : 100;
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+
+	const int input_depth = I1.type() == CV_8U ? 8 : 16;
+	const int input_bytes = input_depth * width * height / 8;
+	const int output_bytes = out_depth * width * height / 8;
+
+	const sgm::StereoSGM::Parameters params{10, 120, 0.95f, subpixel};
+
+	sgm::StereoSGM sgm(width, height, disp_size, input_depth, out_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, params);
+
+	device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes);
+	cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice);
+
+	cudaDeviceProp prop;
+	int version;
+	cudaGetDeviceProperties(&prop, 0);
+	cudaRuntimeGetVersion(&version);
+
+	// show settings
+	std::cout << "# Settings" << std::endl;
+	std::cout << "device name         : " << prop.name << std::endl;
+	std::cout << "CUDA runtime version: " << version << std::endl;
+	std::cout << "image size          : " << I1.size() << std::endl;
+	std::cout << "disparity size      : " << disp_size << std::endl;
+	std::cout << "output depth        : " << out_depth << std::endl;
+	std::cout << "subpixel option     : " << (subpixel ? "true" : "false") << std::endl;
+	std::cout << "sgm path            : " << "8 path" << std::endl;
+	std::cout << "iterations          : " << iterations << std::endl;
+	std::cout << std::endl;
+
+	// run benchmark
+	std::cout << "Running benchmark..." << std::endl;
+	uint64_t sum = 0;
+	for (int i = 0; i <= iterations; i++) {
+		const auto t1 = std::chrono::system_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::system_clock::now();
+		if (i > 0)
+			sum += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+	}
+	std::cout << "Done." << std::endl << std::endl;
+
+	// show results
+	const double time_millisec = 1e-3 * sum / iterations;
+	const double fps = 1e3 / time_millisec;
+	std::cout << "# Results" << std::endl;
+	std::cout.setf(std::ios::fixed);
+	std::cout << std::setprecision(1) << "Processing Time[Milliseconds]: " << time_millisec << std::endl;
+	std::cout << std::setprecision(1) << "FPS                          : " << fps << std::endl;
+	std::cout << std::endl;
+
+	// save disparity image
+	cv::Mat disparity(height, width, out_depth == 8 ? CV_8U : CV_16U);
+	cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost);
+	disparity *= 255. / disp_size;
+	cv::imwrite("disparity.png", disparity);
+
+	return 0;
+}
diff --git a/lib/libsgm/sample/image/CMakeLists.txt b/lib/libsgm/sample/image/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80e404936d9aa0f17e0f7b431707e6ee38879572
--- /dev/null
+++ b/lib/libsgm/sample/image/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+endif()
+
+find_package(CUDA REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(../../include)
+
+cuda_add_executable(stereo_test stereosgm_image.cpp)
+target_link_libraries(stereo_test sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS})
diff --git a/lib/libsgm/sample/image/stereosgm_image.cpp b/lib/libsgm/sample/image/stereosgm_image.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a735de5d997fef9914e3a331a481338b969af8f
--- /dev/null
+++ b/lib/libsgm/sample/image/stereosgm_image.cpp
@@ -0,0 +1,108 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/core/version.hpp>
+
+#include <libsgm.h>
+
+#define ASSERT_MSG(expr, msg) \
+	if (!(expr)) { \
+		std::cerr << msg << std::endl; \
+		std::exit(EXIT_FAILURE); \
+	} \
+
+int main(int argc, char* argv[]) {
+	if (argc < 3) {
+		std::cerr << "usage: stereosgm left_img right_img [disp_size]" << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	cv::Mat left = cv::imread(argv[1], -1);
+	cv::Mat right = cv::imread(argv[2], -1);
+
+	int disp_size = 64;
+	if (argc >= 4) {
+		disp_size = atoi(argv[3]);
+	}
+
+	ASSERT_MSG(left.size() == right.size() && left.type() == right.type(), "input images must be same size and type.");
+	//ASSERT_MSG(left.type() == CV_8U || left.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+
+	cv::resize(left, left, cv::Size(left.cols/2, left.rows/2));
+	cv::resize(right, right, cv::Size(right.cols/2, right.rows/2));
+
+	cv::cvtColor(left, left, cv::COLOR_BGR2GRAY);
+	cv::cvtColor(right, right, cv::COLOR_BGR2GRAY);
+	int bits = 0;
+
+	switch (left.type()) {
+	case CV_8UC1: bits = 8; break;
+	case CV_16UC1: bits = 16; break;
+	default:
+		std::cerr << "invalid input image color format" << left.type() << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	sgm::StereoSGM ssgm(left.cols, left.rows, disp_size, bits, 8, sgm::EXECUTE_INOUT_HOST2HOST);
+
+	cv::Mat output(cv::Size(left.cols, left.rows), CV_8UC1);
+
+	ssgm.execute(left.data, right.data, output.data);
+	// show image
+	cv::imshow("image", output * 256 / disp_size);
+	
+	int key = cv::waitKey();
+	int mode = 0;
+	while (key != 27) {
+		if (key == 's') {
+			mode += 1;
+			if (mode >= 3) mode = 0;
+
+			switch (mode) {
+			case 0:
+				{
+					cv::setWindowTitle("image", "disparity");
+					cv::imshow("image", output * 256 / disp_size);
+					break;
+				}
+			case 1:
+				{
+					cv::Mat m;
+					cv::applyColorMap(output * 256 / disp_size, m, cv::COLORMAP_JET);
+					cv::setWindowTitle("image", "disparity color");
+					cv::imshow("image", m);
+					break;
+				}
+			case 2:
+				{
+					cv::setWindowTitle("image", "input");
+					cv::imshow("image", left);
+					break;
+				}
+			}
+		}
+		key = cv::waitKey();
+	}
+
+	return 0;
+}
diff --git a/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt b/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8486103999ec02a94207515c0ba0db36c0b0533c
--- /dev/null
+++ b/lib/libsgm/sample/image_cv_gpumat/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+endif()
+
+find_package(CUDA REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(../../include)
+
+cuda_add_executable(stereo_image_cv_gpumat stereosgm_image_cv_gpumat.cpp ${CUDA_SRC})
+target_link_libraries(stereo_image_cv_gpumat sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS})
diff --git a/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp b/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3bb7201789e1366e2c2d68831d0bcb386139b55
--- /dev/null
+++ b/lib/libsgm/sample/image_cv_gpumat/stereosgm_image_cv_gpumat.cpp
@@ -0,0 +1,111 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <iostream>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/core/version.hpp>
+
+#include <libsgm.h>
+
+#define ASSERT_MSG(expr, msg) \
+	if (!(expr)) { \
+		std::cerr << msg << std::endl; \
+		std::exit(EXIT_FAILURE); \
+	} \
+
+static void execute(sgm::LibSGMWrapper& sgmw, const cv::Mat& h_left, const cv::Mat& h_right, cv::Mat& h_disparity) noexcept(false)
+{
+	cv::cuda::GpuMat d_left, d_right;
+	d_left.upload(h_left);
+	d_right.upload(h_right);
+
+	cv::cuda::GpuMat d_disparity;
+
+	sgmw.execute(d_left, d_right, d_disparity);
+
+	// normalize result
+	cv::cuda::GpuMat d_normalized_disparity;
+	d_disparity.convertTo(d_normalized_disparity, CV_8UC1, 256. / sgmw.getNumDisparities());
+	d_normalized_disparity.download(h_disparity);
+}
+
+int main(int argc, char* argv[]) {
+	ASSERT_MSG(argc >= 3, "usage: stereosgm left_img right_img [disp_size]");
+
+	const cv::Mat left = cv::imread(argv[1], -1);
+	const cv::Mat right = cv::imread(argv[2], -1);
+	const int disp_size = argc > 3 ? std::atoi(argv[3]) : 128;
+
+	ASSERT_MSG(left.size() == right.size() && left.type() == right.type(), "input images must be same size and type.");
+	ASSERT_MSG(left.type() == CV_8U || left.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128.");
+
+	sgm::LibSGMWrapper sgmw{disp_size};
+	cv::Mat disparity;
+	try {
+		execute(sgmw, left, right, disparity);
+	} catch (const cv::Exception& e) {
+		std::cerr << e.what() << std::endl;
+		if (e.code == cv::Error::GpuNotSupported) {
+			return 1;
+		} else {
+			return -1;
+		}
+	}
+
+	// post-process for showing image
+	cv::Mat colored;
+	cv::applyColorMap(disparity, colored, cv::COLORMAP_JET);
+	cv::imshow("image", disparity);
+
+	int key = cv::waitKey();
+	int mode = 0;
+	while (key != 27) {
+		if (key == 's') {
+			mode += 1;
+			if (mode >= 3) mode = 0;
+
+			switch (mode) {
+			case 0:
+				{
+					cv::setWindowTitle("image", "disparity");
+					cv::imshow("image", disparity);
+					break;
+				}
+			case 1:
+				{
+					cv::setWindowTitle("image", "disparity color");
+					cv::imshow("image", colored);
+					break;
+				}
+			case 2:
+				{
+					cv::setWindowTitle("image", "input");
+					cv::imshow("image", left);
+					break;
+				}
+			}
+		}
+		key = cv::waitKey();
+	}
+
+	return 0;
+}
diff --git a/lib/libsgm/sample/movie/CMakeLists.txt b/lib/libsgm/sample/movie/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6cddac16acdacd3a9a9c4a69f458d0d56d0be0
--- /dev/null
+++ b/lib/libsgm/sample/movie/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+endif()
+
+find_package(CUDA REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(../../include)
+
+cuda_add_executable(stereo_movie stereosgm_movie.cpp)
+target_link_libraries(stereo_movie sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS})
diff --git a/lib/libsgm/sample/movie/stereosgm_movie.cpp b/lib/libsgm/sample/movie/stereosgm_movie.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d8082478e7d243a6cccda347eef949bae27e787e
--- /dev/null
+++ b/lib/libsgm/sample/movie/stereosgm_movie.cpp
@@ -0,0 +1,129 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <chrono>
+
+#include <cuda_runtime.h>
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/core/version.hpp>
+
+#include <libsgm.h>
+
+#define ASSERT_MSG(expr, msg) \
+	if (!(expr)) { \
+		std::cerr << msg << std::endl; \
+		std::exit(EXIT_FAILURE); \
+	} \
+
+struct device_buffer
+{
+	device_buffer() : data(nullptr) {}
+	device_buffer(size_t count) { allocate(count); }
+	void allocate(size_t count) { cudaMalloc(&data, count); }
+	~device_buffer() { cudaFree(data); }
+	void* data;
+};
+
+template <class... Args>
+static std::string format_string(const char* fmt, Args... args)
+{
+	const int BUF_SIZE = 1024;
+	char buf[BUF_SIZE];
+	std::snprintf(buf, BUF_SIZE, fmt, args...);
+	return std::string(buf);
+}
+
+int main(int argc, char* argv[])
+{
+	if (argc < 3) {
+		std::cout << "usage: " << argv[0] << " left-image-format right-image-format [disp_size]" << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	const int first_frame = 1;
+
+	cv::Mat I1 = cv::imread(format_string(argv[1], first_frame), -1);
+	cv::Mat I2 = cv::imread(format_string(argv[2], first_frame), -1);
+	const int disp_size = argc >= 4 ? std::stoi(argv[3]) : 128;
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128.");
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+
+	const int input_depth = I1.type() == CV_8U ? 8 : 16;
+	const int input_bytes = input_depth * width * height / 8;
+	const int output_depth = 8;
+	const int output_bytes = output_depth * width * height / 8;
+
+	sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, sgm::EXECUTE_INOUT_CUDA2CUDA);
+
+	cv::Mat disparity(height, width, output_depth == 8 ? CV_8U : CV_16U);
+	cv::Mat disparity_8u, disparity_color;
+
+	device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes);
+
+	for (int frame_no = first_frame;; frame_no++) {
+
+		I1 = cv::imread(format_string(argv[1], frame_no), -1);
+		I2 = cv::imread(format_string(argv[2], frame_no), -1);
+		if (I1.empty() || I2.empty()) {
+			frame_no = first_frame;
+			continue;
+		}
+
+		cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice);
+		cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice);
+
+		const auto t1 = std::chrono::system_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::system_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost);
+
+		// draw results
+		if (I1.type() != CV_8U) {
+			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX);
+			I1.convertTo(I1, CV_8U);
+		}
+
+		disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
+		cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET);
+		cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps),
+			cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		cv::imshow("left image", I1);
+		cv::imshow("disparity", disparity_color);
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+
+	return 0;
+}
diff --git a/lib/libsgm/sample/reprojection/CMakeLists.txt b/lib/libsgm/sample/reprojection/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..43b584ab9b0ff1785198db7d581a20de03492d40
--- /dev/null
+++ b/lib/libsgm/sample/reprojection/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+endif()
+
+find_package(CUDA REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(../../include)
+
+cuda_add_executable(stereo_reprojection stereosgm_reprojection.cpp)
+target_link_libraries(stereo_reprojection sgm ${CUDA_LIBRARIES} ${OpenCV_LIBS})
diff --git a/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml b/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml
new file mode 100644
index 0000000000000000000000000000000000000000..002fbada79b183172b29f618089d6c69f94e6a52
--- /dev/null
+++ b/lib/libsgm/sample/reprojection/camera_daimler_gt_stixel.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<opencv_storage>
+
+<!--  Intrinsic parameters -->
+<FocalLengthX>1267.485352</FocalLengthX> <!--  focal length x (pixel) -->
+<FocalLengthY>1224.548950</FocalLengthY> <!--  focal length y (pixel) -->
+<CenterX>472.735474</CenterX>            <!--  principal point x (pixel) -->
+<CenterY>175.787781</CenterY>            <!--  principal point y (pixel) -->
+
+<!--  Extrinsic parameters -->
+<BaseLine>0.214382</BaseLine>            <!--  baseline (meter) -->
+<Height>1.170000</Height>                <!--  height position (meter) -->
+<Tilt>0.081276</Tilt>                    <!--  tilt angle (radian) -->
+
+</opencv_storage>
diff --git a/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml b/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0a0cc094a27cc5a10127a7f77ff78fbe6e2e833e
--- /dev/null
+++ b/lib/libsgm/sample/reprojection/camera_daimler_urban_seg.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<opencv_storage>
+<FocalLengthX>1249.7700195</FocalLengthX>
+<FocalLengthY>1249.7700195</FocalLengthY>
+<CenterX>480.8460083</CenterX>
+<CenterY>237.4100037</CenterY>
+<BaseLine>0.2339240</BaseLine>
+<Height>1.2000000</Height>
+<Tilt>0.07</Tilt>
+</opencv_storage>
diff --git a/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp b/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36ade078a8a2f73f7818424ec6d3921a63e54241
--- /dev/null
+++ b/lib/libsgm/sample/reprojection/stereosgm_reprojection.cpp
@@ -0,0 +1,270 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <chrono>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <cuda_runtime.h>
+
+#include <libsgm.h>
+
+#define ASSERT_MSG(expr, msg) \
+	if (!(expr)) { \
+		std::cerr << msg << std::endl; \
+		std::exit(EXIT_FAILURE); \
+	} \
+
+struct device_buffer
+{
+	device_buffer() : data(nullptr) {}
+	device_buffer(size_t count) { allocate(count); }
+	void allocate(size_t count) { cudaMalloc(&data, count); }
+	~device_buffer() { cudaFree(data); }
+	void* data;
+};
+
+// Camera Parameters
+struct CameraParameters
+{
+	float fu;                 //!< focal length x (pixel)
+	float fv;                 //!< focal length y (pixel)
+	float u0;                 //!< principal point x (pixel)
+	float v0;                 //!< principal point y (pixel)
+	float baseline;           //!< baseline (meter)
+	float height;             //!< height position (meter), ignored when ROAD_ESTIMATION_AUTO
+	float tilt;               //!< tilt angle (radian), ignored when ROAD_ESTIMATION_AUTO
+};
+
+// Transformation between pixel coordinate and world coordinate
+struct CoordinateTransform
+{
+	CoordinateTransform(const CameraParameters& camera) : camera(camera)
+	{
+		sinTilt = (sinf(camera.tilt));
+		cosTilt = (cosf(camera.tilt));
+		bf = camera.baseline * camera.fu;
+		invfu = 1.f / camera.fu;
+		invfv = 1.f / camera.fv;
+	}
+
+	inline cv::Point3f imageToWorld(const cv::Point2f& pt, float d) const
+	{
+		const float u = pt.x;
+		const float v = pt.y;
+
+		const float Zc = bf / d;
+		const float Xc = invfu * (u - camera.u0) * Zc;
+		const float Yc = invfv * (v - camera.v0) * Zc;
+
+		const float Xw = Xc;
+		const float Yw = Yc * cosTilt + Zc * sinTilt;
+		const float Zw = Zc * cosTilt - Yc * sinTilt;
+
+		return cv::Point3f(Xw, Yw, Zw);
+	}
+
+	CameraParameters camera;
+	float sinTilt, cosTilt, bf, invfu, invfv;
+};
+
+template <class... Args>
+static std::string format_string(const char* fmt, Args... args)
+{
+	const int BUF_SIZE = 1024;
+	char buf[BUF_SIZE];
+	std::snprintf(buf, BUF_SIZE, fmt, args...);
+	return std::string(buf);
+}
+
+static cv::Scalar computeColor(float val)
+{
+	const float hscale = 6.f;
+	float h = 0.6f * (1.f - val), s = 1.f, v = 1.f;
+	float r, g, b;
+
+	static const int sector_data[][3] =
+	{ { 1,3,0 },{ 1,0,2 },{ 3,0,1 },{ 0,2,1 },{ 0,1,3 },{ 2,1,0 } };
+	float tab[4];
+	int sector;
+	h *= hscale;
+	if (h < 0)
+		do h += 6; while (h < 0);
+	else if (h >= 6)
+		do h -= 6; while (h >= 6);
+	sector = cvFloor(h);
+	h -= sector;
+	if ((unsigned)sector >= 6u)
+	{
+		sector = 0;
+		h = 0.f;
+	}
+
+	tab[0] = v;
+	tab[1] = v*(1.f - s);
+	tab[2] = v*(1.f - s*h);
+	tab[3] = v*(1.f - s*(1.f - h));
+
+	b = tab[sector_data[sector][0]];
+	g = tab[sector_data[sector][1]];
+	r = tab[sector_data[sector][2]];
+	return 255 * cv::Scalar(b, g, r);
+}
+
+void reprojectPointsTo3D(const cv::Mat& disparity, const CameraParameters& camera, std::vector<cv::Point3f>& points, bool subpixeled)
+{
+	CV_Assert(disparity.type() == CV_32F);
+
+	CoordinateTransform tf(camera);
+
+	points.clear();
+	points.reserve(disparity.rows * disparity.cols);
+
+	for (int y = 0; y < disparity.rows; y++)
+	{
+		for (int x = 0; x < disparity.cols; x++)
+		{
+			const float d = disparity.at<float>(y, x);
+			if (d > 0)
+				points.push_back(tf.imageToWorld(cv::Point(x, y), d));
+		}
+	}
+}
+
+void drawPoints3D(const std::vector<cv::Point3f>& points, cv::Mat& draw)
+{
+	const int SIZE_X = 512;
+	const int SIZE_Z = 1024;
+	const int maxz = 20; // [meter]
+	const double pixelsPerMeter = 1. * SIZE_Z / maxz;
+
+	draw = cv::Mat::zeros(SIZE_Z, SIZE_X, CV_8UC3);
+
+	for (const cv::Point3f& pt : points)
+	{
+		const float X = pt.x;
+		const float Z = pt.z;
+
+		const int u = cvRound(pixelsPerMeter * X) + SIZE_X / 2;
+		const int v = SIZE_Z - cvRound(pixelsPerMeter * Z);
+
+		const cv::Scalar color = computeColor(std::min(Z, 1.f * maxz) / maxz);
+		cv::circle(draw, cv::Point(u, v), 1, color);
+	}
+}
+
+int main(int argc, char* argv[])
+{
+	if (argc < 4) {
+		std::cout << "usage: " << argv[0] << " left-image-format right-image-format camera.xml [disp_size] [subpixel_enable(0: false, 1:true)]" << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	const int first_frame = 1;
+
+	cv::Mat I1 = cv::imread(format_string(argv[1], first_frame), -1);
+	cv::Mat I2 = cv::imread(format_string(argv[2], first_frame), -1);
+	const cv::FileStorage cvfs(argv[3], cv::FileStorage::READ);
+	const int disp_size = argc >= 5 ? std::stoi(argv[4]) : 128;
+	const bool subpixel = argc >= 6 ? std::stoi(argv[5]) != 0 : true;
+	const int output_depth = 16;
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(cvfs.isOpened(), "camera.xml read failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128, "disparity size must be 64 or 128.");
+
+	// read camera parameters
+	const cv::FileNode node(cvfs.fs, NULL);
+	CameraParameters camera;
+	camera.fu = node["FocalLengthX"];
+	camera.fv = node["FocalLengthY"];
+	camera.u0 = node["CenterX"];
+	camera.v0 = node["CenterY"];
+	camera.baseline = node["BaseLine"];
+	camera.tilt = node["Tilt"];
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+
+	const int input_depth = I1.type() == CV_8U ? 8 : 16;
+	const int input_bytes = input_depth * width * height / 8;
+	const int output_bytes = output_depth * width * height / 8;
+
+	const sgm::StereoSGM::Parameters params{10, 120, 0.95f, subpixel};
+
+	sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, params);
+
+	cv::Mat disparity(height, width, output_depth == 8 ? CV_8U : CV_16U);
+	cv::Mat disparity_8u, disparity_32f, disparity_color, draw;
+	std::vector<cv::Point3f> points;
+
+	device_buffer d_I1(input_bytes), d_I2(input_bytes), d_disparity(output_bytes);
+
+	for (int frame_no = first_frame;; frame_no++) {
+
+		I1 = cv::imread(format_string(argv[1], frame_no), -1);
+		I2 = cv::imread(format_string(argv[2], frame_no), -1);
+		if (I1.empty() || I2.empty()) {
+			frame_no = first_frame;
+			continue;
+		}
+
+		cudaMemcpy(d_I1.data, I1.data, input_bytes, cudaMemcpyHostToDevice);
+		cudaMemcpy(d_I2.data, I2.data, input_bytes, cudaMemcpyHostToDevice);
+
+		const auto t1 = std::chrono::system_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::system_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost);
+
+		// draw results
+		if (I1.type() != CV_8U) {
+			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX);
+			I1.convertTo(I1, CV_8U);
+		}
+
+		disparity.convertTo(disparity_32f, CV_32F, subpixel ? 1. / sgm::StereoSGM::SUBPIXEL_SCALE : 1);
+		reprojectPointsTo3D(disparity_32f, camera, points, subpixel);
+		drawPoints3D(points, draw);
+
+		disparity_32f.convertTo(disparity_8u, CV_8U, 255. / disp_size);
+		cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET);
+		cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps),
+			cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		cv::imshow("left image", I1);
+		cv::imshow("disparity", disparity_color);
+		cv::imshow("points", draw);
+
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+
+	return 0;
+}
diff --git a/lib/libsgm/sample/zed/CMakeLists.txt b/lib/libsgm/sample/zed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30659d31473ddd50522ee1cdab04853d5a88be46
--- /dev/null
+++ b/lib/libsgm/sample/zed/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../movie/cmake)
+
+message(STATUS ${CMAKE_MODULE_PATH})
+
+set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL)
+
+find_package(ZED 2 REQUIRED)
+find_package(CUDA ${ZED_CUDA_VERSION} REQUIRED)
+
+find_package(OpenCV REQUIRED)
+if (OpenCV_VERSION VERSION_LESS 3.0)
+	message(FATAL_ERROR "Error: OpenCV version requires at least 3.0")
+endif()
+
+include_directories(../../include)
+include_directories(${CUDA_INCLUDE_DIRS})
+include_directories(${OpenCV_INCLUDE_DIRS})
+include_directories(${ZED_INCLUDE_DIRS})
+
+link_directories(${ZED_LIBRARY_DIR})
+link_directories(${CUDA_LIBRARY_DIRS})
+
+if(NOT WIN32)
+	set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
+endif()
+
+
+CUDA_ADD_EXECUTABLE(zed_demo
+	zed_demo.cpp
+)
+
+
+TARGET_LINK_LIBRARIES(zed_demo 
+	sgm 
+	${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES_ZED}
+	${OpenCV_LIBS} 
+	${ZED_LIBRARIES}
+)
diff --git a/lib/libsgm/sample/zed/zed_demo.cpp b/lib/libsgm/sample/zed/zed_demo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9139dad8719792241b41a1dac4d4070675ac595c
--- /dev/null
+++ b/lib/libsgm/sample/zed/zed_demo.cpp
@@ -0,0 +1,108 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <stdlib.h>
+#include <iostream>
+#include <iomanip>
+#include <string>
+#include <chrono>
+
+#include <cuda_runtime.h>
+#include <opencv2/opencv.hpp>
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#include <sl/Camera.hpp>
+
+#include <libsgm.h>
+
+template <class... Args>
+static std::string format_string(const char* fmt, Args... args)
+{
+	const int BUF_SIZE = 1024;
+	char buf[BUF_SIZE];
+	std::snprintf(buf, BUF_SIZE, fmt, args...);
+	return std::string(buf);
+}
+
+struct device_buffer {
+	device_buffer() : data(nullptr) {}
+	device_buffer(size_t count) { allocate(count); }
+	void allocate(size_t count) { cudaMalloc(&data, count); }
+	~device_buffer() { cudaFree(data); }
+	void* data;
+};
+
+int main(int argc, char* argv[]) {	
+	
+	const int disp_size = 128;
+	
+	sl::Camera zed;
+	sl::InitParameters initParameters;
+	initParameters.camera_resolution = sl::RESOLUTION_VGA;
+	sl::ERROR_CODE err = zed.open(initParameters);
+	if (err != sl::SUCCESS) {
+		std::cout << toString(err) << std::endl;
+		zed.close();
+		return 1;
+	}
+	const int width = static_cast<int>(zed.getResolution().width);
+	const int height = static_cast<int>(zed.getResolution().height);
+
+	sl::Mat d_zed_image_l(zed.getResolution(), sl::MAT_TYPE_8U_C1, sl::MEM_GPU);
+	sl::Mat d_zed_image_r(zed.getResolution(), sl::MAT_TYPE_8U_C1, sl::MEM_GPU);
+
+	const int input_depth = 8;
+	const int output_depth = 8;
+	const int output_bytes = output_depth * width * height / 8;
+
+	CV_Assert(d_zed_image_l.getStep(sl::MEM_GPU) == d_zed_image_r.getStep(sl::MEM_GPU));
+	sgm::StereoSGM sgm(width, height, disp_size, input_depth, output_depth, static_cast<int>(d_zed_image_l.getStep(sl::MEM_GPU)), width, sgm::EXECUTE_INOUT_CUDA2CUDA);
+
+	cv::Mat disparity(height, width, CV_8U);
+	cv::Mat disparity_8u, disparity_color;
+
+	device_buffer d_disparity(output_bytes);
+	while (1) {
+		if (zed.grab() == sl::SUCCESS) {
+			zed.retrieveImage(d_zed_image_l, sl::VIEW_LEFT_GRAY, sl::MEM_GPU);
+			zed.retrieveImage(d_zed_image_r, sl::VIEW_RIGHT_GRAY, sl::MEM_GPU);
+		} else continue;
+
+		const auto t1 = std::chrono::system_clock::now();
+
+		sgm.execute(d_zed_image_l.getPtr<uchar>(sl::MEM_GPU), d_zed_image_r.getPtr<uchar>(sl::MEM_GPU), d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::system_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		cudaMemcpy(disparity.data, d_disparity.data, output_bytes, cudaMemcpyDeviceToHost);
+
+		disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
+		cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_JET);
+		cv::putText(disparity_color, format_string("sgm execution time: %4.1f[msec] %4.1f[FPS]", 1e-3 * duration, fps),
+			cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		cv::imshow("disparity", disparity_color);
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+	zed.close();
+	return 0;
+}
diff --git a/lib/libsgm/src/CMakeLists.txt b/lib/libsgm/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..89e5035877010c146dd4d0d988b263a9ccddf9df
--- /dev/null
+++ b/lib/libsgm/src/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 3.1)
+
+find_package(CUDA REQUIRED)
+
+include_directories(../include)
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall -fPIC")
+	set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
+endif()
+
+SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH}")
+
+file(GLOB STEREOSRCS "*.cu" "*.cpp")
+
+if(LIBSGM_SHARED)
+	CUDA_ADD_LIBRARY(sgm stereo_sgm.cpp ${STEREOSRCS} SHARED)
+	target_link_libraries(sgm ${CUDA_LIBRARIES})
+	if(BUILD_OPENCV_WRAPPER)
+		target_link_libraries(sgm ${OpenCV_LIBS})
+	endif()
+else()
+	CUDA_ADD_LIBRARY(sgm stereo_sgm.cpp ${STEREOSRCS} STATIC)
+endif()
+
+install(
+	TARGETS sgm
+	ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+	LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+	RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+)
+
+install(
+	DIRECTORY ${CMAKE_SOURCE_DIR}/include
+	DESTINATION ${CMAKE_INSTALL_PREFIX}
+	FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp"
+)
+
+install(
+	FILES ${CMAKE_SOURCE_DIR}/FindLibSGM.cmake
+	DESTINATION ${CMAKE_INSTALL_PREFIX}
+)
diff --git a/lib/libsgm/src/census_transform.cu b/lib/libsgm/src/census_transform.cu
new file mode 100644
index 0000000000000000000000000000000000000000..70f67bcffd1eb2540c1f6c64da0ced5483bca2c9
--- /dev/null
+++ b/lib/libsgm/src/census_transform.cu
@@ -0,0 +1,150 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <cstdio>
+#include "census_transform.hpp"
+
+namespace sgm {
+
+namespace {
+
+static constexpr int WINDOW_WIDTH  = 9;
+static constexpr int WINDOW_HEIGHT = 7;
+
+static constexpr int BLOCK_SIZE = 128;
+static constexpr int LINES_PER_BLOCK = 16;
+
+template <typename T>
+__global__ void census_transform_kernel(
+	feature_type *dest,
+	const T *src,
+	int width,
+	int height,
+	int pitch)
+{
+	using pixel_type = T;
+	static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1;
+
+	const int half_kw = WINDOW_WIDTH  / 2;
+	const int half_kh = WINDOW_HEIGHT / 2;
+
+	__shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE];
+
+	const int tid = threadIdx.x;
+	const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw;
+	const int y0 = blockIdx.y * LINES_PER_BLOCK;
+
+	for(int i = 0; i < WINDOW_HEIGHT; ++i){
+		const int x = x0 + tid, y = y0 - half_kh + i;
+		pixel_type value = 0;
+		if(0 <= x && x < width && 0 <= y && y < height){
+			value = src[x + y * pitch];
+		}
+		smem_lines[i][tid] = value;
+	}
+	__syncthreads();
+
+#pragma unroll
+	for(int i = 0; i < LINES_PER_BLOCK; ++i){
+		if(i + 1 < LINES_PER_BLOCK){
+			// Load to smem
+			const int x = x0 + tid, y = y0 + half_kh + i + 1;
+			pixel_type value = 0;
+			if(0 <= x && x < width && 0 <= y && y < height){
+				value = src[x + y * pitch];
+			}
+			const int smem_x = tid;
+			const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE;
+			smem_lines[smem_y][smem_x] = value;
+		}
+
+		if(half_kw <= tid && tid < BLOCK_SIZE - half_kw){
+			// Compute and store
+			const int x = x0 + tid, y = y0 + i;
+			if(half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh){
+				const int smem_x = tid;
+				const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE;
+				feature_type f = 0;
+				for(int dy = -half_kh; dy < 0; ++dy){
+					const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
+					const int smem_y2 = (smem_y - dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
+					for(int dx = -half_kw; dx <= half_kw; ++dx){
+						const int smem_x1 = smem_x + dx;
+						const int smem_x2 = smem_x - dx;
+						const auto a = smem_lines[smem_y1][smem_x1];
+						const auto b = smem_lines[smem_y2][smem_x2];
+						f = (f << 1) | (a > b);
+					}
+				}
+				for(int dx = -half_kw; dx < 0; ++dx){
+					const int smem_x1 = smem_x + dx;
+					const int smem_x2 = smem_x - dx;
+					const auto a = smem_lines[smem_y][smem_x1];
+					const auto b = smem_lines[smem_y][smem_x2];
+					f = (f << 1) | (a > b);
+				}
+				dest[x + y * width] = f;
+			}
+		}
+		__syncthreads();
+	}
+}
+
+template <typename T>
+void enqueue_census_transform(
+	feature_type *dest,
+	const T *src,
+	int width,
+	int height,
+	int pitch,
+	cudaStream_t stream)
+{
+	const int width_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
+	const int height_per_block = LINES_PER_BLOCK;
+	const dim3 gdim(
+		(width  + width_per_block  - 1) / width_per_block,
+		(height + height_per_block - 1) / height_per_block);
+	const dim3 bdim(BLOCK_SIZE);
+	census_transform_kernel<<<gdim, bdim, 0, stream>>>(dest, src, width, height, pitch);
+}
+
+}
+
+
+template <typename T>
+CensusTransform<T>::CensusTransform()
+	: m_feature_buffer()
+{ }
+
+template <typename T>
+void CensusTransform<T>::enqueue(
+	const input_type *src,
+	int width,
+	int height,
+	int pitch,
+	cudaStream_t stream)
+{
+	if(m_feature_buffer.size() < static_cast<size_t>(width * height)){
+		m_feature_buffer = DeviceBuffer<feature_type>(width * height);
+	}
+	enqueue_census_transform(
+		m_feature_buffer.data(), src, width, height, pitch, stream);
+}
+
+template class CensusTransform<uint8_t>;
+template class CensusTransform<uint16_t>;
+
+}
diff --git a/lib/libsgm/src/census_transform.hpp b/lib/libsgm/src/census_transform.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a80b903b4b20152c0c8fad7fada8032d60a1565
--- /dev/null
+++ b/lib/libsgm/src/census_transform.hpp
@@ -0,0 +1,52 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_CENSUS_TRANSFORM_HPP
+#define SGM_CENSUS_TRANSFORM_HPP
+
+#include "device_buffer.hpp"
+#include "types.hpp"
+
+namespace sgm {
+
+template <typename T>
+class CensusTransform {
+
+public:
+	using input_type = T;
+
+private:
+	DeviceBuffer<feature_type> m_feature_buffer;
+
+public:
+	CensusTransform();
+
+	const feature_type *get_output() const {
+		return m_feature_buffer.data();
+	}
+	
+	void enqueue(
+		const input_type *src,
+		int width,
+		int height,
+		int pitch,
+		cudaStream_t stream);
+
+};
+
+}
+
+#endif
diff --git a/lib/libsgm/src/check_consistency.cu b/lib/libsgm/src/check_consistency.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f0a58247b56d6062ae47bedbf5c4e7554c73c82e
--- /dev/null
+++ b/lib/libsgm/src/check_consistency.cu
@@ -0,0 +1,77 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <libsgm.h>
+#include "internal.h"
+
+namespace {
+	template<typename SRC_T, typename DST_T>
+	__global__ void check_consistency_kernel(DST_T* d_leftDisp, const DST_T* d_rightDisp, const uint8_t* d_mask, int width, int height, int src_pitch, int dst_pitch, bool subpixel)  {
+
+		const int j = blockIdx.x * blockDim.x + threadIdx.x;
+		const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+		// left-right consistency check, only on leftDisp, but could be done for rightDisp too
+
+		uint8_t mask = d_mask[i * src_pitch + j];
+		int d = d_leftDisp[i * dst_pitch + j];
+		if (subpixel) {
+			d >>= sgm::StereoSGM::SUBPIXEL_SHIFT;
+		}
+		int k = j - d;
+		if (mask != 0 || d <= 0 || (k >= 0 && k < width)) {
+			int diff = abs(d_rightDisp[i * dst_pitch + k] - d);
+			if (mask != 0 || diff > 1) {
+				// masked or left-right inconsistent pixel -> invalid
+				d_leftDisp[i * dst_pitch + j] = (256 << (sgm::StereoSGM::SUBPIXEL_SHIFT+1));
+			}
+		}
+	}
+}
+
+namespace sgm {
+	namespace details {
+
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) {
+
+			const dim3 blocks(width / 16, height / 16);
+			const dim3 threads(16, 16);
+			if (depth_bits == 16) {
+				check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+			}
+			else if (depth_bits == 8) {
+				check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+			}
+
+			CudaKernelCheck();
+		}
+
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel) {
+
+			const dim3 blocks(width / 16, height / 16);
+			const dim3 threads(16, 16);
+			if (depth_bits == 16) {
+				check_consistency_kernel<uint16_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+			}
+			else if (depth_bits == 8) {
+				check_consistency_kernel<uint8_t> << < blocks, threads >> > (d_left_disp, d_right_disp, d_mask, width, height, src_pitch, dst_pitch, subpixel);
+			}
+			
+			CudaKernelCheck();	
+		}
+
+	}
+}
diff --git a/lib/libsgm/src/cuda_utils.cu b/lib/libsgm/src/cuda_utils.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5c6ddf1b8b3e915cfcf4aa1ee15a09fdc41344cc
--- /dev/null
+++ b/lib/libsgm/src/cuda_utils.cu
@@ -0,0 +1,55 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+namespace {
+
+	__global__ void cast_16bit_8bit_array_kernel(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements) {
+		int i = blockIdx.x * blockDim.x + threadIdx.x;
+		arr8bits[i] = (uint8_t)arr16bits[i];
+	}
+
+	__global__ void cast_8bit_16bit_array_kernel(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements) {
+		int i = blockIdx.x * blockDim.x + threadIdx.x;
+		arr16bits[i] = (uint16_t)arr8bits[i];
+	}
+
+}
+
+namespace sgm {
+	namespace details {
+
+		void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements) {
+			for (int mod = 1024; mod != 0; mod >>= 1) {
+				if (num_elements % mod == 0) {
+					cast_16bit_8bit_array_kernel << <num_elements / mod, mod >> >(arr16bits, arr8bits, num_elements);
+					break;
+				}
+			}
+		}
+
+		void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements) {
+			for (int mod = 1024; mod != 0; mod >>= 1) {
+				if (num_elements % mod == 0) {
+					cast_8bit_16bit_array_kernel << <num_elements / mod, mod >> >(arr8bits, arr16bits, num_elements);
+					break;
+				}
+			}
+		}
+
+	}
+}
diff --git a/lib/libsgm/src/device_buffer.hpp b/lib/libsgm/src/device_buffer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..82b4829d1cf6ec4eead42837759d2830e286753d
--- /dev/null
+++ b/lib/libsgm/src/device_buffer.hpp
@@ -0,0 +1,90 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_DEVICE_BUFFER_HPP
+#define SGM_DEVICE_BUFFER_HPP
+
+#include <cstddef>
+
+namespace sgm {
+
+template <typename T>
+class DeviceBuffer {
+
+public:
+	using value_type = T;
+
+private:
+	T *m_data;
+	size_t m_size;
+
+public:
+	DeviceBuffer()
+		: m_data(nullptr)
+		, m_size(0)
+	{ }
+
+	explicit DeviceBuffer(size_t n)
+		: m_data(nullptr)
+		, m_size(n)
+	{
+		cudaMalloc(reinterpret_cast<void **>(&m_data), sizeof(T) * n);
+	}
+
+	DeviceBuffer(const DeviceBuffer&) = delete;
+
+	DeviceBuffer(DeviceBuffer&& obj)
+		: m_data(obj.m_data)
+		, m_size(obj.m_size)
+	{
+		obj.m_data = nullptr;
+		obj.m_size = 0;
+	}
+
+	~DeviceBuffer(){
+		cudaFree(m_data);
+	}
+
+
+	DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+	DeviceBuffer& operator=(DeviceBuffer&& obj){
+		if (m_data != nullptr) cudaFree(m_data);
+		m_data = obj.m_data;
+		m_size = obj.m_size;
+		obj.m_data = nullptr;
+		obj.m_size = 0;
+		return *this;
+	}
+
+
+	size_t size() const {
+		return m_size;
+	}
+
+	const value_type *data() const {
+		return m_data;
+	}
+
+	value_type *data(){
+		return m_data;
+	}
+
+};
+
+}
+
+#endif
diff --git a/lib/libsgm/src/horizontal_path_aggregation.cu b/lib/libsgm/src/horizontal_path_aggregation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a9f1a87f7e95e0212c1dd7d37f7f4e79edc2440
--- /dev/null
+++ b/lib/libsgm/src/horizontal_path_aggregation.cu
@@ -0,0 +1,262 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <cstdio>
+#include "horizontal_path_aggregation.hpp"
+#include "path_aggregation_common.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+static constexpr unsigned int DP_BLOCK_SIZE = 8u;
+static constexpr unsigned int DP_BLOCKS_PER_THREAD = 1u;
+
+static constexpr unsigned int WARPS_PER_BLOCK = 4u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;
+
+
+template <int DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_horizontal_path_kernel(
+	uint8_t *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_WARP =
+		WARP_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+
+	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
+	if(width == 0 || height == 0){
+		return;
+	}
+
+	feature_type right_buffer[DP_BLOCKS_PER_THREAD][DP_BLOCK_SIZE];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp[DP_BLOCKS_PER_THREAD];
+
+	const unsigned int warp_id  = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id  = threadIdx.x % SUBGROUP_SIZE;
+	const unsigned int shfl_mask =
+		((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE);
+
+	const unsigned int y0 =
+		PATHS_PER_BLOCK * blockIdx.x +
+		PATHS_PER_WARP  * warp_id +
+		group_id;
+	const unsigned int feature_step = SUBGROUPS_PER_WARP * width;
+	const unsigned int dest_step = SUBGROUPS_PER_WARP * MAX_DISPARITY * width;
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+	left  += y0 * width;
+	right += y0 * width;
+	dest  += y0 * MAX_DISPARITY * width;
+
+	if(y0 >= height){
+		return;
+	}
+
+	if(DIRECTION > 0){
+		for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				right_buffer[i][j] = 0;
+			}
+		}
+	}else{
+		for(unsigned int i = 0; i < DP_BLOCKS_PER_THREAD; ++i){
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				const int x = static_cast<int>(width - (j + dp_offset));
+				if(0 <= x && x < static_cast<int>(width)){
+					right_buffer[i][j] = __ldg(&right[i * feature_step + x]);
+				}else{
+					right_buffer[i][j] = 0;
+				}
+			}
+		}
+	}
+
+	int x0 = (DIRECTION > 0) ? 0 : static_cast<int>((width - 1) & ~(DP_BLOCK_SIZE - 1));
+	for(unsigned int iter = 0; iter < width; iter += DP_BLOCK_SIZE){
+		for(unsigned int i = 0; i < DP_BLOCK_SIZE; ++i){
+			const unsigned int x = x0 + (DIRECTION > 0 ? i : (DP_BLOCK_SIZE - 1 - i));
+			if(x >= width){
+				continue;
+			}
+			for(unsigned int j = 0; j < DP_BLOCKS_PER_THREAD; ++j){
+				const unsigned int y = y0 + j * SUBGROUPS_PER_WARP;
+				if(y >= height){
+					continue;
+				}
+				const feature_type left_value = __ldg(&left[j * feature_step + x]);
+				if(DIRECTION > 0){
+					const feature_type t = right_buffer[j][DP_BLOCK_SIZE - 1];
+					for(unsigned int k = DP_BLOCK_SIZE - 1; k > 0; --k){
+						right_buffer[j][k] = right_buffer[j][k - 1];
+					}
+#if CUDA_VERSION >= 9000
+					right_buffer[j][0] = __shfl_up_sync(shfl_mask, t, 1, SUBGROUP_SIZE);
+#else
+					right_buffer[j][0] = __shfl_up(t, 1, SUBGROUP_SIZE);
+#endif
+					if(lane_id == 0){
+						right_buffer[j][0] =
+							__ldg(&right[j * feature_step + x - dp_offset]);
+					}
+				}else{
+					const feature_type t = right_buffer[j][0];
+					for(unsigned int k = 1; k < DP_BLOCK_SIZE; ++k){
+						right_buffer[j][k - 1] = right_buffer[j][k];
+					}
+#if CUDA_VERSION >= 9000
+					right_buffer[j][DP_BLOCK_SIZE - 1] =
+						__shfl_down_sync(shfl_mask, t, 1, SUBGROUP_SIZE);
+#else
+					right_buffer[j][DP_BLOCK_SIZE - 1] = __shfl_down(t, 1, SUBGROUP_SIZE);
+#endif
+					if(lane_id + 1 == SUBGROUP_SIZE){
+						if(x >= dp_offset + DP_BLOCK_SIZE - 1){
+							right_buffer[j][DP_BLOCK_SIZE - 1] =
+								__ldg(&right[j * feature_step + x - (dp_offset + DP_BLOCK_SIZE - 1)]);
+						}else{
+							right_buffer[j][DP_BLOCK_SIZE - 1] = 0;
+						}
+					}
+				}
+				uint32_t local_costs[DP_BLOCK_SIZE];
+				for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){
+					local_costs[k] = __popc(left_value ^ right_buffer[j][k]);
+				}
+				dp[j].update(local_costs, p1, p2, shfl_mask);
+				store_uint8_vector<DP_BLOCK_SIZE>(
+					&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
+					dp[j].dp);
+			}
+		}
+		x0 += static_cast<int>(DP_BLOCK_SIZE) * DIRECTION;
+	}
+}
+
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_left2right_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+
+	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_right2left_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+
+	const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+
+template void enqueue_aggregate_left2right_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_left2right_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_left2right_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_right2left_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_right2left_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_right2left_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
diff --git a/lib/libsgm/src/horizontal_path_aggregation.hpp b/lib/libsgm/src/horizontal_path_aggregation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce825e5fa1c56d9daa904285677f1257b9b172d0
--- /dev/null
+++ b/lib/libsgm/src/horizontal_path_aggregation.hpp
@@ -0,0 +1,50 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_HORIZONTAL_PATH_AGGREGATION_HPP
+#define SGM_HORIZONTAL_PATH_AGGREGATION_HPP
+
+#include "types.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_left2right_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_right2left_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
+
+#endif
diff --git a/lib/libsgm/src/internal.h b/lib/libsgm/src/internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..577101728e33b528a9def320d21f5785b4319ac2
--- /dev/null
+++ b/lib/libsgm/src/internal.h
@@ -0,0 +1,54 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include <stdexcept>
+
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#define CudaSafeCall(error) sgm::details::cuda_safe_call(error, __FILE__, __LINE__)
+
+#define CudaKernelCheck() CudaSafeCall(cudaGetLastError())
+
+namespace sgm {
+	namespace details {
+
+		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch);
+		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch);
+
+		void check_consistency(uint8_t* d_left_disp, const uint8_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel);
+		void check_consistency(uint16_t* d_left_disp, const uint16_t* d_right_disp, const uint8_t* d_mask, int width, int height, int depth_bits, int src_pitch, int dst_pitch, bool subpixel);
+
+		void cast_16bit_8bit_array(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements);
+		void cast_8bit_16bit_array(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements);
+
+		inline void cuda_safe_call(cudaError error, const char *file, const int line)
+		{
+			if (error != cudaSuccess) {
+				fprintf(stderr, "cuda error %s : %d %s\n", file, line, cudaGetErrorString(error));
+				exit(-1);
+			}
+		}
+
+	}
+}
diff --git a/lib/libsgm/src/libsgm_wrapper.cpp b/lib/libsgm/src/libsgm_wrapper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b78f3c6ad5c5bac4619690eab3f366ff81214de7
--- /dev/null
+++ b/lib/libsgm/src/libsgm_wrapper.cpp
@@ -0,0 +1,119 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <libsgm_wrapper.h>
+
+namespace sgm {
+	LibSGMWrapper::LibSGMWrapper(int numDisparity, int P1, int P2, float uniquenessRatio, bool subpixel)
+		: sgm_(nullptr), numDisparity_(numDisparity), param_(P1, P2, uniquenessRatio, subpixel), prev_(nullptr) {}
+	LibSGMWrapper::~LibSGMWrapper() = default;
+
+	int LibSGMWrapper::getNumDisparities() const { return numDisparity_; }
+	float LibSGMWrapper::getUniquenessRatio() const { return param_.uniqueness; }
+	int LibSGMWrapper::getP1() const { return param_.P1; }
+	int LibSGMWrapper::getP2() const { return param_.P2; }
+	bool LibSGMWrapper::hasSubpixel() const { return param_.subpixel; }
+
+	struct LibSGMWrapper::Creator {
+		int width;
+		int height;
+		int src_pitch;
+		int dst_pitch;
+		int input_depth_bits;
+		int output_depth_bits;
+		sgm::EXECUTE_INOUT inout_type;
+
+		bool operator==(const Creator& rhs) const {
+			return
+				width == rhs.width
+				&& height == rhs.height
+				&& src_pitch == rhs.src_pitch
+				&& dst_pitch == rhs.dst_pitch
+				&& input_depth_bits == rhs.input_depth_bits
+				&& output_depth_bits == rhs.output_depth_bits
+				&& inout_type == rhs.inout_type;
+		}
+		bool operator!=(const Creator& rhs) const {
+			return !(*this == rhs);
+		}
+
+		StereoSGM* createStereoSGM(int disparity_size, const StereoSGM::Parameters& param) {
+			return new StereoSGM(width, height, disparity_size, input_depth_bits, output_depth_bits, src_pitch, dst_pitch, inout_type, param);
+		}
+
+#ifdef BUILD_OPENCV_WRAPPER
+		Creator(const cv::cuda::GpuMat& src, const cv::cuda::GpuMat& dst) {
+			const int depth = src.depth();
+			CV_Assert(depth == CV_8U || depth == CV_16U);
+			width = src.cols;
+			height = src.rows;
+			src_pitch = static_cast<int>(src.step1());
+			dst_pitch = static_cast<int>(dst.step1());
+			input_depth_bits = static_cast<int>(src.elemSize1()) * 8;
+			output_depth_bits = static_cast<int>(dst.elemSize1()) * 8;
+			inout_type = sgm::EXECUTE_INOUT_CUDA2CUDA;
+		}
+		Creator(const cv::Mat& src, const cv::Mat& dst) {
+			const int depth = src.depth();
+			CV_Assert(depth == CV_8U || depth == CV_16U);
+			width = src.cols;
+			height = src.rows;
+			src_pitch = static_cast<int>(src.step1());
+			dst_pitch = static_cast<int>(dst.step1());
+			input_depth_bits = static_cast<int>(src.elemSize1()) * 8;
+			output_depth_bits = static_cast<int>(dst.elemSize1()) * 8;
+			inout_type = sgm::EXECUTE_INOUT_HOST2HOST;
+		}
+#endif // BUILD_OPRENCV_WRAPPER
+	};
+
+#ifdef BUILD_OPENCV_WRAPPER
+	void LibSGMWrapper::execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity) {
+		const cv::Size size = I1.size();
+		CV_Assert(size == I2.size());
+		CV_Assert(I1.type() == I2.type());
+		const int depth = I1.depth();
+		CV_Assert(depth == CV_8U || depth == CV_16U);
+		if (disparity.size() != size || disparity.depth() != CV_16U) {
+			disparity.create(size, CV_16U);
+		}
+		std::unique_ptr<Creator> creator(new Creator(I1, disparity));
+		if (!sgm_ || !prev_ || *creator != *prev_) {
+			sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
+		}
+		prev_ = std::move(creator);
+
+		sgm_->execute(I1.data, I2.data, disparity.data);
+	}
+	void LibSGMWrapper::execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity) {
+		const cv::Size size = I1.size();
+		CV_Assert(size == I2.size());
+		CV_Assert(I1.type() == I2.type());
+		const int depth = I1.depth();
+		CV_Assert(depth == CV_8U || depth == CV_16U);
+		if (disparity.size() != size || disparity.depth() != CV_16U) {
+			disparity.create(size, CV_16U);
+		}
+		std::unique_ptr<Creator> creator(new Creator(I1, disparity));
+		if (!sgm_ || !prev_ || *creator != *prev_) {
+			sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
+		}
+		prev_ = std::move(creator);
+
+		sgm_->execute(I1.data, I2.data, disparity.data);
+	}
+#endif // BUILD_OPENCV_WRAPPER
+}
diff --git a/lib/libsgm/src/median_filter.cu b/lib/libsgm/src/median_filter.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fffe24f0422a984ebb919bf188431c7e9994f148
--- /dev/null
+++ b/lib/libsgm/src/median_filter.cu
@@ -0,0 +1,292 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+namespace {
+
+	const int BLOCK_X = 16;
+	const int BLOCK_Y = 16;
+	const int KSIZE = 3;
+	const int RADIUS = KSIZE / 2;
+	const int KSIZE_SQ = KSIZE * KSIZE;
+
+	inline int divup(int total, int grain)
+	{
+		return (total + grain - 1) / grain;
+	}
+
+	template <typename T>
+	__device__ inline void swap(T& x, T& y)
+	{
+		T tmp(x);
+		x = y;
+		y = tmp;
+	}
+
+	// sort, min, max of 1 element
+	template <typename T, int V = 1> __device__ inline void dev_sort(T& x, T& y) { if (x > y) swap(x, y); }
+	template <typename T, int V = 1> __device__ inline void dev_min(T& x, T& y) { x = min(x, y); }
+	template <typename T, int V = 1> __device__ inline void dev_max(T& x, T& y) { y = max(x, y); }
+
+	// sort, min, max of 2 elements
+	__device__ inline void dev_sort_2(uint32_t& x, uint32_t& y)
+	{
+		const uint32_t mask = __vcmpgtu2(x, y);
+		const uint32_t tmp = (x ^ y) & mask;
+		x ^= tmp;
+		y ^= tmp;
+	}
+	__device__ inline void dev_min_2(uint32_t& x, uint32_t& y) { x = __vminu2(x, y); }
+	__device__ inline void dev_max_2(uint32_t& x, uint32_t& y) { y = __vmaxu2(x, y); }
+
+	template <> __device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_sort_2(x, y); }
+	template <> __device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_min_2(x, y); }
+	template <> __device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y) { dev_max_2(x, y); }
+
+	// sort, min, max of 4 elements
+	__device__ inline void dev_sort_4(uint32_t& x, uint32_t& y)
+	{
+		const uint32_t mask = __vcmpgtu4(x, y);
+		const uint32_t tmp = (x ^ y) & mask;
+		x ^= tmp;
+		y ^= tmp;
+	}
+	__device__ inline void dev_min_4(uint32_t& x, uint32_t& y) { x = __vminu4(x, y); }
+	__device__ inline void dev_max_4(uint32_t& x, uint32_t& y) { y = __vmaxu4(x, y); }
+
+	template <> __device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_sort_4(x, y); }
+	template <> __device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_min_4(x, y); }
+	template <> __device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y) { dev_max_4(x, y); }
+
+	template <typename T, int V = 1>
+	__device__ inline void median_selection_network_9(T* buf)
+	{
+#define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j])
+#define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j])
+#define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j])
+
+		SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
+		SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8);
+		SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
+		MAX_OP(0, 3); MAX_OP(3, 6);
+		SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4);
+		MIN_OP(5, 8); MIN_OP(2, 5);
+		SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4);
+
+#undef SWAP_OP
+#undef MIN_OP
+#undef MAX_OP
+	}
+
+	__global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+		if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS)
+			return;
+
+		uint8_t buf[KSIZE_SQ];
+		for (int i = 0; i < KSIZE_SQ; i++)
+			buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)];
+
+		median_selection_network_9(buf);
+
+		dst[y * p + x] = buf[KSIZE_SQ / 2];
+	}
+
+	__global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p)
+	{
+		const int x = blockIdx.x * blockDim.x + threadIdx.x;
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+		if (x < RADIUS || x >= w - RADIUS || y < RADIUS || y >= h - RADIUS)
+			return;
+
+		uint16_t buf[KSIZE_SQ];
+		for (int i = 0; i < KSIZE_SQ; i++)
+			buf[i] = src[(y - RADIUS + i / KSIZE) * p + (x - RADIUS + i % KSIZE)];
+
+		median_selection_network_9(buf);
+
+		dst[y * p + x] = buf[KSIZE_SQ / 2];
+	}
+
+	__global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch)
+	{
+		const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (y < RADIUS || y >= h - RADIUS)
+			return;
+
+		uint32_t buf[KSIZE_SQ];
+		if (x_4 >= 4 && x_4 + 7 < w)
+		{
+			buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]);
+			buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]);
+			buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]);
+
+			buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]);
+			buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]);
+			buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]);
+
+			buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]);
+			buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]);
+			buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]);
+
+			buf[0] = (buf[1] << 8) | (buf[0] >> 24);
+			buf[2] = (buf[1] >> 8) | (buf[2] << 24);
+
+			buf[3] = (buf[4] << 8) | (buf[3] >> 24);
+			buf[5] = (buf[4] >> 8) | (buf[5] << 24);
+
+			buf[6] = (buf[7] << 8) | (buf[6] >> 24);
+			buf[8] = (buf[7] >> 8) | (buf[8] << 24);
+
+			median_selection_network_9<uint32_t, 4>(buf);
+
+			*((uint32_t*)&dst[y * pitch + x_4]) = buf[KSIZE_SQ / 2];
+		}
+		else if (x_4 == 0)
+		{
+			for (int x = RADIUS; x < 4; x++)
+			{
+				uint8_t* buf_u8 = (uint8_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+
+				median_selection_network_9(buf_u8);
+
+				dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
+			}
+		}
+		else if (x_4 < w)
+		{
+			for (int x = x_4; x < min(x_4 + 4, w - RADIUS); x++)
+			{
+				uint8_t* buf_u8 = (uint8_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+
+				median_selection_network_9(buf_u8);
+
+				dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
+			}
+		}
+	}
+
+	__global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch)
+	{
+		const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
+		const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+		if (y < RADIUS || y >= h - RADIUS)
+			return;
+
+		uint32_t buf[KSIZE_SQ];
+		if (x_2 >= 2 && x_2 + 3 < w)
+		{
+			buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 2]);
+			buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]);
+			buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]);
+
+			buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]);
+			buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]);
+			buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]);
+
+			buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]);
+			buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]);
+			buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]);
+
+			buf[0] = (buf[1] << 16) | (buf[0] >> 16);
+			buf[2] = (buf[1] >> 16) | (buf[2] << 16);
+
+			buf[3] = (buf[4] << 16) | (buf[3] >> 16);
+			buf[5] = (buf[4] >> 16) | (buf[5] << 16);
+
+			buf[6] = (buf[7] << 16) | (buf[6] >> 16);
+			buf[8] = (buf[7] >> 16) | (buf[8] << 16);
+
+			median_selection_network_9<uint32_t, 2>(buf);
+
+			*((uint32_t*)&dst[y * pitch + x_2]) = buf[KSIZE_SQ / 2];
+		}
+		else if (x_2 == 0)
+		{
+			for (int x = RADIUS; x < 2; x++)
+			{
+				uint8_t* buf_u8 = (uint8_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+
+				median_selection_network_9(buf_u8);
+
+				dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
+			}
+		}
+		else if (x_2 < w)
+		{
+			for (int x = x_2; x < min(x_2 + 2, w - RADIUS); x++)
+			{
+				uint8_t* buf_u8 = (uint8_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+
+				median_selection_network_9(buf_u8);
+
+				dst[y * pitch + x] = buf_u8[KSIZE_SQ / 2];
+			}
+		}
+	}
+}
+
+namespace sgm {
+	namespace details {
+
+		void median_filter(const uint8_t* d_src, uint8_t* d_dst, int width, int height, int pitch) {
+
+			if (pitch % 4 == 0) {
+				const dim3 block(BLOCK_X, BLOCK_Y);
+				const dim3 grid(divup(width / 4, block.x), divup(height, block.y));
+				median_kernel_3x3_8u_v4<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+			}
+			else {
+				const dim3 block(BLOCK_X, BLOCK_Y);
+				const dim3 grid(divup(width, block.x), divup(height, block.y));
+				median_kernel_3x3_8u<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+			}
+
+			CudaSafeCall(cudaGetLastError());
+		}
+
+		void median_filter(const uint16_t* d_src, uint16_t* d_dst, int width, int height, int pitch) {
+			
+			if (pitch % 2 == 0) {
+				const dim3 block(BLOCK_X, BLOCK_Y);
+				const dim3 grid(divup(width / 2, block.x), divup(height, block.y));
+				median_kernel_3x3_16u_v2<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+			}
+			else {
+				const dim3 block(BLOCK_X, BLOCK_Y);
+				const dim3 grid(divup(width, block.x), divup(height, block.y));
+				median_kernel_3x3_16u<<<grid, block>>>(d_src, d_dst, width, height, pitch);
+			}
+
+			CudaSafeCall(cudaGetLastError());
+		}
+
+	}
+}
diff --git a/lib/libsgm/src/oblique_path_aggregation.cu b/lib/libsgm/src/oblique_path_aggregation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..21785c947e4e0ed91273eecb92cfb5ffa8bb767b
--- /dev/null
+++ b/lib/libsgm/src/oblique_path_aggregation.cu
@@ -0,0 +1,319 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <cstdio>
+#include "oblique_path_aggregation.hpp"
+#include "path_aggregation_common.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+static constexpr unsigned int DP_BLOCK_SIZE = 16u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
+
+template <int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_oblique_path_kernel(
+	uint8_t *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
+	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
+
+	static_assert(X_DIRECTION == 1 || X_DIRECTION == -1, "");
+	static_assert(Y_DIRECTION == 1 || Y_DIRECTION == -1, "");
+	if(width == 0 || height == 0){
+		return;
+	}
+
+	__shared__ feature_type right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
+
+	const unsigned int warp_id  = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id  = threadIdx.x % SUBGROUP_SIZE;
+	const unsigned int shfl_mask =
+		((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE);
+
+	const int x0 =
+		blockIdx.x * PATHS_PER_BLOCK +
+		warp_id * PATHS_PER_WARP +
+		group_id +
+		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
+	const int right_x00 =
+		blockIdx.x * PATHS_PER_BLOCK +
+		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+
+	const unsigned int right0_addr =
+		static_cast<unsigned int>(right_x00 + PATHS_PER_BLOCK - 1 - x0) + dp_offset;
+	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
+	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
+
+	for(unsigned int iter = 0; iter < height; ++iter){
+		const int y = static_cast<int>(Y_DIRECTION > 0 ? iter : height - 1 - iter);
+		const int x = x0 + static_cast<int>(iter) * X_DIRECTION;
+		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION;
+		// Load right to smem
+		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
+			const unsigned int i = i0 + threadIdx.x;
+			if(i < RIGHT_BUFFER_SIZE){
+				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i);
+				feature_type right_value = 0;
+				if(0 <= x && x < static_cast<int>(width)){
+					right_value = right[x + y * width];
+				}
+				const unsigned int lo = i % DP_BLOCK_SIZE;
+				const unsigned int hi = i / DP_BLOCK_SIZE;
+				right_buffer[lo][hi] = right_value;
+				if(hi > 0){
+					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
+				}
+			}
+		}
+		__syncthreads();
+		// Compute
+		if(0 <= x && x < static_cast<int>(width)){
+			const feature_type left_value = __ldg(&left[x + y * width]);
+			feature_type right_values[DP_BLOCK_SIZE];
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
+			}
+			uint32_t local_costs[DP_BLOCK_SIZE];
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				local_costs[j] = __popc(left_value ^ right_values[j]);
+			}
+			dp.update(local_costs, p1, p2, shfl_mask);
+			store_uint8_vector<DP_BLOCK_SIZE>(
+				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
+				dp.dp);
+		}
+		__syncthreads();
+	}
+}
+
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_upleft2downright_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_upright2downleft_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_downright2upleft_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_downleft2upright_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+
+template void enqueue_aggregate_upleft2downright_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upleft2downright_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_upleft2downright_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upright2downleft_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_upright2downleft_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_upright2downleft_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downright2upleft_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downright2upleft_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_downright2upleft_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downleft2upright_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_downleft2upright_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_downleft2upright_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
diff --git a/lib/libsgm/src/oblique_path_aggregation.hpp b/lib/libsgm/src/oblique_path_aggregation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4f949b1e3f95fbccd42184f3e27d354ebe4ac58
--- /dev/null
+++ b/lib/libsgm/src/oblique_path_aggregation.hpp
@@ -0,0 +1,72 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_OBLIQUE_PATH_AGGREGATION_HPP
+#define SGM_OBLIQUE_PATH_AGGREGATION_HPP
+
+#include "types.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_upleft2downright_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_upright2downleft_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_downright2upleft_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_downleft2upright_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
+
+#endif
diff --git a/lib/libsgm/src/path_aggregation.cu b/lib/libsgm/src/path_aggregation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..117cac507b1dded2c42415f968cec9e38d7674fe
--- /dev/null
+++ b/lib/libsgm/src/path_aggregation.cu
@@ -0,0 +1,94 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "path_aggregation.hpp"
+#include "vertical_path_aggregation.hpp"
+#include "horizontal_path_aggregation.hpp"
+#include "oblique_path_aggregation.hpp"
+
+namespace sgm {
+
+template <size_t MAX_DISPARITY>
+PathAggregation<MAX_DISPARITY>::PathAggregation()
+	: m_cost_buffer()
+{
+	for(unsigned int i = 0; i < NUM_PATHS; ++i){
+		cudaStreamCreate(&m_streams[i]);
+		cudaEventCreate(&m_events[i]);
+	}
+}
+
+template <size_t MAX_DISPARITY>
+PathAggregation<MAX_DISPARITY>::~PathAggregation(){
+	for(unsigned int i = 0; i < NUM_PATHS; ++i){
+		cudaStreamSynchronize(m_streams[i]);
+		cudaStreamDestroy(m_streams[i]);
+		cudaEventDestroy(m_events[i]);
+	}
+}
+
+template <size_t MAX_DISPARITY>
+void PathAggregation<MAX_DISPARITY>::enqueue(
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
+	if(m_cost_buffer.size() < buffer_size){
+		m_cost_buffer = DeviceBuffer<cost_type>(buffer_size);
+	}
+	const size_t buffer_step = width * height * MAX_DISPARITY;
+	cudaStreamSynchronize(stream);
+	path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 0 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[0]);
+	path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 1 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[1]);
+	path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 2 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[2]);
+	path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 3 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[3]);
+	path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 4 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[4]);
+	path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 5 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[5]);
+	path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 6 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[6]);
+	path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
+		m_cost_buffer.data() + 7 * buffer_step,
+		left, right, width, height, p1, p2, m_streams[7]);
+	for(unsigned int i = 0; i < NUM_PATHS; ++i){
+		cudaEventRecord(m_events[i], m_streams[i]);
+		cudaStreamWaitEvent(stream, m_events[i], 0);
+	}
+}
+
+
+template class PathAggregation< 64>;
+template class PathAggregation<128>;
+template class PathAggregation<256>;
+
+}
diff --git a/lib/libsgm/src/path_aggregation.hpp b/lib/libsgm/src/path_aggregation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d2b35d8e9854587025b3d91ce3f96607aad3f82
--- /dev/null
+++ b/lib/libsgm/src/path_aggregation.hpp
@@ -0,0 +1,56 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_PATH_AGGREGATION_HPP
+#define SGM_PATH_AGGREGATION_HPP
+
+#include "device_buffer.hpp"
+#include "types.hpp"
+
+namespace sgm {
+
+template <size_t MAX_DISPARITY>
+class PathAggregation {
+
+private:
+	static const unsigned int NUM_PATHS = 8;
+
+	DeviceBuffer<cost_type> m_cost_buffer;
+	cudaStream_t m_streams[NUM_PATHS];
+	cudaEvent_t m_events[NUM_PATHS];
+	
+public:
+	PathAggregation();
+	~PathAggregation();
+
+	const cost_type *get_output() const {
+		return m_cost_buffer.data();
+	}
+
+	void enqueue(
+		const feature_type *left,
+		const feature_type *right,
+		int width,
+		int height,
+		unsigned int p1,
+		unsigned int p2,
+		cudaStream_t stream);
+
+};
+
+}
+
+#endif
diff --git a/lib/libsgm/src/path_aggregation_common.hpp b/lib/libsgm/src/path_aggregation_common.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b560e732ff20e5983186e9137b70ab91e76a5f9f
--- /dev/null
+++ b/lib/libsgm/src/path_aggregation_common.hpp
@@ -0,0 +1,98 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_PATH_AGGREGATION_COMMON_HPP
+#define SGM_PATH_AGGREGATION_COMMON_HPP
+
+#include <cstdint>
+#include <cuda.h>
+#include "utility.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+template <
+	unsigned int DP_BLOCK_SIZE,
+	unsigned int SUBGROUP_SIZE>
+struct DynamicProgramming {
+	static_assert(
+		DP_BLOCK_SIZE >= 2,
+		"DP_BLOCK_SIZE must be greater than or equal to 2");
+	static_assert(
+		(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,	
+		"SUBGROUP_SIZE must be a power of 2");
+
+	uint32_t last_min;
+	uint32_t dp[DP_BLOCK_SIZE];
+
+	__device__ DynamicProgramming()
+		: last_min(0)
+	{
+		for(unsigned int i = 0; i < DP_BLOCK_SIZE; ++i){ dp[i] = 0; }
+	}
+
+	__device__ void update(
+		uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
+	{
+		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
+
+		const auto dp0 = dp[0];
+		uint32_t lazy_out = 0, local_min = 0;
+		{
+			const unsigned int k = 0;
+#if CUDA_VERSION >= 9000
+			const uint32_t prev =
+				__shfl_up_sync(mask, dp[DP_BLOCK_SIZE - 1], 1);
+#else
+			const uint32_t prev = __shfl_up(dp[DP_BLOCK_SIZE - 1], 1);
+#endif
+			uint32_t out = min(dp[k] - last_min, p2);
+			if(lane_id != 0){ out = min(out, prev - last_min + p1); }
+			out = min(out, dp[k + 1] - last_min + p1);
+			lazy_out = local_min = out + local_costs[k];
+		}
+		for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
+			uint32_t out = min(dp[k] - last_min, p2);
+			out = min(out, dp[k - 1] - last_min + p1);
+			out = min(out, dp[k + 1] - last_min + p1);
+			dp[k - 1] = lazy_out;
+			lazy_out = out + local_costs[k];
+			local_min = min(local_min, lazy_out);
+		}
+		{
+			const unsigned int k = DP_BLOCK_SIZE - 1;
+#if CUDA_VERSION >= 9000
+			const uint32_t next = __shfl_down_sync(mask, dp0, 1);
+#else
+			const uint32_t next = __shfl_down(dp0, 1);
+#endif
+			uint32_t out = min(dp[k] - last_min, p2);
+			out = min(out, dp[k - 1] - last_min + p1);
+			if(lane_id + 1 != SUBGROUP_SIZE){
+				out = min(out, next - last_min + p1);
+			}
+			dp[k - 1] = lazy_out;
+			dp[k] = out + local_costs[k];
+			local_min = min(local_min, dp[k]);
+		}
+		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
+	}
+};
+
+}
+}
+
+#endif
diff --git a/lib/libsgm/src/sgm.cu b/lib/libsgm/src/sgm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d276c3fa9eb3b7dc76297eec694c02c2ad061559
--- /dev/null
+++ b/lib/libsgm/src/sgm.cu
@@ -0,0 +1,148 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "sgm.hpp"
+#include "census_transform.hpp"
+#include "path_aggregation.hpp"
+#include "winner_takes_all.hpp"
+
+namespace sgm {
+
+template <typename T, size_t MAX_DISPARITY>
+class SemiGlobalMatching<T, MAX_DISPARITY>::Impl {
+
+private:
+	DeviceBuffer<T> m_input_left;
+	DeviceBuffer<T> m_input_right;
+	CensusTransform<T> m_census_left;
+	CensusTransform<T> m_census_right;
+	PathAggregation<MAX_DISPARITY> m_path_aggregation;
+	WinnerTakesAll<MAX_DISPARITY> m_winner_takes_all;
+
+public:
+	Impl()
+		: m_input_left()
+		, m_input_right()
+		, m_census_left()
+		, m_census_right()
+		, m_path_aggregation()
+		, m_winner_takes_all()
+	{ }
+
+	void enqueue(
+		output_type *dest_left,
+		output_type *dest_right,
+		const input_type *src_left,
+		const input_type *src_right,
+		int width,
+		int height,
+		int src_pitch,
+		int dst_pitch,
+		unsigned int penalty1,
+		unsigned int penalty2,
+		float uniqueness,
+		bool subpixel,
+		cudaStream_t stream)
+	{
+		m_census_left.enqueue(
+			src_left, width, height, src_pitch, stream);
+		m_census_right.enqueue(
+			src_right, width, height, src_pitch, stream);
+		m_path_aggregation.enqueue(
+			m_census_left.get_output(),
+			m_census_right.get_output(),
+			width, height,
+			penalty1, penalty2,
+			stream);
+		m_winner_takes_all.enqueue(
+			dest_left, dest_right,
+			m_path_aggregation.get_output(),
+			width, height, dst_pitch, uniqueness, subpixel,
+			stream);
+	}
+
+};
+
+
+template <typename T, size_t MAX_DISPARITY>
+SemiGlobalMatching<T, MAX_DISPARITY>::SemiGlobalMatching()
+	: m_impl(new Impl())
+{ }
+
+template <typename T, size_t MAX_DISPARITY>
+SemiGlobalMatching<T, MAX_DISPARITY>::~SemiGlobalMatching() = default;
+
+
+template <typename T, size_t MAX_DISPARITY>
+void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
+	output_type *dest_left,
+	output_type *dest_right,
+	const input_type *src_left,
+	const input_type *src_right,
+	int width,
+	int height,
+	int src_pitch,
+	int dst_pitch,
+	unsigned int penalty1,
+	unsigned int penalty2,
+	float uniqueness,
+	bool subpixel)
+{
+	m_impl->enqueue(
+		dest_left, dest_right,
+		src_left, src_right,
+		width, height,
+		src_pitch, dst_pitch,
+		penalty1, penalty2,
+		uniqueness, subpixel,
+		0);
+	cudaStreamSynchronize(0);
+}
+
+template <typename T, size_t MAX_DISPARITY>
+void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
+	output_type *dest_left,
+	output_type *dest_right,
+	const input_type *src_left,
+	const input_type *src_right,
+	int width,
+	int height,
+	int src_pitch,
+	int dst_pitch,
+	unsigned int penalty1,
+	unsigned int penalty2,
+	float uniqueness,
+	bool subpixel,
+	cudaStream_t stream)
+{
+	m_impl->enqueue(
+		dest_left, dest_right,
+		src_left, src_right,
+		width, height,
+		src_pitch, dst_pitch,
+		penalty1, penalty2,
+		uniqueness, subpixel,
+		stream);
+}
+
+
+template class SemiGlobalMatching<uint8_t,   64>;
+template class SemiGlobalMatching<uint8_t,  128>;
+template class SemiGlobalMatching<uint8_t,  256>;
+template class SemiGlobalMatching<uint16_t,  64>;
+template class SemiGlobalMatching<uint16_t, 128>;
+
+}
diff --git a/lib/libsgm/src/sgm.hpp b/lib/libsgm/src/sgm.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..afb308a25a1bf4bfc4366160d813e27f815337ab
--- /dev/null
+++ b/lib/libsgm/src/sgm.hpp
@@ -0,0 +1,74 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_SGM_HPP
+#define SGM_SGM_HPP
+
+#include <memory>
+#include <cstdint>
+#include "types.hpp"
+
+namespace sgm {
+
+template <typename T, size_t MAX_DISPARITY>
+class SemiGlobalMatching {
+
+public:
+	using input_type = T;
+	using output_type = sgm::output_type;
+
+private:
+	class Impl;
+	std::unique_ptr<Impl> m_impl;
+
+public:
+	SemiGlobalMatching();
+	~SemiGlobalMatching();
+
+	void execute(
+		output_type *dest_left,
+		output_type *dest_right,
+		const input_type *src_left,
+		const input_type *src_right,
+		int width,
+		int height,
+		int src_pitch,
+		int dst_pitch,
+		unsigned int penalty1,
+		unsigned int penalty2,
+		float uniqueness,
+		bool subpixel);
+
+	void enqueue(
+		output_type *dest_left,
+		output_type *dest_right,
+		const input_type *src_left,
+		const input_type *src_right,
+		int width,
+		int height,
+		int src_pitch,
+		int dst_pitch,
+		unsigned int penalty1,
+		unsigned int penalty2,
+		float uniqueness,
+		bool subpixel,
+		cudaStream_t stream);
+
+};
+
+}
+
+#endif
diff --git a/lib/libsgm/src/stereo_sgm.cpp b/lib/libsgm/src/stereo_sgm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b0219091f91448d4befa39dbfe190024d97816a7
--- /dev/null
+++ b/lib/libsgm/src/stereo_sgm.cpp
@@ -0,0 +1,228 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+
+#include <libsgm.h>
+
+#include "internal.h"
+#include "sgm.hpp"
+
+namespace sgm {
+	static bool is_cuda_input(EXECUTE_INOUT type) { return (type & 0x1) > 0; }
+	static bool is_cuda_output(EXECUTE_INOUT type) { return (type & 0x2) > 0; }
+
+	class SemiGlobalMatchingBase {
+	public:
+		using output_type = sgm::output_type;
+		virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R, 
+			int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) = 0;
+
+		virtual ~SemiGlobalMatchingBase() {}
+	};
+
+	template <typename input_type, int DISP_SIZE>
+	class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
+	public:
+		void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
+			int w, int h, int sp, int dp, unsigned int P1, unsigned int P2, float uniqueness, bool subpixel) override
+		{
+			sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel);
+		}
+	private:
+		SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
+	};
+
+	struct CudaStereoSGMResources {
+		void* d_src_left;
+		void* d_src_right;
+		void* d_left_disp;
+		void* d_right_disp;
+		void* d_tmp_left_disp;
+		void* d_tmp_right_disp;
+		uint8_t* d_mask;
+		
+		SemiGlobalMatchingBase* sgm_engine;
+
+		CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, int src_pitch_, int dst_pitch_, EXECUTE_INOUT inout_type_) {
+
+			if (input_depth_bits_ == 8 && disparity_size_ == 64)
+				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 64>();
+			else if (input_depth_bits_ == 8 && disparity_size_ == 128)
+				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 128>();
+			else if (input_depth_bits_ == 8 && disparity_size_ == 256)
+				sgm_engine = new SemiGlobalMatchingImpl<uint8_t, 256>();
+			else if (input_depth_bits_ == 16 && disparity_size_ == 64)
+				sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 64>();
+			else if (input_depth_bits_ == 16 && disparity_size_ == 128)
+				sgm_engine = new SemiGlobalMatchingImpl<uint16_t, 128>();
+			else
+				throw std::logic_error("depth bits must be 8 or 16, and disparity size must be 64, 128 or 256");
+
+			if (is_cuda_input(inout_type_)) {
+				this->d_src_left = NULL;
+				this->d_src_right = NULL;
+			}
+			else {
+				CudaSafeCall(cudaMalloc(&this->d_src_left, input_depth_bits_ / 8 * src_pitch_ * height_));
+				CudaSafeCall(cudaMalloc(&this->d_src_right, input_depth_bits_ / 8 * src_pitch_ * height_));
+			}
+			
+			CudaSafeCall(cudaMalloc(&this->d_left_disp, sizeof(uint16_t) * dst_pitch_ * height_));
+			CudaSafeCall(cudaMalloc(&this->d_right_disp, sizeof(uint16_t) * dst_pitch_ * height_));
+
+			CudaSafeCall(cudaMalloc(&this->d_tmp_left_disp, sizeof(uint16_t) * dst_pitch_ * height_));
+			CudaSafeCall(cudaMalloc(&this->d_tmp_right_disp, sizeof(uint16_t) * dst_pitch_ * height_));
+
+			CudaSafeCall(cudaMemset(this->d_left_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_));
+			CudaSafeCall(cudaMemset(this->d_right_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_));
+			CudaSafeCall(cudaMemset(this->d_tmp_left_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_));
+			CudaSafeCall(cudaMemset(this->d_tmp_right_disp, 0, sizeof(uint16_t) * dst_pitch_ * height_));
+
+			CudaSafeCall(cudaMalloc(&this->d_mask, src_pitch_ * height_));
+			CudaSafeCall(cudaMemset(this->d_mask, 0, src_pitch_ * height_));
+		}
+
+		~CudaStereoSGMResources() {
+			CudaSafeCall(cudaFree(this->d_src_left));
+			CudaSafeCall(cudaFree(this->d_src_right));
+
+			CudaSafeCall(cudaFree(this->d_left_disp));
+			CudaSafeCall(cudaFree(this->d_right_disp));
+
+			CudaSafeCall(cudaFree(this->d_tmp_left_disp));
+			CudaSafeCall(cudaFree(this->d_tmp_right_disp));
+
+			CudaSafeCall(cudaFree(this->d_mask));
+
+			delete sgm_engine;
+		}
+	};
+
+	StereoSGM::StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
+		EXECUTE_INOUT inout_type, const Parameters& param) : StereoSGM(width, height, disparity_size, input_depth_bits, output_depth_bits, width, width, inout_type, param) {}
+
+	StereoSGM::StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch,
+		EXECUTE_INOUT inout_type, const Parameters& param) :
+		cu_res_(NULL),
+		width_(width),
+		height_(height),
+		disparity_size_(disparity_size),
+		input_depth_bits_(input_depth_bits),
+		output_depth_bits_(output_depth_bits),
+		src_pitch_(src_pitch),
+		dst_pitch_(dst_pitch),
+		inout_type_(inout_type),
+		param_(param)
+	{
+		// check values
+		if (input_depth_bits_ != 8 && input_depth_bits_ != 16 && output_depth_bits_ != 8 && output_depth_bits_ != 16) {
+			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
+			throw std::logic_error("depth bits must be 8 or 16");
+		}
+		if (disparity_size_ != 64 && disparity_size_ != 128 && disparity_size_ != 256) {
+			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
+			throw std::logic_error("disparity size must be 64, 128 or 256");
+		}
+		if (param.subpixel && output_depth_bits != 16) {
+			width_ = height_ = input_depth_bits_ = output_depth_bits_ = disparity_size_ = 0;
+			throw std::logic_error("output depth bits must be 16 if sub-pixel option was enabled");
+		}
+
+		cu_res_ = new CudaStereoSGMResources(width_, height_, disparity_size_, input_depth_bits_, output_depth_bits_, src_pitch, dst_pitch, inout_type_);
+	}
+
+	StereoSGM::~StereoSGM() {
+		if (cu_res_) { delete cu_res_; }
+	}
+
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch) {
+
+		const void *d_input_left, *d_input_right;
+
+		if (is_cuda_input(inout_type_)) {
+			d_input_left = left_pixels;
+			d_input_right = right_pixels;
+		}
+		else {
+			CudaSafeCall(cudaMemcpy(cu_res_->d_src_left, left_pixels, input_depth_bits_ / 8 * src_pitch * height, cudaMemcpyHostToDevice));
+			CudaSafeCall(cudaMemcpy(cu_res_->d_src_right, right_pixels, input_depth_bits_ / 8 * src_pitch * height, cudaMemcpyHostToDevice));
+			d_input_left = cu_res_->d_src_left;
+			d_input_right = cu_res_->d_src_right;
+		}
+
+		void* d_tmp_left_disp = cu_res_->d_tmp_left_disp;
+		void* d_tmp_right_disp = cu_res_->d_tmp_right_disp;
+		void* d_left_disp = cu_res_->d_left_disp;
+		void* d_right_disp = cu_res_->d_right_disp;
+
+		if (is_cuda_output(inout_type_) && output_depth_bits_ == 16)
+			d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer
+		
+		cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
+			d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, param_.P2, param_.uniqueness, param_.subpixel);
+
+		sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch);
+		sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch);
+		sgm::details::check_consistency((uint16_t*)d_left_disp, (uint16_t*)d_right_disp, cu_res_->d_mask, width, height, input_depth_bits_, src_pitch, dst_pitch, param_.subpixel);
+
+		if (!is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
+			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)d_tmp_left_disp, dst_pitch * height);
+			CudaSafeCall(cudaMemcpy(dst, d_tmp_left_disp, sizeof(uint8_t) * dst_pitch * height, cudaMemcpyDeviceToHost));
+		}
+		else if (is_cuda_output(inout_type_) && output_depth_bits_ == 8) {
+			sgm::details::cast_16bit_8bit_array((const uint16_t*)d_left_disp, (uint8_t*)dst, dst_pitch * height);
+		}
+		else if (!is_cuda_output(inout_type_) && output_depth_bits_ == 16) {
+			CudaSafeCall(cudaMemcpy(dst, d_left_disp, sizeof(uint16_t) * dst_pitch * height, cudaMemcpyDeviceToHost));
+		}
+		else if (is_cuda_output(inout_type_) && output_depth_bits_ == 16) {
+			// optimize! no-copy!
+		}
+		else {
+			std::cerr << "not impl" << std::endl;
+		}
+	}
+
+	void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst) {
+		execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_);
+	}
+
+	bool StereoSGM::updateParameters(const Parameters &params) {
+		if (params.P1 > params.P2) {
+			return false;
+		}
+		if ((params.uniqueness < 0.0) || (params.uniqueness > 1.0)) {
+			return false;
+		}
+		
+		Parameters params_ = params;
+		std::swap(params_, this->param_);
+		return true;
+	}
+
+	void StereoSGM::setMask(uint8_t* mask, int pitch) {
+		if (pitch != src_pitch_) {
+			throw std::logic_error("Mask pitch must be the same as source pitch");
+		}
+
+		CudaSafeCall(cudaMemcpy(cu_res_->d_mask, mask, src_pitch_ * height_, cudaMemcpyHostToDevice));
+	}
+
+	/*void StereoSGM::removeMask() {
+		CudaSafeCall(cudaMemset(cu_res_->d_mask, 0, src_pitch_ * height_));
+	}*/
+}
diff --git a/lib/libsgm/src/types.hpp b/lib/libsgm/src/types.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e7725a2bedb36248c87c75d3ea0c61b533dcd29
--- /dev/null
+++ b/lib/libsgm/src/types.hpp
@@ -0,0 +1,31 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_TYPES_HPP
+#define SGM_TYPES_HPP
+
+#include <cstdint>
+
+namespace sgm {
+
+using feature_type = uint32_t;
+using cost_type = uint8_t;
+using cost_sum_type = uint16_t;
+using output_type = uint16_t;
+
+}
+
+#endif
diff --git a/lib/libsgm/src/utility.hpp b/lib/libsgm/src/utility.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..17b9f45f5375e1756f581a2d7fd3e72e5f588d60
--- /dev/null
+++ b/lib/libsgm/src/utility.hpp
@@ -0,0 +1,246 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_UTILITY_HPP
+#define SGM_UTILITY_HPP
+
+#include <cuda.h>
+
+namespace sgm {
+
+static constexpr unsigned int WARP_SIZE = 32u;
+
+namespace detail {
+	template <typename T, unsigned int GROUP_SIZE, unsigned int STEP>
+	struct subgroup_min_impl {
+		static __device__ T call(T x, uint32_t mask){
+#if CUDA_VERSION >= 9000
+			x = min(x, __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE));
+#else
+			x = min(x, __shfl_xor(x, STEP / 2, GROUP_SIZE));
+#endif
+			return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(x, mask);
+		}
+	};
+	template <typename T, unsigned int GROUP_SIZE>
+	struct subgroup_min_impl<T, GROUP_SIZE, 1u> {
+		static __device__ T call(T x, uint32_t){
+			return x;
+		}
+	};
+	template <unsigned int GROUP_SIZE, unsigned int STEP>
+	struct subgroup_and_impl {
+		static __device__ bool call(bool x, uint32_t mask){
+#if CUDA_VERSION >= 9000
+			x &= __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE);
+#else
+			x &= __shfl_xor(x, STEP / 2, GROUP_SIZE);
+#endif
+			return subgroup_and_impl<GROUP_SIZE, STEP / 2>::call(x, mask);
+		}
+	};
+	template <unsigned int GROUP_SIZE>
+	struct subgroup_and_impl<GROUP_SIZE, 1u> {
+		static __device__ bool call(bool x, uint32_t){
+			return x;
+		}
+	};
+}
+
+template <unsigned int GROUP_SIZE, typename T>
+__device__ inline T subgroup_min(T x, uint32_t mask){
+	return detail::subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE>::call(x, mask);
+}
+
+template <unsigned int GROUP_SIZE>
+__device__ inline bool subgroup_and(bool x, uint32_t mask){
+	return detail::subgroup_and_impl<GROUP_SIZE, GROUP_SIZE>::call(x, mask);
+}
+
+template <typename T, typename S>
+__device__ inline T load_as(const S *p){
+	return *reinterpret_cast<const T *>(p);
+}
+
+template <typename T, typename S>
+__device__ inline void store_as(S *p, const T& x){
+	*reinterpret_cast<T *>(p) = x;
+}
+
+
+template <typename T>
+__device__ inline uint32_t pack_uint8x4(T x, T y, T z, T w){
+	uchar4 uint8x4;
+	uint8x4.x = static_cast<uint8_t>(x);
+	uint8x4.y = static_cast<uint8_t>(y);
+	uint8x4.z = static_cast<uint8_t>(z);
+	uint8x4.w = static_cast<uint8_t>(w);
+	return load_as<uint32_t>(&uint8x4);
+}
+
+
+template <unsigned int N>
+__device__ inline void load_uint8_vector(uint32_t *dest, const uint8_t *ptr);
+
+template <>
+__device__ inline void load_uint8_vector<1u>(uint32_t *dest, const uint8_t *ptr){
+	dest[0] = static_cast<uint32_t>(ptr[0]);
+}
+
+template <>
+__device__ inline void load_uint8_vector<2u>(uint32_t *dest, const uint8_t *ptr){
+	const auto uint8x2 = load_as<uchar2>(ptr);
+	dest[0] = uint8x2.x; dest[1] = uint8x2.y;
+}
+
+template <>
+__device__ inline void load_uint8_vector<4u>(uint32_t *dest, const uint8_t *ptr){
+	const auto uint8x4 = load_as<uchar4>(ptr);
+	dest[0] = uint8x4.x; dest[1] = uint8x4.y; dest[2] = uint8x4.z; dest[3] = uint8x4.w;
+}
+
+template <>
+__device__ inline void load_uint8_vector<8u>(uint32_t *dest, const uint8_t *ptr){
+	const auto uint32x2 = load_as<uint2>(ptr);
+	load_uint8_vector<4u>(dest + 0, reinterpret_cast<const uint8_t *>(&uint32x2.x));
+	load_uint8_vector<4u>(dest + 4, reinterpret_cast<const uint8_t *>(&uint32x2.y));
+}
+
+template <>
+__device__ inline void load_uint8_vector<16u>(uint32_t *dest, const uint8_t *ptr){
+	const auto uint32x4 = load_as<uint4>(ptr);
+	load_uint8_vector<4u>(dest +  0, reinterpret_cast<const uint8_t *>(&uint32x4.x));
+	load_uint8_vector<4u>(dest +  4, reinterpret_cast<const uint8_t *>(&uint32x4.y));
+	load_uint8_vector<4u>(dest +  8, reinterpret_cast<const uint8_t *>(&uint32x4.z));
+	load_uint8_vector<4u>(dest + 12, reinterpret_cast<const uint8_t *>(&uint32x4.w));
+}
+
+
+template <unsigned int N>
+__device__ inline void store_uint8_vector(uint8_t *dest, const uint32_t *ptr);
+
+template <>
+__device__ inline void store_uint8_vector<1u>(uint8_t *dest, const uint32_t *ptr){
+	dest[0] = static_cast<uint8_t>(ptr[0]);
+}
+
+template <>
+__device__ inline void store_uint8_vector<2u>(uint8_t *dest, const uint32_t *ptr){
+	uchar2 uint8x2;
+	uint8x2.x = static_cast<uint8_t>(ptr[0]);
+	uint8x2.y = static_cast<uint8_t>(ptr[0]);
+	store_as<uchar2>(dest, uint8x2);
+}
+
+template <>
+__device__ inline void store_uint8_vector<4u>(uint8_t *dest, const uint32_t *ptr){
+	store_as<uint32_t>(dest, pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]));
+}
+
+template <>
+__device__ inline void store_uint8_vector<8u>(uint8_t *dest, const uint32_t *ptr){
+	uint2 uint32x2;
+	uint32x2.x = pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]);
+	uint32x2.y = pack_uint8x4(ptr[4], ptr[5], ptr[6], ptr[7]);
+	store_as<uint2>(dest, uint32x2);
+}
+
+template <>
+__device__ inline void store_uint8_vector<16u>(uint8_t *dest, const uint32_t *ptr){
+	uint4 uint32x4;
+	uint32x4.x = pack_uint8x4(ptr[ 0], ptr[ 1], ptr[ 2], ptr[ 3]);
+	uint32x4.y = pack_uint8x4(ptr[ 4], ptr[ 5], ptr[ 6], ptr[ 7]);
+	uint32x4.z = pack_uint8x4(ptr[ 8], ptr[ 9], ptr[10], ptr[11]);
+	uint32x4.w = pack_uint8x4(ptr[12], ptr[13], ptr[14], ptr[15]);
+	store_as<uint4>(dest, uint32x4);
+}
+
+
+template <unsigned int N>
+__device__ inline void load_uint16_vector(uint32_t *dest, const uint16_t *ptr);
+
+template <>
+__device__ inline void load_uint16_vector<1u>(uint32_t *dest, const uint16_t *ptr){
+	dest[0] = static_cast<uint32_t>(ptr[0]);
+}
+
+template <>
+__device__ inline void load_uint16_vector<2u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint16x2 = load_as<ushort2>(ptr);
+	dest[0] = uint16x2.x; dest[1] = uint16x2.y;
+}
+
+template <>
+__device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint16x4 = load_as<ushort4>(ptr);
+	dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w;
+}
+
+template <>
+__device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr){
+	const auto uint32x4 = load_as<uint4>(ptr);
+	load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x4.x));
+	load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x4.y));
+	load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x4.z));
+	load_uint16_vector<2u>(dest + 6, reinterpret_cast<const uint16_t *>(&uint32x4.w));
+}
+
+
+template <unsigned int N>
+__device__ inline void store_uint16_vector(uint16_t *dest, const uint32_t *ptr);
+
+template <>
+__device__ inline void store_uint16_vector<1u>(uint16_t *dest, const uint32_t *ptr){
+	dest[0] = static_cast<uint16_t>(ptr[0]);
+}
+
+template <>
+__device__ inline void store_uint16_vector<2u>(uint16_t *dest, const uint32_t *ptr){
+	ushort2 uint16x2;
+	uint16x2.x = static_cast<uint16_t>(ptr[0]);
+	uint16x2.y = static_cast<uint16_t>(ptr[1]);
+	store_as<ushort2>(dest, uint16x2);
+}
+
+template <>
+__device__ inline void store_uint16_vector<4u>(uint16_t *dest, const uint32_t *ptr){
+	ushort4 uint16x4;
+	uint16x4.x = static_cast<uint16_t>(ptr[0]);
+	uint16x4.y = static_cast<uint16_t>(ptr[1]);
+	uint16x4.z = static_cast<uint16_t>(ptr[2]);
+	uint16x4.w = static_cast<uint16_t>(ptr[3]);
+	store_as<ushort4>(dest, uint16x4);
+}
+
+template <>
+__device__ inline void store_uint16_vector<8u>(uint16_t *dest, const uint32_t *ptr){
+	uint4 uint32x4;
+	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.x), &ptr[0]);
+	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.y), &ptr[2]);
+	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.z), &ptr[4]);
+	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.w), &ptr[6]);
+	store_as<uint4>(dest, uint32x4);
+}
+
+template <>
+__device__ inline void store_uint16_vector<16u>(uint16_t *dest, const uint32_t *ptr){
+	store_uint16_vector<8u>(dest + 0, ptr + 0);
+	store_uint16_vector<8u>(dest + 8, ptr + 8);
+}
+
+}
+
+#endif
diff --git a/lib/libsgm/src/vertical_path_aggregation.cu b/lib/libsgm/src/vertical_path_aggregation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d318f46b465d7c238c3beb8d67d0b858eada38d1
--- /dev/null
+++ b/lib/libsgm/src/vertical_path_aggregation.cu
@@ -0,0 +1,216 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <cstdio>
+#include "vertical_path_aggregation.hpp"
+#include "path_aggregation_common.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+static constexpr unsigned int DP_BLOCK_SIZE = 16u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
+
+template <int DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_vertical_path_kernel(
+	uint8_t *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
+	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
+
+	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
+	if(width == 0 || height == 0){
+		return;
+	}
+
+	__shared__ feature_type right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS + 1];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
+
+	const unsigned int warp_id  = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id  = threadIdx.x % SUBGROUP_SIZE;
+	const unsigned int shfl_mask =
+		((1u << SUBGROUP_SIZE) - 1u) << (group_id * SUBGROUP_SIZE);
+
+	const unsigned int x =
+		blockIdx.x * PATHS_PER_BLOCK +
+		warp_id    * PATHS_PER_WARP +
+		group_id;
+	const unsigned int right_x0 = blockIdx.x * PATHS_PER_BLOCK;
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+
+	const unsigned int right0_addr =
+		(right_x0 + PATHS_PER_BLOCK - 1) - x + dp_offset;
+	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
+	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
+
+	for(unsigned int iter = 0; iter < height; ++iter){
+		const unsigned int y = (DIRECTION > 0 ? iter : height - 1 - iter);
+		// Load left to register
+		feature_type left_value;
+		if(x < width){
+			left_value = left[x + y * width];
+		}
+		// Load right to smem
+		for(unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE){
+			const unsigned int i = i0 + threadIdx.x;
+			if(i < RIGHT_BUFFER_SIZE){
+				const int x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i);
+				feature_type right_value = 0;
+				if(0 <= x && x < static_cast<int>(width)){
+					right_value = right[x + y * width];
+				}
+				const unsigned int lo = i % DP_BLOCK_SIZE;
+				const unsigned int hi = i / DP_BLOCK_SIZE;
+				right_buffer[lo][hi] = right_value;
+				if(hi > 0){
+					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
+				}
+			}
+		}
+		__syncthreads();
+		// Compute
+		if(x < width){
+			feature_type right_values[DP_BLOCK_SIZE];
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
+			}
+			uint32_t local_costs[DP_BLOCK_SIZE];
+			for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
+				local_costs[j] = __popc(left_value ^ right_values[j]);
+			}
+			dp.update(local_costs, p1, p2, shfl_mask);
+			store_uint8_vector<DP_BLOCK_SIZE>(
+				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
+				dp.dp);
+		}
+		__syncthreads();
+	}
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_up2down_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_down2up_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2);
+}
+
+
+template void enqueue_aggregate_up2down_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_up2down_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_up2down_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_down2up_path<64u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template void enqueue_aggregate_down2up_path<128u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+	
+template void enqueue_aggregate_down2up_path<256u>(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
diff --git a/lib/libsgm/src/vertical_path_aggregation.hpp b/lib/libsgm/src/vertical_path_aggregation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ae04016a3f412c75a173c9a6c03dbd5d7c4a7c2
--- /dev/null
+++ b/lib/libsgm/src/vertical_path_aggregation.hpp
@@ -0,0 +1,50 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_VERTICAL_PATH_AGGREGATION_HPP
+#define SGM_VERTICAL_PATH_AGGREGATION_HPP
+
+#include "types.hpp"
+
+namespace sgm {
+namespace path_aggregation {
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_up2down_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+template <unsigned int MAX_DISPARITY>
+void enqueue_aggregate_down2up_path(
+	cost_type *dest,
+	const feature_type *left,
+	const feature_type *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	cudaStream_t stream);
+
+}
+}
+
+#endif
diff --git a/lib/libsgm/src/winner_takes_all.cu b/lib/libsgm/src/winner_takes_all.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f33d3005289a5041d03160af960032e221055e59
--- /dev/null
+++ b/lib/libsgm/src/winner_takes_all.cu
@@ -0,0 +1,302 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <cstdio>
+#include <libsgm.h>
+#include "winner_takes_all.hpp"
+#include "utility.hpp"
+
+namespace sgm {
+
+namespace {
+
+static constexpr unsigned int NUM_PATHS = 8u;
+
+static constexpr unsigned int WARPS_PER_BLOCK = 8u;
+static constexpr unsigned int BLOCK_SIZE = WARPS_PER_BLOCK * WARP_SIZE;
+
+
+__device__ inline uint32_t pack_cost_index(uint32_t cost, uint32_t index){
+	union {
+		uint32_t uint32;
+		ushort2 uint16x2;
+	} u;
+	u.uint16x2.x = static_cast<uint16_t>(index);
+	u.uint16x2.y = static_cast<uint16_t>(cost);
+	return u.uint32;
+}
+
+__device__ uint32_t unpack_cost(uint32_t packed){
+	return packed >> 16;
+}
+
+__device__ int unpack_index(uint32_t packed){
+	return packed & 0xffffu;
+}
+
+using ComputeDisparity = uint32_t(*)(uint32_t, uint32_t, uint16_t*);
+
+__device__ inline uint32_t compute_disparity_normal(uint32_t disp, uint32_t cost = 0, uint16_t* smem = nullptr)
+{
+	return disp;
+}
+
+template <size_t MAX_DISPARITY>
+__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, uint32_t cost, uint16_t* smem)
+{
+	int subp = disp;
+	subp <<= sgm::StereoSGM::SUBPIXEL_SHIFT;
+	if (disp > 0 && disp < MAX_DISPARITY - 1) {
+		const int left = smem[disp - 1];
+		const int right = smem[disp + 1];
+		const int numer = left - right;
+		const int denom = left - 2 * cost + right;
+		subp += ((numer << sgm::StereoSGM::SUBPIXEL_SHIFT) + denom) / (2 * denom);
+	}
+	return subp;
+}
+
+
+template <unsigned int MAX_DISPARITY, ComputeDisparity compute_disparity = compute_disparity_normal>
+__global__ void winner_takes_all_kernel(
+	output_type *left_dest,
+	output_type *right_dest,
+	const cost_type *src,
+	int width,
+	int height,
+	int pitch,
+	float uniqueness)
+{
+	static const unsigned int ACCUMULATION_PER_THREAD = 16u;
+	static const unsigned int REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE;
+	static const unsigned int ACCUMULATION_INTERVAL = ACCUMULATION_PER_THREAD / REDUCTION_PER_THREAD;
+	static const unsigned int UNROLL_DEPTH = 
+		(REDUCTION_PER_THREAD > ACCUMULATION_INTERVAL)
+			? REDUCTION_PER_THREAD
+			: ACCUMULATION_INTERVAL;
+
+	const unsigned int cost_step = MAX_DISPARITY * width * height;
+	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
+	const unsigned int lane_id = threadIdx.x % WARP_SIZE;
+
+	const unsigned int y = blockIdx.x * WARPS_PER_BLOCK + warp_id;
+	src += y * MAX_DISPARITY * width;
+	left_dest  += y * pitch;
+	right_dest += y * pitch;
+
+	if(y >= height){
+		return;
+	}
+
+	__shared__ uint16_t smem_cost_sum[WARPS_PER_BLOCK][ACCUMULATION_INTERVAL][MAX_DISPARITY];
+
+	uint32_t right_best[REDUCTION_PER_THREAD];
+	for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+		right_best[i] = 0xffffffffu;
+	}
+
+	for(unsigned int x0 = 0; x0 < width; x0 += UNROLL_DEPTH){
+#pragma unroll
+		for(unsigned int x1 = 0; x1 < UNROLL_DEPTH; ++x1){
+			if(x1 % ACCUMULATION_INTERVAL == 0){
+				const unsigned int k = lane_id * ACCUMULATION_PER_THREAD;
+				const unsigned int k_hi = k / MAX_DISPARITY;
+				const unsigned int k_lo = k % MAX_DISPARITY;
+				const unsigned int x = x0 + x1 + k_hi;
+				if(x < width){
+					const unsigned int offset = x * MAX_DISPARITY + k_lo;
+					uint32_t sum[ACCUMULATION_PER_THREAD];
+					for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){
+						sum[i] = 0;
+					}
+					for(unsigned int p = 0; p < NUM_PATHS; ++p){
+						uint32_t load_buffer[ACCUMULATION_PER_THREAD];
+						load_uint8_vector<ACCUMULATION_PER_THREAD>(
+							load_buffer, &src[p * cost_step + offset]);
+						for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){
+							sum[i] += load_buffer[i];
+						}
+					}
+					store_uint16_vector<ACCUMULATION_PER_THREAD>(
+						&smem_cost_sum[warp_id][k_hi][k_lo], sum);
+				}
+#if CUDA_VERSION >= 9000
+				__syncwarp();
+#else
+				__threadfence_block();
+#endif
+			}
+			const unsigned int x = x0 + x1;
+			if(x < width){
+				// Load sum of costs
+				const unsigned int smem_x = x1 % ACCUMULATION_INTERVAL;
+				const unsigned int k0 = lane_id * REDUCTION_PER_THREAD;
+				uint32_t local_cost_sum[REDUCTION_PER_THREAD];
+				load_uint16_vector<REDUCTION_PER_THREAD>(
+					local_cost_sum, &smem_cost_sum[warp_id][smem_x][k0]);
+				// Pack sum of costs and dispairty
+				uint32_t local_packed_cost[REDUCTION_PER_THREAD];
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					local_packed_cost[i] = pack_cost_index(local_cost_sum[i], k0 + i);
+				}
+				// Update left
+				uint32_t best = 0xffffffffu;
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					best = min(best, local_packed_cost[i]);
+				}
+				best = subgroup_min<WARP_SIZE>(best, 0xffffffffu);
+				// Update right
+#pragma unroll
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
+					const int p = static_cast<int>(((x - k) & ~(MAX_DISPARITY - 1)) + k);
+					const unsigned int d = static_cast<unsigned int>(x - p);
+#if CUDA_VERSION >= 9000
+					const uint32_t recv = __shfl_sync(0xffffffffu,
+						local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD],
+						d / REDUCTION_PER_THREAD,
+						WARP_SIZE);
+#else
+					const uint32_t recv = __shfl(
+						local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD],
+						d / REDUCTION_PER_THREAD,
+						WARP_SIZE);
+#endif
+					right_best[i] = min(right_best[i], recv);
+					if(d == MAX_DISPARITY - 1){
+						if(0 <= p){
+							right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+						}
+						right_best[i] = 0xffffffffu;
+					}
+				}
+				// Resume updating left to avoid execution dependency
+				const uint32_t bestCost = unpack_cost(best);
+				const int bestDisp = unpack_index(best);
+				bool uniq = true;
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					const uint32_t x = local_packed_cost[i];
+					const bool uniq1 = unpack_cost(x) * uniqueness >= bestCost;
+					const bool uniq2 = abs(unpack_index(x) - bestDisp) <= 1;
+					uniq &= uniq1 || uniq2;
+				}
+				uniq = subgroup_and<WARP_SIZE>(uniq, 0xffffffffu);
+				if(lane_id == 0){
+					left_dest[x] = uniq ? compute_disparity(bestDisp, bestCost, smem_cost_sum[warp_id][smem_x]) : 0;
+				}
+			}
+		}
+	}
+	for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+		const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
+		const int p = static_cast<int>(((width - k) & ~(MAX_DISPARITY - 1)) + k);
+		if(p < width){
+			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+		}
+	}
+}
+
+template <size_t MAX_DISPARITY>
+void enqueue_winner_takes_all(
+	output_type *left_dest,
+	output_type *right_dest,
+	const cost_type *src,
+	int width,
+	int height,
+	int pitch,
+	float uniqueness,
+	bool subpixel,
+	cudaStream_t stream)
+{
+	const int gdim =
+		(height + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	if (subpixel) {
+		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim, 0, stream>>>(
+			left_dest, right_dest, src, width, height, pitch, uniqueness);
+	} else {
+		winner_takes_all_kernel<MAX_DISPARITY, compute_disparity_normal><<<gdim, bdim, 0, stream>>>(
+			left_dest, right_dest, src, width, height, pitch, uniqueness);
+	}
+}
+
+}
+
+
+template <size_t MAX_DISPARITY>
+WinnerTakesAll<MAX_DISPARITY>::WinnerTakesAll()
+	: m_left_buffer()
+	, m_right_buffer()
+{ }
+
+template <size_t MAX_DISPARITY>
+void WinnerTakesAll<MAX_DISPARITY>::enqueue(
+	const cost_type *src,
+	int width,
+	int height,
+	int pitch,
+	float uniqueness,
+	bool subpixel,
+	cudaStream_t stream)
+{
+	if(m_left_buffer.size() < static_cast<size_t>(pitch * height)){
+		m_left_buffer = DeviceBuffer<output_type>(pitch * height);
+	}
+	if(m_right_buffer.size() < static_cast<size_t>(pitch * height)){
+		m_right_buffer = DeviceBuffer<output_type>(pitch * height);
+	}
+	enqueue_winner_takes_all<MAX_DISPARITY>(
+		m_left_buffer.data(),
+		m_right_buffer.data(),
+		src,
+		width,
+		height,
+		pitch,
+		uniqueness,
+		subpixel,
+		stream);
+}
+
+template <size_t MAX_DISPARITY>
+void WinnerTakesAll<MAX_DISPARITY>::enqueue(
+	output_type* left,
+	output_type* right,
+	const cost_type *src,
+	int width,
+	int height,
+	int pitch,
+	float uniqueness,
+	bool subpixel,
+	cudaStream_t stream)
+{
+	enqueue_winner_takes_all<MAX_DISPARITY>(
+		left,
+		right,
+		src,
+		width,
+		height,
+		pitch,
+		uniqueness,
+		subpixel,
+		stream);
+}
+
+
+template class WinnerTakesAll< 64>;
+template class WinnerTakesAll<128>;
+template class WinnerTakesAll<256>;
+
+}
diff --git a/lib/libsgm/src/winner_takes_all.hpp b/lib/libsgm/src/winner_takes_all.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3dbae82735581c6b1051fd973efd4346e24c2543
--- /dev/null
+++ b/lib/libsgm/src/winner_takes_all.hpp
@@ -0,0 +1,67 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http ://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SGM_WINNER_TAKES_ALL_HPP
+#define SGM_WINNER_TAKES_ALL_HPP
+
+#include "device_buffer.hpp"
+#include "types.hpp"
+
+namespace sgm {
+
+template <size_t MAX_DISPARITY>
+class WinnerTakesAll {
+
+private:
+	DeviceBuffer<output_type> m_left_buffer;
+	DeviceBuffer<output_type> m_right_buffer;
+
+public:
+	WinnerTakesAll();
+
+	const output_type *get_left_output() const {
+		return m_left_buffer.data();
+	}
+
+	const output_type *get_right_output() const {
+		return m_right_buffer.data();
+	}
+
+	void enqueue(
+		const cost_type *src,
+		int width,
+		int height,
+		int pitch,
+		float uniqueness,
+		bool subpixel,
+		cudaStream_t stream);
+
+	void enqueue(
+		output_type *left,
+		output_type *right,
+		const cost_type *src,
+		int width,
+		int height,
+		int pitch,
+		float uniqueness,
+		bool subpixel,
+		cudaStream_t stream);
+
+};
+
+}
+
+#endif
diff --git a/lib/libsgm/test/CMakeLists.txt b/lib/libsgm/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..883b89ab50c29c55a907d52b341e216aa0a8c466
--- /dev/null
+++ b/lib/libsgm/test/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 3.0)
+project(sgm-test)
+
+find_package(CUDA REQUIRED)
+
+if (MSVC)
+	option(gtest_force_shared_crt "Force Gmock to use standard compiler flags" ON)
+endif()
+
+add_subdirectory(googletest)
+
+include_directories(${gtest_SOURCE_DIR}/include)
+include_directories(../include)
+include_directories(../src)
+
+file(GLOB TEST_SOURCES "*.cpp" "*.cu")
+
+if (CMAKE_COMPILER_IS_GNUCXX)
+	set(CMAKE_CXX_FLAGS "-O3 -Wall")
+	set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
+endif()
+
+SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${CUDA_ARCH} -lineinfo")
+
+cuda_add_executable(sgm-test ${TEST_SOURCES})
+target_link_libraries(sgm-test sgm gtest)
diff --git a/lib/libsgm/test/census_transform_test.cu b/lib/libsgm/test/census_transform_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b369f3927966cc9c21cc9477e3cc720feed1bcf6
--- /dev/null
+++ b/lib/libsgm/test/census_transform_test.cu
@@ -0,0 +1,71 @@
+#include <gtest/gtest.h>
+#include "census_transform.hpp"
+#include "generator.hpp"
+#include "test_utility.hpp"
+
+#include "debug.hpp"
+
+namespace {
+
+template <typename T>
+thrust::host_vector<sgm::feature_type> census_transform(
+	const thrust::host_vector<T>& src, size_t width, size_t height, size_t pitch)
+{
+	const int hor = 9 / 2, ver = 7 / 2;
+	thrust::host_vector<sgm::feature_type> result(width * height, 0);
+	for(int y = ver; y < static_cast<int>(height) - ver; ++y){
+		for(int x = hor; x < static_cast<int>(width) - hor; ++x){
+			const auto c = src[x + y * pitch];
+			sgm::feature_type value = 0;
+			for(int dy = -ver; dy <= 0; ++dy){
+				for(int dx = -hor; dx <= (dy == 0 ? -1 : hor); ++dx){
+					const auto a = src[(x + dx) + (y + dy) * pitch];
+					const auto b = src[(x - dx) + (y - dy) * pitch];
+					value <<= 1;
+					if(a > b){ value |= 1; }
+				}
+			}
+			result[x + y * width] = value;
+		}
+	}
+	return result;
+}
+
+}
+
+
+TEST(CensusTransformTest, RandomU8){
+	using input_type = uint8_t;
+	const size_t width = 631, height = 479;
+	const auto input = generate_random_sequence<input_type>(width * height);
+	const auto expect = census_transform(input, width, height, width);
+
+	sgm::CensusTransform<uint8_t> transform;
+	const auto d_input = to_device_vector(input);
+	transform.enqueue(d_input.data().get(), width, height, width, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::feature_type> d_actual(
+		transform.get_output(), transform.get_output() + (width * height));
+	const auto actual = to_host_vector(d_actual);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, 1);
+}
+
+TEST(CensusTransformTest, RandomU8WithPitch){
+	using input_type = uint8_t;
+	const size_t width = 631, height = 479, pitch = 640;
+	const auto input = generate_random_sequence<input_type>(pitch * height);
+	const auto expect = census_transform(input, width, height, pitch);
+
+	sgm::CensusTransform<uint8_t> transform;
+	const auto d_input = to_device_vector(input);
+	transform.enqueue(d_input.data().get(), width, height, pitch, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::feature_type> d_actual(
+		transform.get_output(), transform.get_output() + (width * height));
+	const auto actual = to_host_vector(d_actual);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, 1);
+}
diff --git a/lib/libsgm/test/debug.hpp b/lib/libsgm/test/debug.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ac1ab26f1ee09bc2e2aa5b496009951ff096cc4
--- /dev/null
+++ b/lib/libsgm/test/debug.hpp
@@ -0,0 +1,31 @@
+#ifndef SGM_TEST_DEBUG_HPP
+#define SGM_TEST_DEBUG_HPP
+
+#include <iostream>
+#include <cstdint>
+
+template <typename T>
+inline bool debug_compare(
+	const T *actual,
+	const T *expect,
+	size_t width,
+	size_t height,
+	size_t disparity)
+{
+	bool accept = true;
+	for(size_t y = 0, i = 0; y < height; ++y){
+		for(size_t x = 0; x < width; ++x){
+			for(size_t k = 0; k < disparity; ++k, ++i){
+				if(actual[i] != expect[i]){
+					std::cerr << "(" << y << ", " << x << ", " << k << "): ";
+					std::cerr << static_cast<uint64_t>(actual[i]) << " / ";
+					std::cerr << static_cast<uint64_t>(expect[i]) << std::endl;
+					accept = false;
+				}
+			}
+		}
+	}
+	return accept;
+}
+
+#endif
diff --git a/lib/libsgm/test/generator.cpp b/lib/libsgm/test/generator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44e48d3f74a09a8aedb45f207727e11211b82d36
--- /dev/null
+++ b/lib/libsgm/test/generator.cpp
@@ -0,0 +1,3 @@
+#include "generator.hpp"
+
+std::default_random_engine g_random_engine;
diff --git a/lib/libsgm/test/generator.hpp b/lib/libsgm/test/generator.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f8a0516f46eb5ee8d276935b234f4b64df320b3e
--- /dev/null
+++ b/lib/libsgm/test/generator.hpp
@@ -0,0 +1,34 @@
+#ifndef SGM_TEST_GENERATOR_HPP
+#define SGM_TEST_GENERATOR_HPP
+
+#include <random>
+#include <limits>
+#include <thrust/host_vector.h>
+
+extern std::default_random_engine g_random_engine;
+
+template <typename T>
+inline thrust::host_vector<T> generate_random_sequence(size_t n){
+	std::uniform_int_distribution<T> distribution(
+		std::numeric_limits<T>::min(),
+		std::numeric_limits<T>::max());
+	thrust::host_vector<T> seq(n);
+	for(size_t i = 0; i < n; ++i){
+		seq[i] = distribution(g_random_engine);
+	}
+	return seq;
+}
+
+template <>
+inline thrust::host_vector<uint8_t> generate_random_sequence<uint8_t>(size_t n) {
+	std::uniform_int_distribution<unsigned int> distribution(
+		std::numeric_limits<uint8_t>::min(),
+		std::numeric_limits<uint8_t>::max());
+	thrust::host_vector<uint8_t> seq(n);
+	for (size_t i = 0; i < n; ++i) {
+		seq[i] = static_cast<uint8_t>(distribution(g_random_engine));
+	}
+	return seq;
+}
+
+#endif
diff --git a/lib/libsgm/test/horizontal_path_aggregation_test.cu b/lib/libsgm/test/horizontal_path_aggregation_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..106cb32ca1de88f478fa880d9d6fe1cf934c0c56
--- /dev/null
+++ b/lib/libsgm/test/horizontal_path_aggregation_test.cu
@@ -0,0 +1,53 @@
+#include <gtest/gtest.h>
+#include "horizontal_path_aggregation.hpp"
+#include "path_aggregation_test.hpp"
+#include "generator.hpp"
+#include "test_utility.hpp"
+
+#include "debug.hpp"
+
+TEST(HorizontalPathAggregationTest, RandomLeft2Right){
+	static constexpr size_t width = 631, height = 479, disparity = 128;
+	static constexpr unsigned int p1 = 20, p2 = 100;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, 1, 0);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_left2right_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
+
+TEST(HorizontalPathAggregationTest, RandomRight2Left){
+	static constexpr size_t width = 640, height = 480, disparity = 64;
+	static constexpr unsigned int p1 = 20, p2 = 40;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, -1, 0);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_right2left_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
diff --git a/lib/libsgm/test/main.cpp b/lib/libsgm/test/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36c5def97f36666b0589f3be92a34977399a5f9a
--- /dev/null
+++ b/lib/libsgm/test/main.cpp
@@ -0,0 +1,6 @@
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv){
+	::testing::InitGoogleTest(&argc, argv);
+	return RUN_ALL_TESTS();
+}
diff --git a/lib/libsgm/test/oblique_path_aggregation_test.cu b/lib/libsgm/test/oblique_path_aggregation_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f96cf25854b595e2c82af483c65664a384580014
--- /dev/null
+++ b/lib/libsgm/test/oblique_path_aggregation_test.cu
@@ -0,0 +1,99 @@
+#include <gtest/gtest.h>
+#include "oblique_path_aggregation.hpp"
+#include "path_aggregation_test.hpp"
+#include "generator.hpp"
+#include "test_utility.hpp"
+
+#include "debug.hpp"
+
+TEST(ObliquePathAggregationTest, RandomUpLeft2DownRight){
+	static constexpr size_t width = 631, height = 479, disparity = 128;
+	static constexpr unsigned int p1 = 20, p2 = 100;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, 1, 1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_upleft2downright_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
+
+TEST(ObliquePathAggregationTest, RandomUpRight2DownLeft){
+	static constexpr size_t width = 640, height = 479, disparity = 64;
+	static constexpr unsigned int p1 = 20, p2 = 40;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, -1, 1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_upright2downleft_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
+
+TEST(ObliquePathAggregationTest, RandomDownRight2UpLeft){
+	static constexpr size_t width = 631, height = 479, disparity = 128;
+	static constexpr unsigned int p1 = 20, p2 = 100;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, -1, -1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_downright2upleft_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
+
+TEST(ObliquePathAggregationTest, RandomDownLeft2UpRight){
+	static constexpr size_t width = 640, height = 479, disparity = 64;
+	static constexpr unsigned int p1 = 20, p2 = 40;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, 1, -1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_downleft2upright_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
diff --git a/lib/libsgm/test/path_aggregation_test.hpp b/lib/libsgm/test/path_aggregation_test.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f104ba5b7337617c4476fecfc18c26f775f6018
--- /dev/null
+++ b/lib/libsgm/test/path_aggregation_test.hpp
@@ -0,0 +1,48 @@
+#ifndef SGM_TEST_PATH_AGGREGATION_TEST_HPP
+#define SGM_TEST_PATH_AGGREGATION_TEST_HPP
+
+#include <limits>
+#include <algorithm>
+#include <thrust/host_vector.h>
+
+#ifdef _WIN32
+#define popcnt64 __popcnt64
+#else
+#define popcnt64 __builtin_popcountll
+#endif
+
+static thrust::host_vector<sgm::cost_type> path_aggregation(
+	const thrust::host_vector<sgm::feature_type>& left,
+	const thrust::host_vector<sgm::feature_type>& right,
+	int width, int height, int max_disparity,
+	int p1, int p2, int dx, int dy)
+{
+	thrust::host_vector<sgm::cost_type> result(width * height * max_disparity);
+	std::vector<int> before(max_disparity);
+	for(int i = (dy < 0 ? height - 1: 0); 0 <= i && i < height; i += (dy < 0 ? -1 : 1)){
+		for(int j = (dx < 0 ? width - 1 : 0); 0 <= j && j < width; j += (dx < 0 ? -1 : 1)){
+			const int i2 = i - dy, j2 = j - dx;
+			const bool inside = (0 <= i2 && i2 < height && 0 <= j2 && j2 < width);
+			for(int k = 0; k < max_disparity; ++k){
+				before[k] = inside ? result[k + (j2 + i2 * width) * max_disparity] : 0;
+			}
+			const int min_cost = *min_element(before.begin(), before.end());
+			for(int k = 0; k < max_disparity; ++k){
+				const auto l = left[j + i * width];
+				const auto r = (k > j ? 0 : right[(j - k) + i * width]);
+				int cost = std::min(before[k] - min_cost, p2);
+				if(k > 0){
+					cost = std::min(cost, before[k - 1] - min_cost + p1);
+				}
+				if(k + 1 < max_disparity){
+					cost = std::min(cost, before[k + 1] - min_cost + p1);
+				}
+				cost += static_cast<int>(popcnt64(l ^ r));
+				result[k + (j + i * width) * max_disparity] = static_cast<uint8_t>(cost);
+			}
+		}
+	}
+	return result;
+}
+
+#endif
diff --git a/lib/libsgm/test/test_utility.hpp b/lib/libsgm/test/test_utility.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..60cf832f53882f74ed74edbbd75ceeadda7a5285
--- /dev/null
+++ b/lib/libsgm/test/test_utility.hpp
@@ -0,0 +1,29 @@
+#ifndef SGM_TEST_UTILITY_HPP
+#define SGM_TEST_UTILITY_HPP
+
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+template <typename T>
+inline thrust::host_vector<T> to_host_vector(const thrust::device_vector<T>& src){
+	thrust::host_vector<T> dest(src.size());
+	cudaMemcpy(
+		dest.data(),
+		src.data().get(),
+		sizeof(T) * src.size(),
+		cudaMemcpyDeviceToHost);
+	return dest;
+}
+
+template <typename T>
+inline thrust::device_vector<T> to_device_vector(const thrust::host_vector<T>& src){
+	thrust::device_vector<T> dest(src.size());
+	cudaMemcpy(
+		dest.data().get(),
+		src.data(),
+		sizeof(T) * src.size(),
+		cudaMemcpyHostToDevice);
+	return dest;
+}
+
+#endif
diff --git a/lib/libsgm/test/vertical_path_aggregation_test.cu b/lib/libsgm/test/vertical_path_aggregation_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a0fb7f93da7d3755956e930b28296df991016f10
--- /dev/null
+++ b/lib/libsgm/test/vertical_path_aggregation_test.cu
@@ -0,0 +1,53 @@
+#include <gtest/gtest.h>
+#include "vertical_path_aggregation.hpp"
+#include "path_aggregation_test.hpp"
+#include "generator.hpp"
+#include "test_utility.hpp"
+
+#include "debug.hpp"
+
+TEST(VerticalPathAggregationTest, RandomUp2Down){
+	static constexpr size_t width = 631, height = 479, disparity = 128;
+	static constexpr unsigned int p1 = 20, p2 = 100;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, 0, 1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_up2down_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
+
+TEST(VerticalPathAggregationTest, RandomDown2Up){
+	static constexpr size_t width = 640, height = 479, disparity = 64;
+	static constexpr unsigned int p1 = 20, p2 = 40;
+	const auto left  = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto right = generate_random_sequence<sgm::feature_type>(width * height);
+	const auto expect = path_aggregation(
+		left, right, width, height, disparity, p1, p2, 0, -1);
+
+	const auto d_left = to_device_vector(left);
+	const auto d_right = to_device_vector(right);
+	thrust::device_vector<sgm::cost_type> d_cost(width * height * disparity);
+	sgm::path_aggregation::enqueue_aggregate_down2up_path<disparity>(
+		d_cost.data().get(),
+		d_left.data().get(),
+		d_right.data().get(),
+		width, height, p1, p2, 0);
+	cudaStreamSynchronize(0);
+
+	const auto actual = to_host_vector(d_cost);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), width, height, disparity);
+}
diff --git a/lib/libsgm/test/winner_takes_all_test.cu b/lib/libsgm/test/winner_takes_all_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c882114e52b107d76c53522b2def189d761a48c1
--- /dev/null
+++ b/lib/libsgm/test/winner_takes_all_test.cu
@@ -0,0 +1,242 @@
+#include <gtest/gtest.h>
+#include <utility>
+#include <algorithm>
+#include <libsgm.h>
+#include "winner_takes_all.hpp"
+#include "generator.hpp"
+#include "test_utility.hpp"
+
+#include "debug.hpp"
+
+namespace {
+
+static constexpr size_t NUM_PATHS = 8;
+
+thrust::host_vector<sgm::output_type> winner_takes_all_left(
+	const thrust::host_vector<sgm::cost_type>& src,
+	size_t width, size_t height, size_t pitch, size_t disparity,
+	float uniqueness, bool subpixel)
+{
+	thrust::host_vector<sgm::output_type> result(pitch * height);
+	for(size_t i = 0; i < height; ++i){
+		for(size_t j = 0; j < width; ++j){
+			std::vector<std::pair<int, int>> v;
+			for(size_t k = 0; k < disparity; ++k){
+				int cost_sum = 0;
+				for(size_t p = 0; p < NUM_PATHS; ++p){
+					cost_sum += static_cast<int>(src[
+						p * disparity * width * height +
+						i * disparity * width +
+						j * disparity +
+						k]);
+				}
+				v.emplace_back(cost_sum, static_cast<int>(k));
+			}
+			const auto ite = std::min_element(v.begin(), v.end());
+			assert(ite != v.end());
+			const auto best = *ite;
+			const int best_cost = best.first;
+			sgm::output_type best_disp = best.second;
+			sgm::output_type dst = best_disp;
+			if (subpixel) {
+				dst <<= sgm::StereoSGM::SUBPIXEL_SHIFT;
+				if (0 < best_disp && best_disp < static_cast<int>(disparity) - 1) {
+					const int left = v[best_disp - 1].first;
+					const int right = v[best_disp + 1].first;
+					const int numer = left - right;
+					const int denom = left - 2 * best_cost + right;
+					dst += ((numer << sgm::StereoSGM::SUBPIXEL_SHIFT) + denom) / (2 * denom);
+				}
+			}
+			for (const auto& p : v) {
+				const int cost = p.first;
+				const int disp = p.second;
+				if (cost * uniqueness < best_cost && abs(disp - best_disp) > 1) {
+					dst = 0;
+					break;
+				}
+			}
+			result[i * pitch + j] = dst;
+		}
+	}
+	return result;
+}
+
+thrust::host_vector<sgm::output_type> winner_takes_all_right(
+	const thrust::host_vector<sgm::cost_type>& src,
+	size_t width, size_t height, size_t pitch, size_t disparity,
+	float uniqueness)
+{
+	thrust::host_vector<sgm::output_type> result(pitch * height);
+	for(size_t i = 0; i < height; ++i){
+		for(size_t j = 0; j < width; ++j){
+			std::vector<std::pair<int, int>> v;
+			for(size_t k = 0; j + k < width && k < disparity; ++k){
+				int cost_sum = 0;
+				for(size_t p = 0; p < NUM_PATHS; ++p){
+					cost_sum += static_cast<int>(src[
+						p * disparity * width * height +
+						i * disparity * width +
+						(j + k) * disparity +
+						k]);
+				}
+				v.emplace_back(cost_sum, static_cast<int>(k));
+			}
+			const auto ite = std::min_element(v.begin(), v.end());
+			assert(ite != v.end());
+			const auto best = *ite;
+			result[i * pitch + j] = best.second;
+		}
+	}
+	return result;
+}
+
+}
+
+static void test_random_left(bool subpixel, size_t padding = 0)
+{
+	static constexpr size_t width = 313, height = 237, disparity = 128;
+	static constexpr float uniqueness = 0.95f;
+	const size_t pitch = width + padding;
+	const auto input = generate_random_sequence<sgm::cost_type>(
+	width * height * disparity * NUM_PATHS);
+	const auto expect = winner_takes_all_left(
+		input, width, height, pitch, disparity, uniqueness, subpixel);
+
+	sgm::WinnerTakesAll<disparity> wta;
+	const auto d_input = to_device_vector(input);
+	wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::output_type> d_actual(
+		wta.get_left_output(), wta.get_left_output() + (pitch * height));
+	const auto actual = to_host_vector(d_actual);
+
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), pitch, height, 1);
+}
+
+static void test_corner1_left(bool subpixel, size_t padding = 0)
+{
+	static constexpr size_t width = 1, height = 1, disparity = 64;
+	static constexpr float uniqueness = 0.95f;
+	const size_t pitch = width + padding;
+	static constexpr size_t n = width * height * disparity * NUM_PATHS;
+	static constexpr size_t step = width * height * disparity;
+	thrust::host_vector<sgm::cost_type> input(n);
+	for (auto& v : input) {
+		v = 1;
+	}
+	for (size_t i = 0; i < NUM_PATHS; ++i) {
+		input[i * step] = 64;
+	}
+	const auto expect = winner_takes_all_left(
+		input, width, height, pitch, disparity, uniqueness, subpixel);
+
+	sgm::WinnerTakesAll<disparity> wta;
+	const auto d_input = to_device_vector(input);
+	wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::output_type> d_actual(
+		wta.get_left_output(), wta.get_left_output() + (pitch * height));
+	const auto actual = to_host_vector(d_actual);
+
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), pitch, height, 1);
+}
+
+static void test_corner2_left(bool subpixel, size_t padding = 0)
+{
+	static constexpr size_t width = 1, height = 1, disparity = 64;
+	static constexpr float uniqueness = 0.95f;
+	const size_t pitch = width + padding;
+	static constexpr size_t n = width * height * disparity * NUM_PATHS;
+	static constexpr size_t step = width * height * disparity;
+	thrust::host_vector<sgm::cost_type> input(n);
+	for (auto& v : input) {
+		v = 64;
+	}
+	for (size_t i = 0; i < NUM_PATHS; ++i) {
+		input[i * step + 16] = 1;
+	}
+	for (size_t i = 0; i < NUM_PATHS; ++i) {
+		input[i * step + 32] = 1;
+	}
+	const auto expect = winner_takes_all_left(
+		input, width, height, pitch, disparity, uniqueness, subpixel);
+
+	sgm::WinnerTakesAll<disparity> wta;
+	const auto d_input = to_device_vector(input);
+	wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, subpixel, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::output_type> d_actual(
+		wta.get_left_output(), wta.get_left_output() + (pitch * height));
+	const auto actual = to_host_vector(d_actual);
+
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), pitch, height, 1);
+}
+
+TEST(WinnerTakesAllTest, RandomLeftNormal){
+	test_random_left(false);
+}
+
+TEST(WinnerTakesAllTest, RandomLeftSubpixel){
+	test_random_left(true);
+}
+
+TEST(WinnerTakesAllTest, RandomLeftNormalWithPitch){
+	test_random_left(false, 27);
+}
+
+TEST(WinnerTakesAllTest, RandomLeftSubpixelWithPitch){
+	test_random_left(true, 27);
+}
+
+TEST(WinnerTakesAllTest, Corner1LeftNormal){
+	test_corner1_left(false);
+}
+
+TEST(WinnerTakesAllTest, Corner1LeftSubpixel){
+	test_corner1_left(true);
+}
+
+TEST(WinnerTakesAllTest, Corner2LeftNormal){
+	test_corner2_left(false);
+}
+
+TEST(WinnerTakesAllTest, Corner2LeftSubpixel){
+	test_corner2_left(true);
+}
+
+static void test_random_right(size_t padding = 0)
+{
+	static constexpr size_t width = 313, height = 237, disparity = 64;
+	static constexpr float uniqueness = 0.95f;
+	const size_t pitch = width + padding;
+	const auto input = generate_random_sequence<sgm::cost_type>(
+		width * height * disparity * NUM_PATHS);
+	const auto expect = winner_takes_all_right(
+		input, width, height, pitch, disparity, uniqueness);
+
+	sgm::WinnerTakesAll<disparity> wta;
+	const auto d_input = to_device_vector(input);
+	wta.enqueue(d_input.data().get(), width, height, static_cast<int>(pitch), uniqueness, false, 0);
+	cudaStreamSynchronize(0);
+
+	const thrust::device_vector<sgm::output_type> d_actual(
+		wta.get_right_output(), wta.get_right_output() + (pitch * height));
+	const auto actual = to_host_vector(d_actual);
+	EXPECT_EQ(actual, expect);
+	debug_compare(actual.data(), expect.data(), pitch, height, 1);
+}
+
+TEST(WinnerTakesAllTest, RandomRight){
+	test_random_right();
+}
+
+TEST(WinnerTakesAllTest, RandomRightWithPitch){
+	test_random_right(27);
+}