Skip to content
Snippets Groups Projects
Commit acbc855f authored by Nicolas Pope's avatar Nicolas Pope
Browse files

Variance weights for libsgm

parent 71fc80f5
No related branches found
No related tags found
No related merge requests found
Showing
with 202 additions and 55 deletions
......@@ -65,6 +65,8 @@ class FixstarsSGM : public ftl::operators::Operator {
cv::cuda::GpuMat disp_int_;
cv::cuda::GpuMat P2_map_;
cv::cuda::GpuMat weights_;
cv::cuda::GpuMat weightsF_;
cv::cuda::GpuMat edges_;
cv::Ptr<cv::cuda::CannyEdgeDetector> canny_;
......
......@@ -5,6 +5,7 @@
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudafilters.hpp>
using cv::Size;
using cv::cuda::GpuMat;
......@@ -15,6 +16,33 @@ using ftl::rgbd::Frame;
using ftl::rgbd::Source;
using ftl::operators::FixstarsSGM;
static void variance_mask(cv::InputArray in, cv::OutputArray out, int wsize, cv::cuda::Stream &cvstream) {
if (in.isGpuMat() && out.isGpuMat()) {
cv::cuda::GpuMat im;
cv::cuda::GpuMat im2;
cv::cuda::GpuMat mean;
cv::cuda::GpuMat mean2;
mean.create(in.size(), CV_32FC1);
mean2.create(in.size(), CV_32FC1);
im2.create(in.size(), CV_32FC1);
in.getGpuMat().convertTo(im, CV_32FC1, cvstream);
cv::cuda::multiply(im, im, im2, 1.0, CV_32FC1, cvstream);
auto filter = cv::cuda::createBoxFilter(CV_32FC1, CV_32FC1, cv::Size(wsize,wsize));
filter->apply(im, mean, cvstream); // E[X]
filter->apply(im2, mean2, cvstream); // E[X^2]
cv::cuda::multiply(mean, mean, mean, 1.0, -1, cvstream); // (E[X])^2
// NOTE: floating point accuracy in subtraction
// (cv::cuda::createBoxFilter only supports float and 8 bit integer types)
cv::cuda::subtract(mean2, mean, out.getGpuMatRef(), cv::noArray(), -1, cvstream); // E[X^2] - (E[X])^2
}
else { throw std::exception(); /* todo CPU version */ }
}
void FixstarsSGM::computeP2(cudaStream_t &stream) {
const int P3 = config()->value("P3", P2_);
auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream);
......@@ -115,6 +143,8 @@ bool FixstarsSGM::init() {
lbw_.create(size_, CV_8UC1);
rbw_.create(size_, CV_8UC1);
disp_int_.create(size_, CV_16SC1);
weights_.create(size_, CV_32FC1);
weights_.setTo(1.0);
LOG(INFO) << "INIT FIXSTARS";
......@@ -164,8 +194,19 @@ bool FixstarsSGM::apply(Frame &in, Frame &out, cudaStream_t stream) {
//cvstream.waitForCompletion();
computeP2(stream);
//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, stream);
bool use_variance = config()->value("use_variance", true);
if (use_variance) {
variance_mask(lbw_, weightsF_, config()->value("var_wsize", 11), cvstream);
float minweight = std::min(1.0f, std::max(0.0f, config()->value("var_minweight", 0.5f)));
cv::cuda::normalize(weightsF_, weightsF_, minweight, 1.0, cv::NORM_MINMAX, -1, cv::noArray(), cvstream);
weightsF_.convertTo(weights_, CV_8UC1, 255.0f);
//if ((int)P2_map_.step != P2_map_.cols) LOG(ERROR) << "P2 map step error: " << P2_map_.cols << "," << P2_map_.step;
ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, (uint8_t*) weights_.data, weights_.step1(), stream);
} else {
ssgm_->execute(lbw_.data, rbw_.data, disp_int_.data, P2_map_.data, nullptr, 0, stream);
}
// GpuMat left_pixels(dispt_, cv::Rect(0, 0, max_disp_, dispt_.rows));
// left_pixels.setTo(0);
......
......@@ -84,7 +84,7 @@ namespace sgm {
* @attention
* output_depth_bits must be set to 16 when subpixel is enabled.
*/
LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
EXECUTE_INOUT inout_type, const Parameters& param = Parameters());
/**
......@@ -114,13 +114,13 @@ namespace sgm {
* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
* Note that dst element value would be multiplied StereoSGM::SUBPIXEL_SCALE if subpixel option was enabled.
*/
LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream);
LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
/**
* Same as execute(left_pixels, right_pixels, dst) with image size parameters.
* Dimensions must be smaller or equal to dimensions provided in constructor.
*/
LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream);
LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream);
/**
* Mask for invalid pixels. Must have same shape and pitch as src. Pixels which have non-zero values
......@@ -129,7 +129,7 @@ namespace sgm {
LIBSGM_API void setMask(uint8_t* mask, int pitch);
/**
* Update parameters. Returns true if successful.
* Update parameters. Returns true if successful.
*/
LIBSGM_API bool updateParameters(const Parameters &params);
......
......@@ -37,7 +37,9 @@ __global__ void aggregate_horizontal_path_kernel(
int height,
unsigned int p1,
const uint8_t* __restrict__ p2,
int p2_pitch)
int p2_pitch,
const uint8_t* __restrict__ w,
int w_pitch)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
......@@ -146,7 +148,7 @@ __global__ void aggregate_horizontal_path_kernel(
for(unsigned int k = 0; k < DP_BLOCK_SIZE; ++k){
local_costs[k] = __popc(left_value ^ right_buffer[j][k]);
}
dp[j].update(local_costs, p1, p2[x], shfl_mask);
dp[j].update(local_costs, p1, p2[x], w ? float(w[x])/255.0f : 1.0f, shfl_mask);
store_uint8_vector<DP_BLOCK_SIZE>(
&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
dp[j].dp);
......@@ -167,6 +169,8 @@ void enqueue_aggregate_left2right_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -176,7 +180,7 @@ void enqueue_aggregate_left2right_path(
const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_horizontal_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
template <unsigned int MAX_DISPARITY>
......@@ -189,6 +193,8 @@ void enqueue_aggregate_right2left_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -198,7 +204,7 @@ void enqueue_aggregate_right2left_path(
const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_horizontal_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
......@@ -211,6 +217,8 @@ template void enqueue_aggregate_left2right_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_left2right_path<128u>(
......@@ -222,8 +230,10 @@ template void enqueue_aggregate_left2right_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_left2right_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -233,6 +243,8 @@ template void enqueue_aggregate_left2right_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_right2left_path<64u>(
......@@ -244,6 +256,8 @@ template void enqueue_aggregate_right2left_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_right2left_path<128u>(
......@@ -255,8 +269,10 @@ template void enqueue_aggregate_right2left_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_right2left_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -266,6 +282,8 @@ template void enqueue_aggregate_right2left_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
......@@ -32,6 +32,8 @@ void enqueue_aggregate_left2right_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template <unsigned int MAX_DISPARITY>
......@@ -44,6 +46,8 @@ void enqueue_aggregate_right2left_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
......@@ -33,7 +33,9 @@ __global__ void aggregate_oblique_path_kernel(
int height,
unsigned int p1,
const uint8_t* __restrict__ p2,
int p2_pitch)
int p2_pitch,
const uint8_t* __restrict__ w,
int w_pitch)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
......@@ -105,7 +107,7 @@ __global__ void aggregate_oblique_path_kernel(
for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
local_costs[j] = __popc(left_value ^ right_values[j]);
}
dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask);
store_uint8_vector<DP_BLOCK_SIZE>(
&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
dp.dp);
......@@ -125,6 +127,8 @@ void enqueue_aggregate_upleft2downright_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -133,7 +137,7 @@ void enqueue_aggregate_upleft2downright_path(
const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_oblique_path_kernel<1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
template <unsigned int MAX_DISPARITY>
......@@ -146,6 +150,8 @@ void enqueue_aggregate_upright2downleft_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -154,7 +160,7 @@ void enqueue_aggregate_upright2downleft_path(
const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_oblique_path_kernel<-1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
template <unsigned int MAX_DISPARITY>
......@@ -167,6 +173,8 @@ void enqueue_aggregate_downright2upleft_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -175,7 +183,7 @@ void enqueue_aggregate_downright2upleft_path(
const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_oblique_path_kernel<-1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
template <unsigned int MAX_DISPARITY>
......@@ -188,6 +196,8 @@ void enqueue_aggregate_downleft2upright_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -196,7 +206,7 @@ void enqueue_aggregate_downleft2upright_path(
const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_oblique_path_kernel<1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
......@@ -209,6 +219,8 @@ template void enqueue_aggregate_upleft2downright_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_upleft2downright_path<128u>(
......@@ -220,8 +232,10 @@ template void enqueue_aggregate_upleft2downright_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_upleft2downright_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -231,6 +245,8 @@ template void enqueue_aggregate_upleft2downright_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_upright2downleft_path<64u>(
......@@ -242,6 +258,8 @@ template void enqueue_aggregate_upright2downleft_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_upright2downleft_path<128u>(
......@@ -253,8 +271,10 @@ template void enqueue_aggregate_upright2downleft_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_upright2downleft_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -264,6 +284,8 @@ template void enqueue_aggregate_upright2downleft_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downright2upleft_path<64u>(
......@@ -275,6 +297,8 @@ template void enqueue_aggregate_downright2upleft_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downright2upleft_path<128u>(
......@@ -286,8 +310,10 @@ template void enqueue_aggregate_downright2upleft_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downright2upleft_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -297,6 +323,8 @@ template void enqueue_aggregate_downright2upleft_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downleft2upright_path<64u>(
......@@ -308,6 +336,8 @@ template void enqueue_aggregate_downleft2upright_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downleft2upright_path<128u>(
......@@ -319,8 +349,10 @@ template void enqueue_aggregate_downleft2upright_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_downleft2upright_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -330,6 +362,8 @@ template void enqueue_aggregate_downleft2upright_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
......@@ -32,6 +32,8 @@ void enqueue_aggregate_upleft2downright_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template <unsigned int MAX_DISPARITY>
......@@ -44,6 +46,8 @@ void enqueue_aggregate_upright2downleft_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template <unsigned int MAX_DISPARITY>
......@@ -56,6 +60,8 @@ void enqueue_aggregate_downright2upleft_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template <unsigned int MAX_DISPARITY>
......@@ -68,6 +74,8 @@ void enqueue_aggregate_downleft2upright_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
......@@ -49,6 +49,8 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
const size_t buffer_size = width * height * MAX_DISPARITY * NUM_PATHS;
......@@ -59,28 +61,28 @@ void PathAggregation<MAX_DISPARITY>::enqueue(
cudaStreamSynchronize(stream);
path_aggregation::enqueue_aggregate_up2down_path<MAX_DISPARITY>(
m_cost_buffer.data() + 0 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[0]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[0]);
path_aggregation::enqueue_aggregate_down2up_path<MAX_DISPARITY>(
m_cost_buffer.data() + 1 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[1]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[1]);
path_aggregation::enqueue_aggregate_left2right_path<MAX_DISPARITY>(
m_cost_buffer.data() + 2 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[2]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[2]);
path_aggregation::enqueue_aggregate_right2left_path<MAX_DISPARITY>(
m_cost_buffer.data() + 3 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[3]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[3]);
path_aggregation::enqueue_aggregate_upleft2downright_path<MAX_DISPARITY>(
m_cost_buffer.data() + 4 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[4]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[4]);
path_aggregation::enqueue_aggregate_upright2downleft_path<MAX_DISPARITY>(
m_cost_buffer.data() + 5 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[5]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[5]);
path_aggregation::enqueue_aggregate_downright2upleft_path<MAX_DISPARITY>(
m_cost_buffer.data() + 6 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[6]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[6]);
path_aggregation::enqueue_aggregate_downleft2upright_path<MAX_DISPARITY>(
m_cost_buffer.data() + 7 * buffer_step,
left, right, width, height, p1, p2, p2_pitch, m_streams[7]);
left, right, width, height, p1, p2, p2_pitch, w, w_pitch, m_streams[7]);
for(unsigned int i = 0; i < NUM_PATHS; ++i){
cudaEventRecord(m_events[i], m_streams[i]);
cudaStreamWaitEvent(stream, m_events[i], 0);
......
......@@ -31,7 +31,7 @@ private:
DeviceBuffer<cost_type> m_cost_buffer;
cudaStream_t m_streams[NUM_PATHS];
cudaEvent_t m_events[NUM_PATHS];
public:
PathAggregation();
~PathAggregation();
......@@ -48,6 +48,8 @@ public:
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
};
......
......@@ -32,7 +32,7 @@ struct DynamicProgramming {
DP_BLOCK_SIZE >= 2,
"DP_BLOCK_SIZE must be greater than or equal to 2");
static_assert(
(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
(SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0,
"SUBGROUP_SIZE must be a power of 2");
uint32_t last_min;
......@@ -45,7 +45,7 @@ struct DynamicProgramming {
}
__device__ void update(
uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
uint32_t *local_costs, uint32_t p1, uint32_t p2, float w, uint32_t mask)
{
const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
......@@ -62,14 +62,14 @@ struct DynamicProgramming {
uint32_t out = min(dp[k] - last_min, p2);
if(lane_id != 0){ out = min(out, prev - last_min + p1); }
out = min(out, dp[k + 1] - last_min + p1);
lazy_out = local_min = out + local_costs[k];
lazy_out = local_min = out + round(local_costs[k]*w);
}
for(unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k){
uint32_t out = min(dp[k] - last_min, p2);
out = min(out, dp[k - 1] - last_min + p1);
out = min(out, dp[k + 1] - last_min + p1);
dp[k - 1] = lazy_out;
lazy_out = out + local_costs[k];
lazy_out = out + round(local_costs[k]*w);
local_min = min(local_min, lazy_out);
}
{
......@@ -85,7 +85,7 @@ struct DynamicProgramming {
out = min(out, next - last_min + p1);
}
dp[k - 1] = lazy_out;
dp[k] = out + local_costs[k];
dp[k] = out + round(local_costs[k]*w);
local_min = min(local_min, dp[k]);
}
last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
......
......@@ -53,6 +53,8 @@ public:
int dst_pitch,
unsigned int penalty1,
const uint8_t *penalty2,
const uint8_t *weights,
int weights_pitch,
float uniqueness,
bool subpixel,
cudaStream_t stream)
......@@ -66,7 +68,8 @@ public:
m_census_right.get_output(),
width, height,
penalty1, penalty2,
src_pitch,
src_pitch, // bug?
weights, weights_pitch,
stream);
m_winner_takes_all.enqueue(
dest_left, dest_right,
......@@ -99,6 +102,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
int dst_pitch,
unsigned int penalty1,
const uint8_t *penalty2,
const uint8_t *weights,
int weights_pitch,
float uniqueness,
bool subpixel,
cudaStream_t stream)
......@@ -109,6 +114,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::execute(
width, height,
src_pitch, dst_pitch,
penalty1, penalty2,
weights, weights_pitch,
uniqueness, subpixel,
stream);
//cudaStreamSynchronize(0);
......@@ -126,6 +132,8 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
int dst_pitch,
unsigned int penalty1,
const uint8_t *penalty2,
const uint8_t *weights,
int weights_pitch,
float uniqueness,
bool subpixel,
cudaStream_t stream)
......@@ -136,6 +144,7 @@ void SemiGlobalMatching<T, MAX_DISPARITY>::enqueue(
width, height,
src_pitch, dst_pitch,
penalty1, penalty2,
weights, weights_pitch,
uniqueness, subpixel,
stream);
}
......
......@@ -49,6 +49,8 @@ public:
int dst_pitch,
unsigned int penalty1,
const uint8_t *penalty2,
const uint8_t *weights,
int weights_pitch,
float uniqueness,
bool subpixel,
cudaStream_t stream);
......@@ -64,6 +66,8 @@ public:
int dst_pitch,
unsigned int penalty1,
const uint8_t *penalty2,
const uint8_t *weights,
int weights_pitch,
float uniqueness,
bool subpixel,
cudaStream_t stream);
......
......@@ -28,8 +28,8 @@ namespace sgm {
class SemiGlobalMatchingBase {
public:
using output_type = sgm::output_type;
virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
virtual void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) = 0;
virtual ~SemiGlobalMatchingBase() {}
};
......@@ -38,9 +38,9 @@ namespace sgm {
class SemiGlobalMatchingImpl : public SemiGlobalMatchingBase {
public:
void execute(output_type* dst_L, output_type* dst_R, const void* src_L, const void* src_R,
int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, float uniqueness, bool subpixel, cudaStream_t stream) override
int w, int h, int sp, int dp, unsigned int P1, const uint8_t *P2, const uint8_t *weights, int weights_pitch, float uniqueness, bool subpixel, cudaStream_t stream) override
{
sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, uniqueness, subpixel, stream);
sgm_engine_.execute(dst_L, dst_R, (const input_type*)src_L, (const input_type*)src_R, w, h, sp, dp, P1, P2, weights, weights_pitch, uniqueness, subpixel, stream);
}
private:
SemiGlobalMatching<input_type, DISP_SIZE> sgm_engine_;
......@@ -54,7 +54,7 @@ namespace sgm {
void* d_tmp_left_disp;
void* d_tmp_right_disp;
uint8_t* d_mask;
SemiGlobalMatchingBase* sgm_engine;
CudaStereoSGMResources(int width_, int height_, int disparity_size_, int input_depth_bits_, int output_depth_bits_, int src_pitch_, int dst_pitch_, EXECUTE_INOUT inout_type_) {
......@@ -80,7 +80,7 @@ namespace sgm {
CudaSafeCall(cudaMalloc(&this->d_src_left, input_depth_bits_ / 8 * src_pitch_ * height_));
CudaSafeCall(cudaMalloc(&this->d_src_right, input_depth_bits_ / 8 * src_pitch_ * height_));
}
CudaSafeCall(cudaMalloc(&this->d_left_disp, sizeof(uint16_t) * dst_pitch_ * height_));
CudaSafeCall(cudaMalloc(&this->d_right_disp, sizeof(uint16_t) * dst_pitch_ * height_));
......@@ -149,7 +149,8 @@ namespace sgm {
if (cu_res_) { delete cu_res_; }
}
void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch, const uint8_t *P2, cudaStream_t stream) {
void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const int width, const int height, const int src_pitch, const int dst_pitch,
const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
const void *d_input_left, *d_input_right;
......@@ -171,9 +172,9 @@ namespace sgm {
if (is_cuda_output(inout_type_) && output_depth_bits_ == 16)
d_left_disp = dst; // when threre is no device-host copy or type conversion, use passed buffer
cu_res_->sgm_engine->execute((uint16_t*)d_tmp_left_disp, (uint16_t*)d_tmp_right_disp,
d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, param_.uniqueness, param_.subpixel, stream);
d_input_left, d_input_right, width, height, src_pitch, dst_pitch, param_.P1, P2, weights, weights_pitch, param_.uniqueness, param_.subpixel, stream);
sgm::details::median_filter((uint16_t*)d_tmp_left_disp, (uint16_t*)d_left_disp, width, height, dst_pitch, stream);
sgm::details::median_filter((uint16_t*)d_tmp_right_disp, (uint16_t*)d_right_disp, width, height, dst_pitch, stream);
......@@ -197,8 +198,8 @@ namespace sgm {
}
}
void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, cudaStream_t stream) {
execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, stream);
void StereoSGM::execute(const void* left_pixels, const void* right_pixels, void* dst, const uint8_t *P2, const uint8_t *weights, int weights_pitch, cudaStream_t stream) {
execute(left_pixels, right_pixels, dst, width_, height_, src_pitch_, dst_pitch_, P2, weights, weights_pitch, stream);
}
bool StereoSGM::updateParameters(const Parameters &params) {
......@@ -208,7 +209,7 @@ namespace sgm {
if ((params.uniqueness < 0.0) || (params.uniqueness > 1.0)) {
return false;
}
Parameters params_ = params;
std::swap(params_, this->param_);
return true;
......
......@@ -33,7 +33,9 @@ __global__ void aggregate_vertical_path_kernel(
int height,
unsigned int p1,
const uint8_t* __restrict__ p2,
int p2_pitch)
int p2_pitch,
const uint8_t* __restrict__ w,
int w_pitch)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
......@@ -103,7 +105,7 @@ __global__ void aggregate_vertical_path_kernel(
for(unsigned int j = 0; j < DP_BLOCK_SIZE; ++j){
local_costs[j] = __popc(left_value ^ right_values[j]);
}
dp.update(local_costs, p1, p2[x+y*p2_pitch], shfl_mask);
dp.update(local_costs, p1, p2[x+y*p2_pitch], w ? float(w[x+y*w_pitch])/255.0f : 1.0f, shfl_mask);
store_uint8_vector<DP_BLOCK_SIZE>(
&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
dp.dp);
......@@ -122,6 +124,8 @@ void enqueue_aggregate_up2down_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -130,7 +134,7 @@ void enqueue_aggregate_up2down_path(
const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_vertical_path_kernel<1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
template <unsigned int MAX_DISPARITY>
......@@ -143,6 +147,8 @@ void enqueue_aggregate_down2up_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
......@@ -151,7 +157,7 @@ void enqueue_aggregate_down2up_path(
const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
aggregate_vertical_path_kernel<-1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, p2_pitch);
dest, left, right, width, height, p1, p2, p2_pitch, w, w_pitch);
}
......@@ -164,6 +170,8 @@ template void enqueue_aggregate_up2down_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_up2down_path<128u>(
......@@ -175,8 +183,10 @@ template void enqueue_aggregate_up2down_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_up2down_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -186,6 +196,8 @@ template void enqueue_aggregate_up2down_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_down2up_path<64u>(
......@@ -197,6 +209,8 @@ template void enqueue_aggregate_down2up_path<64u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_down2up_path<128u>(
......@@ -208,8 +222,10 @@ template void enqueue_aggregate_down2up_path<128u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template void enqueue_aggregate_down2up_path<256u>(
cost_type *dest,
const feature_type *left,
......@@ -219,6 +235,8 @@ template void enqueue_aggregate_down2up_path<256u>(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
......@@ -32,6 +32,8 @@ void enqueue_aggregate_up2down_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
template <unsigned int MAX_DISPARITY>
......@@ -44,6 +46,8 @@ void enqueue_aggregate_down2up_path(
unsigned int p1,
const uint8_t *p2,
int p2_pitch,
const uint8_t* w,
int w_pitch,
cudaStream_t stream);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment