Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
ftl
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Nicolas Pope
ftl
Commits
9ddef690
Commit
9ddef690
authored
5 years ago
by
Nicolas Pope
Browse files
Options
Downloads
Patches
Plain Diff
Warp pixel optimize
parent
c160edaf
No related branches found
No related tags found
1 merge request
!191
Warp optimise correspondence search
Pipeline
#16960
passed
5 years ago
Stage: all
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
components/operators/src/correspondence.cu
+90
-21
90 additions, 21 deletions
components/operators/src/correspondence.cu
components/renderers/cpp/include/ftl/cuda/warp.hpp
+35
-0
35 additions, 0 deletions
components/renderers/cpp/include/ftl/cuda/warp.hpp
with
125 additions
and
21 deletions
components/operators/src/correspondence.cu
+
90
−
21
View file @
9ddef690
#include
"mvmls_cuda.hpp"
#include
<ftl/cuda/weighting.hpp>
#include
<ftl/cuda/mask.hpp>
#include
<ftl/cuda/warp.hpp>
using
ftl
::
cuda
::
TextureObject
;
using
ftl
::
rgbd
::
Camera
;
...
...
@@ -8,6 +9,8 @@ using ftl::cuda::Mask;
using
ftl
::
cuda
::
MvMLSParams
;
#define T_PER_BLOCK 8
#define WARP_SIZE 32
#define INTERVAL 1
template
<
int
FUNCTION
>
__device__
float
weightFunction
(
const
ftl
::
cuda
::
MvMLSParams
&
params
,
float
dweight
,
float
cweight
);
...
...
@@ -42,6 +45,45 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams ¶ms,
return
(
cweight
>
0.0
f
)
?
dweight
:
0.0
f
;
}
#ifndef PINF
#define PINF __int_as_float(0x7f800000)
#endif
__device__
inline
float
distance
(
float4
p1
,
float4
p2
)
{
return
min
(
1.0
f
,
max
(
max
(
fabsf
(
p1
.
x
-
p2
.
x
),
fabsf
(
p1
.
y
-
p2
.
y
)),
fabsf
(
p1
.
z
-
p2
.
z
))
/
10.0
f
);
//return min(1.0f, ftl::cuda::colourDistance(p1, p2) / 10.0f);
}
__device__
inline
int
halfWarpCensus
(
float
e
)
{
float
e0
=
__shfl_sync
(
FULL_MASK
,
e
,
(
threadIdx
.
x
>=
16
)
?
16
:
0
,
WARP_SIZE
);
int
c
=
(
e
>
e0
)
?
1
<<
(
threadIdx
.
x
%
16
)
:
0
;
for
(
int
i
=
WARP_SIZE
/
4
;
i
>
0
;
i
/=
2
)
{
const
int
other
=
__shfl_xor_sync
(
FULL_MASK
,
c
,
i
,
WARP_SIZE
);
c
|=
other
;
}
return
c
;
}
__device__
inline
float4
relativeDelta
(
const
float4
&
e
)
{
const
float
e0x
=
__shfl_sync
(
FULL_MASK
,
e
.
x
,
0
,
WARP_SIZE
/
2
);
const
float
e0y
=
__shfl_sync
(
FULL_MASK
,
e
.
y
,
0
,
WARP_SIZE
/
2
);
const
float
e0z
=
__shfl_sync
(
FULL_MASK
,
e
.
z
,
0
,
WARP_SIZE
/
2
);
return
make_float4
(
e
.
x
-
e0x
,
e
.
y
-
e0y
,
e
.
z
-
e0z
,
0.0
f
);
}
/**
* See: Birchfield S. et al. (1998). A pixel dissimilarity measure that is
* insensitive to image sampling. IEEE Transactions on Pattern Analysis and
* Machine Intelligence.
*/
__device__
float
dissimilarity
(
const
float4
&
l
,
const
float4
&
rp
,
const
float4
&
rc
,
const
float4
&
rn
)
{
const
float
rpd
=
distance
((
rc
-
rp
)
*
0.5
f
+
rp
,
l
);
const
float
rnd
=
distance
((
rc
-
rn
)
*
0.5
f
+
rn
,
l
);
const
float
rcd
=
distance
(
rc
,
l
);
return
min
(
min
(
rpd
,
rnd
),
rcd
);
}
template
<
int
COR_STEPS
,
int
FUNCTION
>
__global__
void
corresponding_point_kernel
(
TextureObject
<
float
>
d1
,
...
...
@@ -59,8 +101,13 @@ __global__ void corresponding_point_kernel(
// Each warp picks point in p1
//const int tid = (threadIdx.x + threadIdx.y * blockDim.x);
const
int
x
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
);
// / WARP_SIZE;
const
int
y
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
//const int x = (blockIdx.x*blockDim.x + threadIdx.x); // / WARP_SIZE;
//const int y = blockIdx.y*blockDim.y + threadIdx.y;
const
int
tid
=
(
threadIdx
.
x
+
threadIdx
.
y
*
blockDim
.
x
);
const
int
x
=
(
blockIdx
.
x
*
8
+
(
threadIdx
.
x
%
4
)
+
4
*
(
threadIdx
.
x
/
16
));
// / WARP_SIZE;
const
int
y
=
blockIdx
.
y
*
8
+
threadIdx
.
x
/
4
+
4
*
threadIdx
.
y
;
if
(
x
>=
0
&&
y
>=
0
&&
x
<
screenOut
.
width
()
&&
y
<
screenOut
.
height
())
{
screenOut
(
x
,
y
)
=
make_short2
(
-
1
,
-
1
);
...
...
@@ -88,37 +135,51 @@ __global__ void corresponding_point_kernel(
int
count
=
0
;
float
contrib
=
0.0
f
;
const
float
step_interval
=
params
.
range
/
(
COR_STEPS
/
2
);
//
const float step_interval = params.range / (COR_STEPS / 2);
const
float3
rayStep_world
=
pose1
.
getFloat3x3
()
*
cam1
.
screenToCam
(
x
,
y
,
step_interval
);
const
float3
rayStart_2
=
pose2
*
world1
;
const
float3
rayStep_2
=
pose2
.
getFloat3x3
()
*
rayStep_world
;
//const float3 rayStep_world = pose1.getFloat3x3() * cam1.screenToCam(x,y,step_interval);
//const float3 rayStart_2 = pose2 * world1;
//const float3 rayStep_2 = pose2.getFloat3x3() * rayStep_world;
const
float3
camPosOrigin
=
pose2
*
world1
;
const
float2
lineOrigin
=
cam2
.
camToScreen
<
float2
>
(
camPosOrigin
);
const
float3
camPosDistant
=
pose2
*
(
pose1
*
cam1
.
screenToCam
(
x
,
y
,
depth1
+
10.0
f
));
const
float2
lineDistant
=
cam2
.
camToScreen
<
float2
>
(
camPosDistant
);
const
float
lineM
=
(
lineDistant
.
y
-
lineOrigin
.
y
)
/
(
lineDistant
.
x
-
lineOrigin
.
x
);
const
float
depthM
=
10.0
f
/
(
lineDistant
.
x
-
lineOrigin
.
x
);
const
float
depthM2
=
(
camPosDistant
.
z
-
camPosOrigin
.
z
)
/
(
lineDistant
.
x
-
lineOrigin
.
x
);
float2
linePos
;
linePos
.
x
=
lineOrigin
.
x
-
((
COR_STEPS
/
2
));
linePos
.
y
=
lineOrigin
.
y
-
(((
COR_STEPS
/
2
))
*
lineM
);
float
depthPos
=
depth1
-
(
float
((
COR_STEPS
/
2
))
*
depthM
);
float
depthPos2
=
camPosOrigin
.
z
-
(
float
((
COR_STEPS
/
2
))
*
depthM2
);
// Project to p2 using cam2
// Each thread takes a possible correspondence and calculates a weighting
//const int lane = tid % WARP_SIZE;
for
(
int
i
=
0
;
i
<
COR_STEPS
;
++
i
)
{
const
int
j
=
i
-
(
COR_STEPS
/
2
);
const
float
depth_adjust
=
(
float
)
j
*
step_interval
;
//
const int j = i - (COR_STEPS/2);
//
const float depth_adjust = (float)j * step_interval;
// Calculate adjusted depth 3D point in camera 2 space
const
float3
worldPos
=
world1
+
j
*
rayStep_world
;
//(pose1 * cam1.screenToCam(x, y, depth_adjust));
const
float3
camPos
=
rayStart_2
+
j
*
rayStep_2
;
//pose2 * worldPos;
const
float2
screen
=
cam2
.
camToScreen
<
float2
>
(
camPos
);
//
const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust));
//
const float3 camPos = rayStart_2 + j * rayStep_2; //pose2 * worldPos;
//
const float2 screen = cam2.camToScreen<float2>(camPos);
float
weight
=
(
screen
.
x
>=
cam2
.
width
||
screen
.
y
>=
cam2
.
height
)
?
0.0
f
:
1.0
f
;
float
weight
=
(
linePos
.
x
>=
cam2
.
width
||
linePos
.
y
>=
cam2
.
height
)
?
0.0
f
:
1.0
f
;
// Generate a colour correspondence value
const
auto
colour2
=
c2
.
tex2D
(
screen
.
x
,
screen
.
y
);
const
auto
colour2
=
c2
.
tex2D
(
linePos
.
x
,
linePos
.
y
);
const
float
cweight
=
ftl
::
cuda
::
colourWeighting
(
colour1
,
colour2
,
params
.
colour_smooth
);
// Generate a depth correspondence value
const
float
depth2
=
d2
.
tex2D
(
int
(
screen
.
x
+
0.5
f
),
int
(
screen
.
y
+
0.5
f
));
const
float
depth2
=
d2
.
tex2D
(
int
(
linePos
.
x
+
0.5
f
),
int
(
linePos
.
y
+
0.5
f
));
if
(
FUNCTION
==
1
)
{
weight
*=
ftl
::
cuda
::
weighting
(
fabs
(
depth2
-
cam
Pos
.
z
),
cweight
*
params
.
spatial_smooth
);
weight
*=
ftl
::
cuda
::
weighting
(
fabs
(
depth2
-
depth
Pos
2
),
cweight
*
params
.
spatial_smooth
);
}
else
{
const
float
dweight
=
ftl
::
cuda
::
weighting
(
fabs
(
depth2
-
cam
Pos
.
z
),
params
.
spatial_smooth
);
const
float
dweight
=
ftl
::
cuda
::
weighting
(
fabs
(
depth2
-
depth
Pos
2
),
params
.
spatial_smooth
);
weight
*=
weightFunction
<
FUNCTION
>
(
params
,
dweight
,
cweight
);
}
//const float dweight = ftl::cuda::weighting(fabs(depth_adjust), 10.0f*params.range);
...
...
@@ -130,14 +191,19 @@ __global__ void corresponding_point_kernel(
bestcolour
=
max
(
cweight
,
bestcolour
);
//bestdweight = max(dweight, bestdweight);
totalcolour
+=
cweight
;
bestdepth
=
(
weight
>
bestweight
)
?
depth
_adjust
:
bestdepth
;
bestdepth
=
(
weight
>
bestweight
)
?
depth
Pos
:
bestdepth
;
//bestdepth2 = (weight > bestweight) ? camPos.z : bestdepth2;
//bestScreen = (weight > bestweight) ? make_short2(screen.x+0.5f, screen.y+0.5f) : bestScreen;
bestweight
=
max
(
bestweight
,
weight
);
//bestweight = weight;
//bestdepth = depth_adjust;
//bestScreen = make_short2(screen.x+0.5f, screen.y+0.5f);
//}
//}
depthPos
+=
depthM
*
float
(
INTERVAL
);
depthPos2
+=
depthM2
*
float
(
INTERVAL
);
linePos
.
x
+=
float
(
INTERVAL
);
linePos
.
y
+=
lineM
*
float
(
INTERVAL
);
}
const
float
avgcolour
=
totalcolour
/
(
float
)
count
;
...
...
@@ -149,7 +215,7 @@ __global__ void corresponding_point_kernel(
float
old
=
conf
.
tex2D
(
x
,
y
);
if
(
bestweight
*
confidence
>
old
)
{
d1
(
x
,
y
)
=
0.4
f
*
bestdepth
+
depth1
;
d1
(
x
,
y
)
=
(
0.4
f
*
(
bestdepth
-
depth1
))
+
depth1
;
//d2(bestScreen.x, bestScreen.y) = bestdepth2;
//screenOut(x,y) = bestScreen;
conf
(
x
,
y
)
=
bestweight
*
confidence
;
...
...
@@ -176,8 +242,11 @@ void ftl::cuda::correspondence(
const
Camera
&
cam2
,
const
MvMLSParams
&
params
,
int
func
,
cudaStream_t
stream
)
{
const
dim3
gridSize
((
d1
.
width
()
+
T_PER_BLOCK
-
1
)
/
T_PER_BLOCK
,
(
d1
.
height
()
+
T_PER_BLOCK
-
1
)
/
T_PER_BLOCK
);
const
dim3
blockSize
(
T_PER_BLOCK
,
T_PER_BLOCK
);
//const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
//const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
const
dim3
gridSize
((
d1
.
width
()
+
1
),
(
d1
.
height
()
+
T_PER_BLOCK
-
1
)
/
T_PER_BLOCK
);
const
dim3
blockSize
(
WARP_SIZE
,
2
);
//printf("COR SIZE %d,%d\n", p1.width(), p1.height());
...
...
This diff is collapsed.
Click to expand it.
components/renderers/cpp/include/ftl/cuda/warp.hpp
+
35
−
0
View file @
9ddef690
...
...
@@ -42,6 +42,41 @@ __device__ inline int warpSum(int e) {
return
e
;
}
// Half warp
__device__
inline
float
halfWarpMin
(
float
e
)
{
for
(
int
i
=
WARP_SIZE
/
4
;
i
>
0
;
i
/=
2
)
{
const
float
other
=
__shfl_xor_sync
(
FULL_MASK
,
e
,
i
,
WARP_SIZE
);
e
=
min
(
e
,
other
);
}
return
e
;
}
__device__
inline
float
halfWarpMax
(
float
e
)
{
for
(
int
i
=
WARP_SIZE
/
4
;
i
>
0
;
i
/=
2
)
{
const
float
other
=
__shfl_xor_sync
(
FULL_MASK
,
e
,
i
,
WARP_SIZE
);
e
=
max
(
e
,
other
);
}
return
e
;
}
__device__
inline
float
halfWarpSum
(
float
e
)
{
for
(
int
i
=
WARP_SIZE
/
4
;
i
>
0
;
i
/=
2
)
{
const
float
other
=
__shfl_xor_sync
(
FULL_MASK
,
e
,
i
,
WARP_SIZE
);
e
+=
other
;
}
return
e
;
}
__device__
inline
int
halfWarpSum
(
int
e
)
{
for
(
int
i
=
WARP_SIZE
/
4
;
i
>
0
;
i
/=
2
)
{
const
float
other
=
__shfl_xor_sync
(
FULL_MASK
,
e
,
i
,
WARP_SIZE
);
e
+=
other
;
}
return
e
;
}
}
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment