Project: ftl (Nicolas Pope)

Commit 9a28d56d
Authored 6 years ago by Nicolas Pope

    Speed boost without quality loss through local caching

Parent: 4376f7d4
Showing 1 changed file with 24 additions and 12 deletions:

cv-node/src/algorithms/rtcensus.cu  +24 −12
@@ -100,20 +100,25 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		size_t ds) {
 	//extern __shared__ uint64_t cache[];
-	const int gamma = 10;
+	const int gamma = 1;
 
-	size_t u = (blockIdx.x*BLOCK_W) + threadIdx.x + RADIUS2;
-	size_t v_start = (blockIdx.y*ROWSperTHREAD) + RADIUS2;
-	size_t v_end = v_start + ROWSperTHREAD;
+	int u = (blockIdx.x*BLOCK_W) + threadIdx.x + RADIUS2;
+	int v_start = (blockIdx.y*ROWSperTHREAD) + RADIUS2;
+	int v_end = v_start + ROWSperTHREAD;
+	int maxdisp = ds;
+
+	// Local cache
+	uint64_t l_cache_l1[5][5];
+	uint64_t l_cache_l2[5][5];
 
 	// Prepare the cache load
 	//const int cache_thread_width = (BLOCK_W+ds / BLOCK_W + RADIUS2*2 + 1)*2;
 	//uint64_t *cache_ptr = cache + (threadIdx.x * cache_thread_width);
 
 	if (v_end >= height) v_end = height;
-	if (u >= width-ds) return;
+	//if (u >= width-ds) return;
+	if (u+maxdisp >= width) maxdisp = width-u;
 
-	for (size_t v=v_start; v<v_end; v++) {
+	for (int v=v_start; v<v_end; v++) {
 		/*const int cache_start = v*width*2 + cache_thread_width*blockIdx.x;
 		for (int i=0; i<cache_thread_width; i+=2) {
 			cache_ptr[i] = census[cache_start+i];
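Besides the switch from size_t to int indices, this hunk changes how right-edge pixels are handled: previously any thread within ds of the image edge returned without computing anything, whereas now the disparity search range is merely clamped, which is presumably part of why the speedup comes "without quality loss". A minimal sketch of the new clamping logic in isolation (hypothetical helper, not part of the commit):

// Hypothetical helper illustrating the clamp introduced above.
// Before this commit: if (u >= width - ds) the thread returned and the
// rightmost ds columns received no disparity at all. After: the search
// is shortened so edge pixels still get a (reduced) disparity sweep.
__device__ int clamped_search_range(int u, int width, int ds) {
	int maxdisp = ds;
	if (u + maxdisp >= width) maxdisp = width - u;  // may shrink toward 0
	return maxdisp;  // the d-loop then runs over [0, maxdisp)
}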
@@ -122,6 +127,15 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		__syncthreads();*/
 
+		// Fill local cache for window 5x5
+		// TODO Use shared memory?
+		for (int m=-2; m<=2; m++) {
+			for (int n=-2; n<=2; n++) {
+				l_cache_l2[m+2][n+2] = uint2asull(tex2D<uint2>(censusL, u+n, v+m));
+				l_cache_l1[m+2][n+2] = uint2asull(tex2D<uint2>(censusR, u+n, v+m));
+			}
+		}
+
 		uint16_t last_ham1 = 65535;
 		uint16_t last_ham2 = 65535;
 		uint16_t min_disp1 = 65535;
@@ -136,7 +150,7 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		int dix2 = 0;
 
 		// TODO Use prediction textures to narrow range
-		for (size_t d=0; d<ds; d++) {
+		for (int d=0; d<maxdisp; d++) {
 			uint16_t hamming1 = 0;
 			uint16_t hamming2 = 0;
@@ -146,13 +160,11 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 				const auto v_ = (v+m);
 				for (int n=-2; n<=2; n++) {
 					const auto u_ = u+n;
-					auto l2 = uint2asull(tex2D<uint2>(censusL, u_, v_));
-					auto l1 = uint2asull(tex2D<uint2>(censusR, u_, v_));
+					auto l1 = l_cache_l1[m+2][n+2];
+					auto l2 = l_cache_l2[m+2][n+2];
 
-					// TODO Somehow might use shared memory
 					auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_));
 					auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_));
[The remainder of the diff is collapsed in this view.]
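The accumulation of hamming1 and hamming2 falls inside the collapsed region, but given the 64-bit census strings and the XOR-style matching this algorithm family uses, the inner step is presumably a population count of the XOR of corresponding strings. A hedged sketch of that step (not visible in this diff):

// Assumed inner cost step (the actual lines are collapsed above):
// census matching cost = Hamming distance between 64-bit bit strings.
hamming1 += __popcll(l1 ^ r1);  // right-image cost at disparity d
hamming2 += __popcll(l2 ^ r2);  // left-image cost at disparity d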
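Putting the pieces together, here is a self-contained sketch of the optimisation the commit message describes: preload the 5x5 census window of the reference pixel once, then sweep every disparity against the cached copy instead of re-fetching the window from texture memory for each d. The kernel name, launch shape, and cost-volume layout are hypothetical; only the caching pattern mirrors the commit.

#include <cstdint>
#include <cuda_runtime.h>

// Assumed helper; see the reconstruction sketched earlier.
__device__ inline uint64_t uint2asull(uint2 v) {
	return ((uint64_t)v.y << 32) | (uint64_t)v.x;
}

// Hypothetical standalone kernel showing the caching pattern of this
// commit: 25 texture reads per pixel instead of 25 per disparity.
__global__ void census_cost_sketch(cudaTextureObject_t censusL,
                                   cudaTextureObject_t censusR,
                                   uint16_t *cost,  // [height][width][ds]
                                   int width, int height, int ds) {
	int u = blockIdx.x * blockDim.x + threadIdx.x;
	int v = blockIdx.y * blockDim.y + threadIdx.y;
	if (u < 2 || v < 2 || u >= width - 2 || v >= height - 2) return;

	int maxdisp = ds;
	if (u + maxdisp >= width) maxdisp = width - u;  // clamp as in the commit

	// Fill the local 5x5 cache once; it is invariant across disparities.
	uint64_t cacheR[5][5];
	for (int m = -2; m <= 2; m++)
		for (int n = -2; n <= 2; n++)
			cacheR[m+2][n+2] = uint2asull(tex2D<uint2>(censusR, u+n, v+m));

	// Sweep disparities against the cached window.
	for (int d = 0; d < maxdisp; d++) {
		uint32_t hamming = 0;
		for (int m = -2; m <= 2; m++)
			for (int n = -2; n <= 2; n++) {
				uint64_t l = uint2asull(tex2D<uint2>(censusL, u+n+d, v+m));
				hamming += __popcll(cacheR[m+2][n+2] ^ l);
			}
		cost[((size_t)v * width + u) * ds + d] = (uint16_t)hamming;
	}
}

The cache arrays live in per-thread local memory (likely promoted to registers once the small loops are unrolled), which is why the in-diff TODO still asks whether shared memory would help: neighbouring threads in a block read overlapping windows, so a cooperative shared-memory load could remove the remaining redundant fetches.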