Skip to content
Snippets Groups Projects
Commit 9a28d56d authored by Nicolas Pope
Browse files

Speed boost without quality loss through local caching

parent 4376f7d4
No related branches found
No related tags found
No related merge requests found
...@@ -100,20 +100,25 @@ __global__ void disp_kernel(float *disp_l, float *disp_r, ...@@ -100,20 +100,25 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
size_t ds) { size_t ds) {
//extern __shared__ uint64_t cache[]; //extern __shared__ uint64_t cache[];
const int gamma = 10; const int gamma = 1;
size_t u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2; int u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2;
size_t v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2; int v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2;
size_t v_end = v_start + ROWSperTHREAD; int v_end = v_start + ROWSperTHREAD;
int maxdisp = ds;
// Local cache
uint64_t l_cache_l1[5][5];
uint64_t l_cache_l2[5][5];
// Prepare the cache load // Prepare the cache load
//const int cache_thread_width = (BLOCK_W+ds / BLOCK_W + RADIUS2*2 + 1)*2; //const int cache_thread_width = (BLOCK_W+ds / BLOCK_W + RADIUS2*2 + 1)*2;
//uint64_t *cache_ptr = cache + (threadIdx.x * cache_thread_width); //uint64_t *cache_ptr = cache + (threadIdx.x * cache_thread_width);
if (v_end >= height) v_end = height; if (v_end >= height) v_end = height;
//if (u >= width-ds) return; if (u+maxdisp >= width) maxdisp = width-u;
for (size_t v=v_start; v<v_end; v++) { for (int v=v_start; v<v_end; v++) {
/*const int cache_start = v*width*2 + cache_thread_width*blockIdx.x; /*const int cache_start = v*width*2 + cache_thread_width*blockIdx.x;
for (int i=0; i<cache_thread_width; i+=2) { for (int i=0; i<cache_thread_width; i+=2) {
cache_ptr[i] = census[cache_start+i]; cache_ptr[i] = census[cache_start+i];
...@@ -122,6 +127,15 @@ __global__ void disp_kernel(float *disp_l, float *disp_r, ...@@ -122,6 +127,15 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
__syncthreads();*/ __syncthreads();*/
// Fill local cache for window 5x5
// TODO Use shared memory?
for (int m=-2; m<=2; m++) {
for (int n=-2; n<=2; n++) {
l_cache_l2[m+2][n+2] = uint2asull(tex2D<uint2>(censusL,u+n,v+m));
l_cache_l1[m+2][n+2] = uint2asull(tex2D<uint2>(censusR,u+n,v+m));
}
}
uint16_t last_ham1 = 65535; uint16_t last_ham1 = 65535;
uint16_t last_ham2 = 65535; uint16_t last_ham2 = 65535;
uint16_t min_disp1 = 65535; uint16_t min_disp1 = 65535;
...@@ -136,7 +150,7 @@ __global__ void disp_kernel(float *disp_l, float *disp_r, ...@@ -136,7 +150,7 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
int dix2 = 0; int dix2 = 0;
// TODO Use prediction textures to narrow range // TODO Use prediction textures to narrow range
for (size_t d=0; d<ds; d++) { for (int d=0; d<maxdisp; d++) {
uint16_t hamming1 = 0; uint16_t hamming1 = 0;
uint16_t hamming2 = 0; uint16_t hamming2 = 0;
...@@ -146,13 +160,11 @@ __global__ void disp_kernel(float *disp_l, float *disp_r, ...@@ -146,13 +160,11 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
const auto v_ = (v + m); const auto v_ = (v + m);
for (int n=-2; n<=2; n++) { for (int n=-2; n<=2; n++) {
const auto u_ = u + n; const auto u_ = u + n;
auto l1 = l_cache_l1[m+2][n+2];
auto l2 = uint2asull(tex2D<uint2>(censusL,u_,v_)); auto l2 = l_cache_l2[m+2][n+2];
auto l1 = uint2asull(tex2D<uint2>(censusR,u_,v_));
// TODO Somehow might use shared memory
auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_)); auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_));
auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_)); auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_));
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment