diff --git a/cv-node/src/algorithms/rtcensus.cu b/cv-node/src/algorithms/rtcensus.cu
index 6d48a392a9bd78bd2041d92447c77bd693064e3a..9adab3cc1f55b259d9c7ed1e7bff771b090c4aad 100644
--- a/cv-node/src/algorithms/rtcensus.cu
+++ b/cv-node/src/algorithms/rtcensus.cu
@@ -100,20 +100,25 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		size_t ds) {	
 	//extern __shared__ uint64_t cache[];
 
-	const int gamma = 10;
+	const int gamma = 1;
 	
-	size_t u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2;
-	size_t v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2;
-	size_t v_end = v_start + ROWSperTHREAD;
+	int u = (blockIdx.x * BLOCK_W) + threadIdx.x + RADIUS2;
+	int v_start = (blockIdx.y * ROWSperTHREAD) + RADIUS2;
+	int v_end = v_start + ROWSperTHREAD;
+	int maxdisp = ds;
+	
+	// Local cache
+	uint64_t l_cache_l1[5][5];
+	uint64_t l_cache_l2[5][5];
 
 	// Prepare the cache load
 	//const int cache_thread_width = (BLOCK_W+ds / BLOCK_W + RADIUS2*2 + 1)*2;
 	//uint64_t *cache_ptr = cache + (threadIdx.x * cache_thread_width);
 	
 	if (v_end >= height) v_end = height;
-	//if (u >= width-ds) return;
+	if (u+maxdisp >= width) maxdisp = width-u;
 	
-	for (size_t v=v_start; v<v_end; v++) {
+	for (int v=v_start; v<v_end; v++) {
 		/*const int cache_start = v*width*2 + cache_thread_width*blockIdx.x;
 		for (int i=0; i<cache_thread_width; i+=2) {
 			cache_ptr[i] = census[cache_start+i];
@@ -122,6 +127,15 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		
 		__syncthreads();*/
 		
+		// Fill local cache for window 5x5
+		// TODO Use shared memory?
+		for (int m=-2; m<=2; m++) {
+			for (int n=-2; n<=2; n++) {
+				l_cache_l2[m+2][n+2] = uint2asull(tex2D<uint2>(censusL,u+n,v+m));
+				l_cache_l1[m+2][n+2] = uint2asull(tex2D<uint2>(censusR,u+n,v+m));
+			}
+		}
+		
 		uint16_t last_ham1 = 65535;
 		uint16_t last_ham2 = 65535;
 		uint16_t min_disp1 = 65535;
@@ -136,7 +150,7 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 		int dix2 = 0;
 		
 		// TODO Use prediction textures to narrow range
-		for (size_t d=0; d<ds; d++) {
+		for (int d=0; d<maxdisp; d++) {
 			uint16_t hamming1 = 0;
 			uint16_t hamming2 = 0;
 			
@@ -146,13 +160,11 @@ __global__ void disp_kernel(float *disp_l, float *disp_r,
 				const auto v_ = (v + m);
 				for (int n=-2; n<=2; n++) {
 					const auto u_ = u + n;
-
-				
 					
-
-					auto l2 = uint2asull(tex2D<uint2>(censusL,u_,v_));
-					auto l1 = uint2asull(tex2D<uint2>(censusR,u_,v_));
+					auto l1 = l_cache_l1[m+2][n+2];
+					auto l2 = l_cache_l2[m+2][n+2];
 					
+					// TODO Somehow might use shared memory
 					auto r1 = uint2asull(tex2D<uint2>(censusL, u_+d, v_));
 					auto r2 = uint2asull(tex2D<uint2>(censusR, u_-d, v_));