diff --git a/components/operators/src/segmentation.cu b/components/operators/src/segmentation.cu
index 5cfd0e8b753c1d687e2fd8985fa88cfd90a8aa3a..e486b6ad9556736d381b36d9835f800e5ab44bb0 100644
--- a/components/operators/src/segmentation.cu
+++ b/components/operators/src/segmentation.cu
@@ -68,112 +68,104 @@ void ftl::cuda::edge_filter(
 	#endif
 }
 
-__global__ void bdmseg_kernel(
-		TextureObject<uchar4> colour_in,
-		TextureObject<float> gradient_in,
-		TextureObject<uchar4> seg_out) {
-
-	__shared__ uchar4 colours[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP];
-	//__shared__ float gradientY[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP];
-	//__shared__ int flags[WARPS_PER_BLOCK][WIDTH_PER_WARP+2][WIDTH_PER_WARP+2];
-		
-	const int bx = blockIdx.x*WIDTH_PER_WARP*BLOCK_WIDTH;
-	const int by = blockIdx.y*WIDTH_PER_WARP*BLOCK_WIDTH;
-	const int warp = threadIdx.y;
+__device__ float findMaxEnergy(const float (&gradients)[WIDTH_PER_WARP][WIDTH_PER_WARP], const float (&field)[WIDTH_PER_WARP][WIDTH_PER_WARP]) {
 	const int lane = threadIdx.x;
-	const int wx = (warp % BLOCK_WIDTH)*WIDTH_PER_WARP + bx;
-	const int wy = (warp / BLOCK_WIDTH)*WIDTH_PER_WARP + by;
-
+	float max_e = 0.0f;
 	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
 		int x = i % WIDTH_PER_WARP;
 		int y = i / WIDTH_PER_WARP;
 
-		if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue;
-		//if (x == WIDTH_PER_WARP-1 || y == WIDTH_PER_WARP-1) continue;
-
-		// Iteratively merge pixels until division is approx 2. Reduce threshold
-		// over the entire block to achieve merge, don't just take local min diff
-		// to merge?
+		float f = field[y][x];
+		float e = f * gradients[y][x];
+		max_e = (f < 1.0f) ? max(max_e, e) : max_e;
+	}
+	max_e = warpMax(max_e);
+	return max_e;
+}
 
-		// Start at max gradient? Or some percentage of it and divide.
-		// Merge if gradient is less than max. Merge left, merge up?
-		// Iterate until no further merges take place.
+__device__ void updateField(const float (&gradients)[WIDTH_PER_WARP][WIDTH_PER_WARP], float (&field)[WIDTH_PER_WARP][WIDTH_PER_WARP], float me) {
+	const int lane = threadIdx.x;
 
-		// Succeed if 3 or less segments and each segment has a certain number of
-		// pixels touching a boundary.
+	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
+		int x = i % WIDTH_PER_WARP;
+		int y = i / WIDTH_PER_WARP;
 
-		// Perhaps a 9x9 is better? If merge left and up is used.
+		float e = field[y][x] * gradients[y][x];
+		if (e >= me*0.99f) field[y][x] = 1.0f;
+	}
 
-		// Segment count starts at max and is subtracted when a merge occurs.
+	__syncwarp();
 
-		//float pgrad = gradient_in(wx+x,wy+y);
-		//gradient[warp][y][x] = pgrad;
-		//gradientY[warp][y][x] = pgrady;
+	float newfields[PIXELS_PER_THREAD];
+	int j=0;
+	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
+		int x = i % WIDTH_PER_WARP;
+		int y = i / WIDTH_PER_WARP;
 
-		// Find max border gradient
-		//if (x == 0 || y == 0 || x == WIDTH_PER_WARP-1 || y == WIDTH_PER_WARP-1) grad = max(grad, pgrad);
+		float oldfield = field[y][x];
+		float newfield = 0.0f;
 
-		colours[warp][y][x] = colour_in(wx+x, wy+y);
+		for (int v=max(y-1,0); v<=min(y+1,WIDTH_PER_WARP-1); ++v) {
+			for (int u=max(x-1,0); u<=min(x+1,WIDTH_PER_WARP-1); ++u) {
+				float f = oldfield * field[v][u];
+				newfield = max(newfield, f);
+			}
+		}
 
-		seg_out(wx+x,wy+y) = make_uchar4(0,0,0,0);
+		newfields[j++] = newfield;
 	}
-
-	//grad = warpMax(grad);
-	//if (grad < 20.0f) return;
-
-	float alpha = 0.0f;
-	float max_sum = 0.0f;
-	float best_alpha = -1.0f;
-	float grad_total = 0.0f;
-
-	while (alpha < 3.14f) {
-		float grad_sum = 0.0f;
 	
-		for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
-			int x = i % (WIDTH_PER_WARP);
-			int y = i / (WIDTH_PER_WARP);
-			if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue;
+	__syncwarp();
 
-			uchar4 mycolour = colours[warp][y][x];
-
-			float xf = (float)x - 4.0f;
-			float yf = (float)y - 4.0f;
+	j=0;
+	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
+		int x = i % WIDTH_PER_WARP;
+		int y = i / WIDTH_PER_WARP;
+		field[y][x] = newfields[j++];
+	}
 
-			float base = (xf*xf + yf*yf);
-			float G_alpha_ij = (base < 0.00001f) ? 0.0f : (xf*__cosf(alpha) + yf*__sinf(alpha));
-			float r = ((float)mycolour.x / 256.0f) * G_alpha_ij;
+	__syncwarp();
+}
 
-			//float g = (float)mycolour.y * G_alpha_ij;
-			//float b = (float)mycolour.z * G_alpha_ij;
+__global__ void bdmseg_kernel(
+		TextureObject<uchar4> colour_in,
+		TextureObject<float> gradient_in,
+		TextureObject<uchar4> seg_out) {
 
-			grad_sum += r;
-		}
+	__shared__ float gradients[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP];
+	__shared__ float field[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP];
+	//__shared__ int flags[WARPS_PER_BLOCK][WIDTH_PER_WARP+2][WIDTH_PER_WARP+2];
+		
+	const int bx = blockIdx.x*WIDTH_PER_WARP*BLOCK_WIDTH;
+	const int by = blockIdx.y*WIDTH_PER_WARP*BLOCK_WIDTH;
+	const int warp = threadIdx.y;
+	const int lane = threadIdx.x;
+	const int wx = (warp % BLOCK_WIDTH)*WIDTH_PER_WARP + bx;
+	const int wy = (warp / BLOCK_WIDTH)*WIDTH_PER_WARP + by;
 
-		grad_sum = warpSum(grad_sum);
-		grad_total += grad_sum*grad_sum;
+	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
+		int x = i % WIDTH_PER_WARP;
+		int y = i / WIDTH_PER_WARP;
 
-		if (fabs(grad_sum) > max_sum) {
-			max_sum = fabs(grad_sum);
-			best_alpha = alpha;
-		}
+		if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue;
 
-		alpha += 0.3925f;
+		gradients[warp][y][x] = gradient_in.tex2D(wx+x, wy+y);
+		field[warp][y][x] = 0.1f;  // All locations have equal chance
+		seg_out(wx+x,wy+y) = make_uchar4(0,0,0,0);
 	}
 
-	grad_total = sqrt(grad_total);
+	for (int j=0; j<8; ++j) {
+		float e = findMaxEnergy(gradients[warp], field[warp]);
+		updateField(gradients[warp], field[warp], e);
+	}
 
 	for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) {
 		int x = i % WIDTH_PER_WARP;
 		int y = i / WIDTH_PER_WARP;
 		if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue;
 
-		float xf = (float)x - 4.0f;
-		float yf = (float)y - 4.0f;
-		float base = (xf*xf + yf*yf);
-		float G_alpha_ij = (base < 0.00001f) ? 0.0f : (xf*__cosf(best_alpha) + yf*__sinf(best_alpha));
-
-		seg_out(wx+x,wy+y) = make_uchar4(0,G_alpha_ij,0,0);
-		//seg_out(wx+x, wy+y) = make_uchar4(gradient[warp][y][x], 0, 0, 0);
+		//seg_out(wx+x,wy+y) = make_uchar4(0,G_alpha_ij,0,0);
+		seg_out(wx+x, wy+y) = make_uchar4(field[warp][y][x]*255.0f, 0, 0, 0);
 	}
 }