diff --git a/components/operators/src/segmentation.cu b/components/operators/src/segmentation.cu index 5cfd0e8b753c1d687e2fd8985fa88cfd90a8aa3a..e486b6ad9556736d381b36d9835f800e5ab44bb0 100644 --- a/components/operators/src/segmentation.cu +++ b/components/operators/src/segmentation.cu @@ -68,112 +68,104 @@ void ftl::cuda::edge_filter( #endif } -__global__ void bdmseg_kernel( - TextureObject<uchar4> colour_in, - TextureObject<float> gradient_in, - TextureObject<uchar4> seg_out) { - - __shared__ uchar4 colours[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP]; - //__shared__ float gradientY[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP]; - //__shared__ int flags[WARPS_PER_BLOCK][WIDTH_PER_WARP+2][WIDTH_PER_WARP+2]; - - const int bx = blockIdx.x*WIDTH_PER_WARP*BLOCK_WIDTH; - const int by = blockIdx.y*WIDTH_PER_WARP*BLOCK_WIDTH; - const int warp = threadIdx.y; +__device__ float findMaxEnergy(const float (&gradients)[WIDTH_PER_WARP][WIDTH_PER_WARP], const float (&field)[WIDTH_PER_WARP][WIDTH_PER_WARP]) { const int lane = threadIdx.x; - const int wx = (warp % BLOCK_WIDTH)*WIDTH_PER_WARP + bx; - const int wy = (warp / BLOCK_WIDTH)*WIDTH_PER_WARP + by; - + float max_e = 0.0f; for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { int x = i % WIDTH_PER_WARP; int y = i / WIDTH_PER_WARP; - if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue; - //if (x == WIDTH_PER_WARP-1 || y == WIDTH_PER_WARP-1) continue; - - // Iteratively merge pixels until division is approx 2. Reduce threshold - // over the entire block to achieve merge, don't just take local min diff - // to merge? + float f = field[y][x]; + float e = f * gradients[y][x]; + max_e = (f < 1.0f) ? max(max_e, e) : max_e; + } + max_e = warpMax(max_e); + return max_e; +} - // Start at max gradient? Or some percentage of it and divide. - // Merge if gradient is less than max. Merge left, merge up? - // Iterate until no further merges take place. +__device__ void updateField(const float (&gradients)[WIDTH_PER_WARP][WIDTH_PER_WARP], float (&field)[WIDTH_PER_WARP][WIDTH_PER_WARP], float me) { + const int lane = threadIdx.x; - // Succeed if 3 or less segments and each segment has a certain number of - // pixels touching a boundary. + for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { + int x = i % WIDTH_PER_WARP; + int y = i / WIDTH_PER_WARP; - // Perhaps a 9x9 is better? If merge left and up is used. + float e = field[y][x] * gradients[y][x]; + if (e >= me*0.99f) field[y][x] = 1.0f; + } - // Segment count starts at max and is subtracted when a merge occurs. + __syncwarp(); - //float pgrad = gradient_in(wx+x,wy+y); - //gradient[warp][y][x] = pgrad; - //gradientY[warp][y][x] = pgrady; + float newfields[PIXELS_PER_THREAD]; + int j=0; + for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { + int x = i % WIDTH_PER_WARP; + int y = i / WIDTH_PER_WARP; - // Find max border gradient - //if (x == 0 || y == 0 || x == WIDTH_PER_WARP-1 || y == WIDTH_PER_WARP-1) grad = max(grad, pgrad); + float oldfield = field[y][x]; + float newfield = 0.0f; - colours[warp][y][x] = colour_in(wx+x, wy+y); + for (int v=max(y-1,0); v<=min(y+1,WIDTH_PER_WARP-1); ++v) { + for (int u=max(x-1,0); u<=min(x+1,WIDTH_PER_WARP-1); ++u) { + float f = oldfield * field[v][u]; + newfield = max(newfield, f); + } + } - seg_out(wx+x,wy+y) = make_uchar4(0,0,0,0); + newfields[j++] = newfield; } - - //grad = warpMax(grad); - //if (grad < 20.0f) return; - - float alpha = 0.0f; - float max_sum = 0.0f; - float best_alpha = -1.0f; - float grad_total = 0.0f; - - while (alpha < 3.14f) { - float grad_sum = 0.0f; - for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { - int x = i % (WIDTH_PER_WARP); - int y = i / (WIDTH_PER_WARP); - if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue; + __syncwarp(); - uchar4 mycolour = colours[warp][y][x]; - - float xf = (float)x - 4.0f; - float yf = (float)y - 4.0f; + j=0; + for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { + int x = i % WIDTH_PER_WARP; + int y = i / WIDTH_PER_WARP; + field[y][x] = newfields[j++]; + } - float base = (xf*xf + yf*yf); - float G_alpha_ij = (base < 0.00001f) ? 0.0f : (xf*__cosf(alpha) + yf*__sinf(alpha)); - float r = ((float)mycolour.x / 256.0f) * G_alpha_ij; + __syncwarp(); +} - //float g = (float)mycolour.y * G_alpha_ij; - //float b = (float)mycolour.z * G_alpha_ij; +__global__ void bdmseg_kernel( + TextureObject<uchar4> colour_in, + TextureObject<float> gradient_in, + TextureObject<uchar4> seg_out) { - grad_sum += r; - } + __shared__ float gradients[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP]; + __shared__ float field[WARPS_PER_BLOCK][WIDTH_PER_WARP][WIDTH_PER_WARP]; + //__shared__ int flags[WARPS_PER_BLOCK][WIDTH_PER_WARP+2][WIDTH_PER_WARP+2]; + + const int bx = blockIdx.x*WIDTH_PER_WARP*BLOCK_WIDTH; + const int by = blockIdx.y*WIDTH_PER_WARP*BLOCK_WIDTH; + const int warp = threadIdx.y; + const int lane = threadIdx.x; + const int wx = (warp % BLOCK_WIDTH)*WIDTH_PER_WARP + bx; + const int wy = (warp / BLOCK_WIDTH)*WIDTH_PER_WARP + by; - grad_sum = warpSum(grad_sum); - grad_total += grad_sum*grad_sum; + for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { + int x = i % WIDTH_PER_WARP; + int y = i / WIDTH_PER_WARP; - if (fabs(grad_sum) > max_sum) { - max_sum = fabs(grad_sum); - best_alpha = alpha; - } + if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue; - alpha += 0.3925f; + gradients[warp][y][x] = gradient_in.tex2D(wx+x, wy+y); + field[warp][y][x] = 0.1f; // All locations have equal chance + seg_out(wx+x,wy+y) = make_uchar4(0,0,0,0); } - grad_total = sqrt(grad_total); + for (int j=0; j<8; ++j) { + float e = findMaxEnergy(gradients[warp], field[warp]); + updateField(gradients[warp], field[warp], e); + } for (int i=lane; i<BLOCK_COUNT; i+=WARP_SIZE) { int x = i % WIDTH_PER_WARP; int y = i / WIDTH_PER_WARP; if (wx+x >= colour_in.width() || wy+y >= colour_in.height()) continue; - float xf = (float)x - 4.0f; - float yf = (float)y - 4.0f; - float base = (xf*xf + yf*yf); - float G_alpha_ij = (base < 0.00001f) ? 0.0f : (xf*__cosf(best_alpha) + yf*__sinf(best_alpha)); - - seg_out(wx+x,wy+y) = make_uchar4(0,G_alpha_ij,0,0); - //seg_out(wx+x, wy+y) = make_uchar4(gradient[warp][y][x], 0, 0, 0); + //seg_out(wx+x,wy+y) = make_uchar4(0,G_alpha_ij,0,0); + seg_out(wx+x, wy+y) = make_uchar4(field[warp][y][x]*255.0f, 0, 0, 0); } }