Warp optimise correspondence search
Compare changes
@@ -8,6 +9,8 @@ using ftl::cuda::Mask;
@@ -8,6 +9,8 @@ using ftl::cuda::Mask;
// Forward declaration of the device-side weight function; it takes the MvMLS
// parameter block by const reference plus two float weights and returns a float.
// An inline `weightFunction<5>` specialisation is referenced by the next hunk header.
// NOTE(review): the `template <...>` header this declaration presumably sits
// under is not visible in this excerpt — confirm against the full file.
__device__ float weightFunction(const ftl::cuda::MvMLSParams &params, float dweight, float cweight);
@@ -42,6 +45,45 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params,
@@ -42,6 +45,45 @@ __device__ inline float weightFunction<5>(const ftl::cuda::MvMLSParams &params,
@@ -51,16 +93,14 @@ __global__ void corresponding_point_kernel(
@@ -51,16 +93,14 @@ __global__ void corresponding_point_kernel(
@@ -73,52 +113,52 @@ __global__ void corresponding_point_kernel(
@@ -73,52 +113,52 @@ __global__ void corresponding_point_kernel(
const float3 worldPos = world1 + j * rayStep_world; //(pose1 * cam1.screenToCam(x, y, depth_adjust));
@@ -126,18 +166,23 @@ __global__ void corresponding_point_kernel(
@@ -126,18 +166,23 @@ __global__ void corresponding_point_kernel(
@@ -149,7 +194,7 @@ __global__ void corresponding_point_kernel(
@@ -149,7 +194,7 @@ __global__ void corresponding_point_kernel(
@@ -169,25 +214,26 @@ void ftl::cuda::correspondence(
@@ -169,25 +214,26 @@ void ftl::cuda::correspondence(
const dim3 gridSize((d1.width() + T_PER_BLOCK - 1)/T_PER_BLOCK, (d1.height() + T_PER_BLOCK - 1)/T_PER_BLOCK);
case 0: corresponding_point_kernel<16,0><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
case 1: corresponding_point_kernel<16,1><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
case 2: corresponding_point_kernel<16,2><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
case 3: corresponding_point_kernel<16,3><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
case 4: corresponding_point_kernel<16,4><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;
case 5: corresponding_point_kernel<16,5><<<gridSize, blockSize, 0, stream>>>(d1, d2, c1, c2, screen, conf, mask, pose1, pose1_inv, pose2, cam1, cam2, params); break;