WIP add lossy depth encode again

40b0461f · Nicolas Pope · e14006b7 · 40b0461f · 40b0461f · 40b0461f
Commit 40b0461f authored 5 years ago by Nicolas Pope
--- a/components/codecs/include/ftl/codecs/depth_convert_cuda.hpp
+++ b/components/codecs/include/ftl/codecs/depth_convert_cuda.hpp
@@ -8,6 +8,8 @@ namespace cuda {
 void depth_to_vuya(const cv::cuda::PtrStepSz<float> &depth, const cv::cuda::PtrStepSz<uchar4> &rgba, float maxdepth, cv::cuda::Stream &stream);
+void depth_to_nv12(const cv::cuda::PtrStepSz<float> &depth, uchar* luminance, uchar* chroma, int pitch, float maxdepth, cv::cuda::Stream &stream);
 void vuya_to_depth(const cv::cuda::PtrStepSz<float> &depth, const cv::cuda::PtrStepSz<ushort4> &rgba, float maxdepth, cv::cuda::Stream &stream);
 void vuya_to_depth(const cv::cuda::PtrStepSz<float> &depth, const cv::cuda::PtrStepSz<ushort> &luminance, const cv::cuda::PtrStepSz<ushort> &chroma, float maxdepth, cv::cuda::Stream &stream);

--- a/components/codecs/src/depth_convert.cu
+++ b/components/codecs/src/depth_convert.cu
@@ -25,13 +25,8 @@ __device__ inline float clampC(float v, float t=255.0f) {
  // Assumes 8 bit output channels and 14bit depth
  static constexpr float P = (2.0f * 256.0f) / 16384.0f;
-__global__ void depth_to_vuya_kernel(cv::cuda::PtrStepSz<float> depth, cv::cuda::PtrStepSz<uchar4> rgba, float maxdepth) {
+ __device__ inline float3 depth2yuv(float depth, float maxdepth) {
-	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	float d = max(0.0f,depth);
-	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-	if (x < depth.cols && y < depth.rows) {
-		//float d = max(0.0f,min(maxdepth,depth(y,x)));
-		float d = max(0.0f,depth(y,x));
 	if (d >= maxdepth) d = 0.0f;
 	float L = d / maxdepth;
 	const float p = P;
@@ -42,7 +37,16 @@ __global__ void depth_to_vuya_kernel(cv::cuda::PtrStepSz<float> depth, cv::cuda:
 	float Hb1 = fmodf(((L - (p/4.0f)) / (p/2.0f)), 2.0f);
 	float Hb = (Hb1 <= 1.0f) ? Hb1 : 2.0f - Hb1;
-        rgba(y,x) = make_uchar4(Hb*255.0f,Ha*255.0f,L*255.0f, 0.0f);
+	return {L, Ha, Hb};
+ }
+__global__ void depth_to_vuya_kernel(cv::cuda::PtrStepSz<float> depth, cv::cuda::PtrStepSz<uchar4> rgba, float maxdepth) {
+	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+	if (x < depth.cols && y < depth.rows) {
+		float3 yuv = depth2yuv(depth(y,x), maxdepth);
+        rgba(y,x) = make_uchar4(yuv.z*255.0f,yuv.y*255.0f,yuv.x*255.0f, 0.0f);
 	}
 }
@@ -54,6 +58,44 @@ void ftl::cuda::depth_to_vuya(const cv::cuda::PtrStepSz<float> &depth, const cv:
 	cudaSafeCall( cudaGetLastError() );
 }
+// Planar 8bit version
+__global__ void depth_to_nv12_kernel(cv::cuda::PtrStepSz<float> depth, uchar* luminance, uchar* chroma, int pitch, float maxdepth) {
+	const unsigned int x = (blockIdx.x*blockDim.x + threadIdx.x) * 2;
+	const unsigned int y = (blockIdx.y*blockDim.y + threadIdx.y) * 2;
+	if (x < depth.cols && y < depth.rows) {
+		float3 yuv1 = depth2yuv(depth(y,x), maxdepth);
+		float3 yuv2 = depth2yuv(depth(y,x+1), maxdepth);
+		float3 yuv3 = depth2yuv(depth(y+1,x), maxdepth);
+		float3 yuv4 = depth2yuv(depth(y+1,x+1), maxdepth);
+		// TODO: Something better than just average!
+		// Bad ones are discarded anyway...
+		float Ha = (yuv1.y+yuv2.y+yuv3.y+yuv4.y) / 4.0f * 255.0f;
+		float Hb = (yuv1.z+yuv2.z+yuv3.z+yuv4.z) / 4.0f * 255.0f;
+		luminance[y*pitch+x] = yuv1.x*255.0f;
+		luminance[y*pitch+x+1] = yuv2.x*255.0f;
+		luminance[(y+1)*pitch+x] = yuv3.x*255.0f;
+		luminance[(y+1)*pitch+x+1] = yuv4.x*255.0f;
+		chroma[(y/2)*pitch+x] = Hb;
+		chroma[(y/2)*pitch+x+1] = Ha;
+	}
+}
+void ftl::cuda::depth_to_nv12(const cv::cuda::PtrStepSz<float> &depth, uchar* luminance, uchar* chroma, int pitch, float maxdepth, cv::cuda::Stream &stream) {
+	const dim3 gridSize((depth.cols/2 + T_PER_BLOCK - 1)/T_PER_BLOCK, (depth.rows/2 + T_PER_BLOCK - 1)/T_PER_BLOCK);
+    const dim3 blockSize(T_PER_BLOCK, T_PER_BLOCK);
+	depth_to_nv12_kernel<<<gridSize, blockSize, 0, cv::cuda::StreamAccessor::getStream(stream)>>>(depth, luminance, chroma, pitch, maxdepth);
+	cudaSafeCall( cudaGetLastError() );
+}
+// =============================================================================
 // Decoding
 /*

--- a/components/codecs/src/nvpipe_encoder.cpp
+++ b/components/codecs/src/nvpipe_encoder.cpp
@@ -257,11 +257,14 @@ bool NvPipeEncoder::encode(const cv::cuda::GpuMat &in, ftl::codecs::Packet &pkt)
 	//pkt.flags = NvPipeEncoder::kFlagRGB | NvPipeEncoder::kFlagMappedDepth;
 	const NvEncInputFrame* f = nvenc_->GetNextInputFrame();
+	bool islossless = pkt.codec == codec_t::H264_LOSSLESS || pkt.codec == codec_t::HEVC_LOSSLESS;
+	//auto cvstream = cv::cuda::StreamAccessor::wrapStream(stream_);
 	if (!(pkt.flags & kFlagFloat)) {
 		cudaSafeCall(cudaMemcpy2D(f->inputPtr, f->pitch, in.data, in.step, width * 4, height, cudaMemcpyDeviceToDevice));
-	} else {
+	} else if (!islossless) {
 		// Do colour conversion here.
+		ftl::cuda::depth_to_nv12(in, (uchar*)f->inputPtr, ((uchar*)f->inputPtr)+(height*f->pitch), f->pitch, 16.0f, cv::cuda::Stream::Null());
 	}
 	// TODO: Use page locked memory?
@@ -338,18 +341,6 @@ bool NvPipeEncoder::_createEncoder(const ftl::codecs::Packet &pkt, format_t fmt)
 	bool ish264 = pkt.codec == codec_t::H264 || pkt.codec == codec_t::H264_LOSSLESS;
 	bool ishevc = !ish264;
-	/*if (nvenc_) NvPipe_Destroy(nvenc_);
-	const int fps = 1000/ftl::timer::getInterval();
-	nvenc_ = NvPipe_CreateEncoder(
-		format_,
-		codec_,
-		compression_,
-		bitrate,
-		fps,				// FPS
-		ftl::codecs::getWidth(pkt.definition),	// Output Width
-		ftl::codecs::getHeight(pkt.definition)	// Output Height
-	);*/
 	// Ensure we have a CUDA context
 	cudaSafeCall(cudaDeviceSynchronize());
 	CUcontext cudaContext;
@@ -436,6 +427,9 @@ bool NvPipeEncoder::_createEncoder(const ftl::codecs::Packet &pkt, format_t fmt)
 		return false;
 	} else {
 		LOG(INFO) << "NvPipe encoder created";
+		// TODO: SetIOCudaStreams
 		return true;
 	}
 }