Hello,
We have a video encode/decode system built using the Nvidia Video Codec SDK. We utilize the native API within our c++ application.
The current issue we are trying to solve is speeding up the time it takes for our encoder to reconfigure for different resolutions, e.g. 1080p → 2160p.
Historically, we have destroyed our encoder instance (based upon NvVideoEncoder.h) and created a new one. From our tests, this can take up to 4 seconds when going up to 2160p.
I have managed to successfully re-use an existing instance using some information provided here: [MMAPI R28.2] deinitPlane() of NvVideoEncoder
This works, and it’s a huge improvement - often under 100ms for reconfiguration.
However, I am witnessing a memory leak using this new method, and I cannot see where this is occurring/what I am doing incorrectly. Using jtop, GPU memory increases significantly every time the resolution is changed (it does decrease, but not by the same amount as it increases).
From what I can see, and I hope this is enough, this is the relevant code
Initialize() method - this is called to create the encoder and configure its properties
// Creates (or re-uses) the V4L2 encoder instance and configures it for the
// given resolution, sets up buffers on both planes, starts streaming, and
// queues the first frame.
//
// Parameters:
//   width/height  - target encode resolution.
//   dataPointer   - CPU-side YUV frame, used when no DMA fd is supplied.
//   dmaFd         - source DMA buffer fd (> 0 means blit via NvBufferTransform).
//   isLowLatency  - forwarded to set_defaults() to select latency-oriented params.
void HevcHardwareEncoder::Initialize(int width, int height, uint8_t* dataPointer, int dmaFd, bool isLowLatency) {
    const auto start = GetNowMs();
    int ret = 0;
    mCurrentWidth = width;
    mCurrentHeight = height;
    _isLowLatency = isLowLatency;
    ctx.width = width;
    ctx.height = height;
    set_defaults(&ctx, params, width, height, isLowLatency);
    // Re-apply the dimensions in case set_defaults() overwrote them.
    ctx.width = width;
    ctx.height = height;
    if (ctx.encoder_pixfmt == V4L2_PIX_FMT_H265) {
        // Both dimensions must be at least 144 for H.265 (check rejects < 144,
        // so 144 itself is valid — message fixed from "> 144" to ">= 144").
        if ((ctx.width < 144 || ctx.height < 144)) HILogger::Fatal("Height/Width should be >= 144 for H.265: ", ret);
    }
    const auto encStart = GetNowMs();
    // Re-use the existing encoder instance on reconfiguration; only create one
    // on first use. This avoids the multi-second v4l2_close()/v4l2_open() cycle.
    if (!ctx.enc) {
        ctx.enc = NvVideoEncoder::createVideoEncoder(_name.c_str());
    }
    if (!ctx.enc) HILogger::Fatal("Failed to create encoder\n");
    // Capture plane (encoded bitstream) format is set before the output plane.
    ret = ctx.enc->setCapturePlaneFormat(ctx.encoder_pixfmt, ctx.width, ctx.height, ctx.width * ctx.height * 2);
    if (ret < 0) HILogger::Fatal("Could not set capture plane format: ", ret);
    ret = ctx.enc->setOutputPlaneFormat(ctx.raw_pixfmt, ctx.width, ctx.height);
    if (ret < 0) HILogger::Fatal("Could not set output plane format: ", ret);
    const auto encEnd = GetNowMs();
    // lots more specific encode parameters, omitted for brevity
    // ...
    const auto memMap = GetNowMs();
    // Allocate the YUV (output) plane buffers with the configured memory type.
    switch (ctx.output_memory_type) {
        case V4L2_MEMORY_MMAP:
            ret = ctx.enc->yuv_plane.setupPlane(V4L2_MEMORY_MMAP, NUM_ENCODE_BUFFERS, true, false);
            if (ret < 0) HILogger::Warn("Could not setup output plane: ", ret);
            break;
        case V4L2_MEMORY_USERPTR:
            ret = ctx.enc->yuv_plane.setupPlane(V4L2_MEMORY_USERPTR, NUM_ENCODE_BUFFERS, false, true);
            if (ret < 0) HILogger::Warn("Could not setup output plane: ", ret);
            break;
        case V4L2_MEMORY_DMABUF:
            ret = setup_output_dmabuf(&ctx, NUM_ENCODE_BUFFERS);
            if (ret < 0) HILogger::Warn("Could not setup output plane: ", ret);
            break;
        default:
            // FIX: warn unconditionally — the previous `if (ret < 0)` gate tested
            // a stale value from an earlier call and could hide this case.
            HILogger::Warn("Not a valid plane");
    }
    // Bitstream (capture) plane always uses MMAP buffers.
    ret = ctx.enc->bitstream_plane.setupPlane(V4L2_MEMORY_MMAP, NUM_ENCODE_BUFFERS, true, false);
    if (ret < 0) HILogger::Warn("Could not setup capture plane: ", ret);
    ret = ctx.enc->subscribeEvent(V4L2_EVENT_EOS, 0, 0);
    if (ret < 0) HILogger::Warn("Could not subscribe EOS event: ", ret);
    // output plane STREAMON
    ret = ctx.enc->yuv_plane.setStreamStatus(true);
    if (ret < 0) HILogger::Warn("Error in output plane streamon: ", ret);
    // capture plane STREAMON
    ret = ctx.enc->bitstream_plane.setStreamStatus(true);
    if (ret < 0) HILogger::Warn("Error in capture plane streamon: ", ret);
    if (ctx.blocking_mode) {
        ctx.enc->bitstream_plane.setDQThreadCallback((void*)this, encoder_capture_plane_dq_callback);
        // startDQThread starts a thread internally which calls the
        // encoder_capture_plane_dq_callback whenever a buffer is dequeued
        // on the plane
        ctx.enc->bitstream_plane.startDQThread(&ctx);
    }
    // Enqueue all the empty bitstream plane buffers
    for (uint32_t i = 0; i < ctx.enc->bitstream_plane.getNumBuffers(); i++) {
        struct v4l2_buffer v4l2_buf;
        struct v4l2_plane planes[MAX_PLANES];
        memset(&v4l2_buf, 0, sizeof(v4l2_buf));
        memset(planes, 0, MAX_PLANES * sizeof(struct v4l2_plane));
        v4l2_buf.index = i;
        v4l2_buf.m.planes = planes;
        ret = ctx.enc->bitstream_plane.qBuffer(v4l2_buf, NULL);
        if (ret < 0) {
            HILogger::Fatal("Error while queueing buffer at capture plane");
            abort(&ctx);
            return;
        }
    }
    // Read video frame and queue all the yuv plane buffers
    for (uint32_t i = 0; i < ctx.enc->yuv_plane.getNumBuffers(); i++) {
        struct v4l2_buffer v4l2_buf;
        struct v4l2_plane planes[MAX_PLANES];
        NvBuffer* buffer = ctx.enc->yuv_plane.getNthBuffer(i);
        memset(&v4l2_buf, 0, sizeof(v4l2_buf));
        memset(planes, 0, MAX_PLANES * sizeof(struct v4l2_plane));
        v4l2_buf.index = i;
        v4l2_buf.m.planes = planes;
        if (ctx.output_memory_type == V4L2_MEMORY_DMABUF) {
            v4l2_buf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
            v4l2_buf.memory = V4L2_MEMORY_DMABUF;
            // Maps the DMA fds allocated in setup_output_dmabuf() into this
            // plane; paired with unmapOutputBuffers() in Finish().
            ret = ctx.enc->yuv_plane.mapOutputBuffers(v4l2_buf, ctx.output_plane_fd[i]);
            if (ret < 0) {
                HILogger::Fatal("Error while mapping buffer at output plane");
                abort(&ctx);
                return;
            }
        }
        bool success = true;
        if (dmaFd > 0) {
#if JETPACK_VAR == 4
            // Blit the caller's DMA frame into this encoder buffer.
            NvBufferTransformParams transform_params;
            memset(&transform_params, 0, sizeof(NvBufferTransformParams));
            if (NvBufferTransform(dmaFd, buffer->planes[0].fd, &transform_params) < 0) {
                HILogger::Debug("Failed to copy DMA frame during encoder initialisation");
                success = false;
            }
#endif
        } else {
            // CPU path: copy the raw frame from dataPointer into the NvBuffer.
            if (ConvertToBuffer(*buffer, dataPointer) < 0) {
                HILogger::Debug("Failed to read complete frame");
                success = false;
            }
        }
        if (success) {
            // FIX: use uint32_t to match n_planes and the sibling loops below
            // (was `int j`, a signed/unsigned comparison).
            for (uint32_t j = 0; j < buffer->n_planes; ++j) {
                NvBuffer::NvBufferPlane& plane = buffer->planes[j];
                // Setting this value prevents a segfault on getNumBuffers in NvV4l2ElementPlane?
                plane.bytesused = plane.fmt.stride * plane.fmt.height;
            }
        } else {
            // Queue an empty buffer; bytesused == 0 is treated as end-of-stream below.
            v4l2_buf.m.planes[0].m.userptr = 0;
            v4l2_buf.m.planes[0].bytesused = v4l2_buf.m.planes[1].bytesused = v4l2_buf.m.planes[2].bytesused = 0;
        }
        if (ctx.output_memory_type == V4L2_MEMORY_DMABUF || ctx.output_memory_type == V4L2_MEMORY_MMAP) {
            // Flush CPU-side writes to the surfaces before the hardware reads them.
            for (uint32_t j = 0; j < buffer->n_planes; j++) {
                SyncSurface(buffer->planes[j].fd, j, buffer);
            }
        }
        if (ctx.output_memory_type == V4L2_MEMORY_DMABUF) {
            for (uint32_t j = 0; j < buffer->n_planes; j++) {
                v4l2_buf.m.planes[j].bytesused = buffer->planes[j].bytesused;
            }
        }
        ret = ctx.enc->yuv_plane.qBuffer(v4l2_buf, NULL);
        if (ret < 0) {
            HILogger::Fatal("Error while queueing buffer at output plane");
            abort(&ctx);
            return;
        }
        if (v4l2_buf.m.planes[0].bytesused == 0) {
            // NOTE(review): this early return leaves mEncoderOn false and skips
            // forceIDR()/session creation — confirm that is intended for the
            // empty-frame case.
            HILogger::Info("Frame reading complete in Initialize()");
            return;
        }
        ctx.input_frames_queued_count++;
    }
    // force first frame to be used as an idr frame
    ctx.enc->forceIDR();
#if JETPACK_VAR == 4
    // NOTE(review): a session is created on every Initialize() and destroyed in
    // Finish() — confirm the two always alternate, or the old session leaks.
    _bufferSession = NvBufferSessionCreate();
    memset(&_syncObj, 0, sizeof(NvBufferSyncObj));
    _syncObj.use_outsyncobj = 1;
#endif
    mEncoderOn = true;
    const auto end = GetNowMs();
}
// Allocates num_buffers DMA buffers for the YUV (output) plane and records
// their fds in ctx->output_plane_fd[]. The color format is chosen from the
// encoder profile / lossless / extended-colorformat settings.
// Returns 0 on success, negative on failure.
static int setup_output_dmabuf(context_t* ctx, uint32_t num_buffers) {
    int fd;
    int ret = ctx->enc->yuv_plane.reqbufs(V4L2_MEMORY_DMABUF, num_buffers);
    if (ret) {
        HILogger::Fatal("reqbufs failed for output plane V4L2_MEMORY_DMABUF");
        return ret;
    }
    // NOTE(review): if JETPACK_VAR is neither 4 nor 5, `fd` is used
    // uninitialized below — confirm only these two variants are ever built.
    for (uint32_t i = 0; i < ctx->enc->yuv_plane.getNumBuffers(); i++) {
#if JETPACK_VAR == 5
        // FIX: zero-initialize — previously any field not explicitly assigned
        // was passed to the allocation API with an indeterminate value.
        NvBufSurf::NvCommonAllocateParams cParams = {};
        cParams.width = ctx->width;
        cParams.height = ctx->height;
        cParams.layout = NVBUF_LAYOUT_PITCH;
        if (ctx->enableLossless && ctx->encoder_pixfmt == V4L2_PIX_FMT_H264) {
            cParams.colorFormat = NVBUF_COLOR_FORMAT_YUV444;
        } else if (ctx->profile == V4L2_MPEG_VIDEO_H265_PROFILE_MAIN10) {
            cParams.colorFormat = NVBUF_COLOR_FORMAT_NV12_10LE;
        } else {
            cParams.colorFormat = ctx->enable_extended_colorformat ? NVBUF_COLOR_FORMAT_YUV420_ER : NVBUF_COLOR_FORMAT_YUV420;
        }
        cParams.memtag = NvBufSurfaceTag_VIDEO_ENC;
        cParams.memType = NVBUF_MEM_SURFACE_ARRAY;
        ret = NvBufSurf::NvAllocate(&cParams, 1, &fd);
#elif JETPACK_VAR == 4
        // FIX: zero-initialize all fields (see note above).
        NvBufferCreateParams cParams = {};
        cParams.width = ctx->width;
        cParams.height = ctx->height;
        cParams.layout = NvBufferLayout_Pitch;
        if (ctx->enableLossless && ctx->encoder_pixfmt == V4L2_PIX_FMT_H264) {
            cParams.colorFormat = NvBufferColorFormat_YUV444;
        } else if (ctx->profile == V4L2_MPEG_VIDEO_H265_PROFILE_MAIN10) {
            cParams.colorFormat = NvBufferColorFormat_NV12_10LE;
        } else {
            cParams.colorFormat = ctx->enable_extended_colorformat ? NvBufferColorFormat_YUV420_ER : NvBufferColorFormat_YUV420;
        }
        cParams.nvbuf_tag = NvBufferTag_VIDEO_ENC;
        cParams.payloadType = NvBufferPayload_SurfArray;
        ret = NvBufferCreateEx(&fd, &cParams);
#endif
        if (ret < 0) {
            HILogger::Fatal("Failed to create NvBuffer");
            return ret;
        }
        HILogger::Warn("Created ", fd);
        ctx->output_plane_fd[i] = fd;
    }
    return ret;
}
Finish() method - this is called when a resolution change is detected, then Initialize() is called again.
// Drains and shuts down the encoder's planes on a resolution change while
// keeping the underlying NvVideoEncoder instance alive, so Initialize() can
// re-use it and avoid the expensive v4l2_close()/v4l2_open() cycle.
void HevcHardwareEncoder::Finish() {
    if (!mEncoderOn) return;
    HILogger::Trace("Start closing encoder");
    // FIX: guard before any use — the original dereferenced ctx.enc (via
    // EncodeFrame/waitForDQThread) and only null-checked it afterwards.
    if (!ctx.enc) {
        mEncoderOn = false;
        return;
    }
    // Send EOS signal (null frame) so the capture-plane DQ thread drains and exits.
    EncodeFrame(nullptr, mCurrentWidth, mCurrentHeight);
    // NOTE(review): the DQ thread is only started when ctx.blocking_mode in
    // Initialize() — confirm waitForDQThread is safe when it never ran.
    ctx.enc->bitstream_plane.waitForDQThread(-1);
    HILogger::Trace("Finished waiting for DQThread");
    if (ctx.enc->isInError()) {
        HILogger::Warn("Encoder is in error on Close()");
    }
    if (ctx.output_memory_type == V4L2_MEMORY_DMABUF) {
        // Unmap and destroy every DMA buffer allocated in setup_output_dmabuf().
        for (uint32_t i = 0; i < ctx.enc->yuv_plane.getNumBuffers(); i++) {
            int ret = ctx.enc->yuv_plane.unmapOutputBuffers(i, ctx.output_plane_fd[i]);
            if (ret < 0) {
                HILogger::Warn("Error while unmapping buffer at output plane");
            }
#if JETPACK_VAR == 5
            ret = NvBufSurf::NvDestroy(ctx.output_plane_fd[i]);
#elif JETPACK_VAR == 4
            ret = NvBufferDestroy(ctx.output_plane_fd[i]);
#endif
            if (ret < 0) {
                HILogger::Warn("Failed to Destroy NvBuffer");
            }
            HILogger::Warn("Destroyed ", ctx.output_plane_fd[i]);
        }
    }
    // Tear down per-plane buffer state; presumably leaves the device fd open
    // so the encoder can be reconfigured — TODO confirm it releases all
    // queued-buffer memory (the observed leak suggests it may not).
    ctx.enc->yuv_plane.deinitPlane();
    ctx.enc->bitstream_plane.deinitPlane();
    // If the encoder is fully destroyed here instead, no leaks are evident:
    // delete ctx.enc;
    // ctx.enc = nullptr;
#if JETPACK_VAR == 4
    NvBufferSessionDestroy(_bufferSession);
    _bufferSession = nullptr;  // FIX: avoid dangling handle / double destroy
#endif
    mEncoderOn = false;
    HILogger::Debug("Closed encoder");
}
We use MMAP buffers on the bitstream plane (encoded packets) and DMA buffers on the output plane (input YUV frames). As above in the code block, if I completely destroy the ctx.enc
variable, I don’t see the leak occur.
I have looked and verified that all of our memory allocations/mapping etc is done in pairs. For example:
mapOutputBuffers() → unmapOutputBuffers()
setupPlane() → deinitPlane()
NvBufferCreateEx → NvBufferDestroy()
However, there is one method I’m unsure about:
NvBufferMemSyncForDevice()
there doesn’t seem to be a release equivalent.
So, could you please advise how I can track this down, or whether what I'm trying to do is even possible (it appears so, as streaming works well). I've searched for ways to list active DMA file descriptors but cannot seem to find anything, and I have a feeling it's a DMA memory leak, or something I'm not calling. The main code difference when I destroy the ctx.enc
is that it calls v4l2_close(fd);
and fd = v4l2_open(dev_node, flags | O_RDWR);
Any assistance is appreciated.