Continuing the discussion from Nano not using GPU with gstreamer/python. Slow FPS, dropped frames:
Hi,
Please check if CUDA_VER is correctly set. You can run $ ls /usr/local/ to get the version.
Dear DaneLL,
I have verified the CUDA version; in my case I am using 11.4.
gst_cv_gpumat.zip (3.0 KB)
I have no problems during `make`, but when I run the executable it shows this error.
CUDA : 11.4
Platform: Jetson xavier NX
Thanks in advance
Hi,
The sample is for Jetpack 4 release and does not work on Jetpack 5, since Nvbuffer APIs are deprecated on Jetpack 5. You would need to replace the Nvbuffer APIs with NvBufSurface APIs.
Thanks for the quick update.
Is it like this?
But I got the error…
@DaneLLL
Hi,
You may refer to this patch to use the NvBufSurface APIs:
How to create opencv gpumat from nvstream? - #18 by DaneLLL
We will also check and try to have a sample for Jetpack 5. Not sure which version you are using. It would be great if you can use latest Jetpack 5.1.2.
Thanks Dane,
#include <cstdlib>
#include <cstring>
#include <sstream>
#include <gst/gst.h>
#include "nvbuf_utils.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudaEGL.h>
#include <opencv2/cudafilters.hpp>
#include <iostream>
using namespace std;
// Evaluates its argument as a void expression so that an otherwise unused
// variable does not trigger an "unused" warning.
#define USE(x) ((void)(x))
// Pipeline created in main() by gst_parse_launch() from launch_string.
static GstPipeline *gst_pipeline = nullptr;
// Textual pipeline description assembled in main().
static string launch_string;
// Scale factor used by main() as g_usleep(30*usec).
// NOTE(review): presumably "microseconds per second", i.e. a 30 s run — confirm.
GstClockTime usec = 1000000;
// Frame width/height: inserted into the camera caps in main() and used as the
// GpuMat dimensions in the pad probe.
static int w = 1920;
static int h = 1080;
// EGL display obtained and initialized in main(); the pad probe uses it to
// create and destroy the per-buffer EGL image.
EGLDisplay egl_display;
// True until the pad probe has created `filter` (done once, on first use).
static bool create_filter = true;
// OpenCV CUDA filter that the pad probe applies in place to every frame.
static cv::Ptr< cv::cuda::Filter > filter;
static GstPadProbeReturn
conv_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info,
gpointer u_data)
{
GstBuffer *buffer = (GstBuffer *) info->data;
GstMapInfo map = {0};
int dmabuf_fd = 0;
CUresult status;
gst_buffer_map (buffer, &map, GST_MAP_READ);
ExtractFdFromNvBuffer((void *)map.data, &dmabuf_fd);
//CUDA postprocess
{
EGLImageKHR egl_image;
egl_image = NvEGLImageFromFd(egl_display, dmabuf_fd);
CUresult status;
CUeglFrame eglFrame;
CUgraphicsResource pResource = NULL;
cudaFree(0);
// status = cuGraphicsEGLRegisterImage(&pResource,
// egl_image,
// CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
// if (status != CUDA_SUCCESS)
// {
// printf("cuGraphicsEGLRegisterImage failed: %d \n",status);
// }
// status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
// status = cuCtxSynchronize();
status = cuGraphicsEGLRegisterImage(&pResource,
egl_image,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
if (status != CUDA_SUCCESS)
{
printf("cuGraphicsEGLRegisterImage failed: %d \n",status);
}
status = cuGraphicsResourceGetMappedEglFrame(&eglFrame, pResource, 0, 0);
status = cuCtxSynchronize();
if (create_filter) {
filter = cv::cuda::createSobelFilter(CV_8UC4, CV_8UC4, 1, 0, 3, 1, cv::BORDER_DEFAULT);
//filter = cv::cuda::createGaussianFilter(CV_8UC4, CV_8UC4, cv::Size(31,31), 0, 0, cv::BORDER_DEFAULT);
create_filter = false;
}
cv::cuda::GpuMat d_mat(h, w, CV_8UC4, eglFrame.frame.pPitch[0]);
filter->apply (d_mat, d_mat);
status = cuCtxSynchronize();
status = cuGraphicsUnregisterResource(pResource);
NvDestroyEGLImage(egl_display, egl_image);
}
gst_buffer_unmap(buffer, &map);
return GST_PAD_PROBE_OK;
}
int main(int argc, char** argv) {
USE(argc);
USE(argv);
gst_init (&argc, &argv);
GMainLoop *main_loop;
main_loop = g_main_loop_new (NULL, FALSE);
ostringstream launch_stream;
egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
eglInitialize(egl_display, NULL, NULL);
launch_stream
<< "nvarguscamerasrc sensor-id=0 ! "
<< "video/x-raw(memory:NVMM),width="<< w <<",height="<< h <<",framerate=30/1,format=NV12 ! "
<< "nvvidconv name=myconv ! "
<< "video/x-raw(memory:NVMM),format=RGBA ! "
<< "nvoverlaysink ";
launch_string = launch_stream.str();
g_print("Using launch string: %s\n", launch_string.c_str());
GError *error = nullptr;
gst_pipeline = (GstPipeline*) gst_parse_launch(launch_string.c_str(), &error);
if (gst_pipeline == nullptr) {
g_print( "Failed to parse launch: %s\n", error->message);
return -1;
}
if(error) g_error_free(error);
GstElement* conv = gst_bin_get_by_name(GST_BIN(gst_pipeline), "myconv");
GstPad *src_pad = gst_element_get_static_pad (conv, "src");
gst_pad_add_probe (src_pad, GST_PAD_PROBE_TYPE_BUFFER,
conv_src_pad_buffer_probe, NULL, NULL);
gst_element_set_state((GstElement*)gst_pipeline, GST_STATE_PLAYING);
g_usleep(30*usec);
GstElement* src = gst_bin_get_by_name(GST_BIN(gst_pipeline), "mysource");
gst_element_send_event (src, gst_event_new_eos ());
// Wait for EOS message
GstBus *bus = gst_pipeline_get_bus(GST_PIPELINE(gst_pipeline));
gst_bus_poll(bus, GST_MESSAGE_EOS, GST_CLOCK_TIME_NONE);
gst_element_set_state((GstElement*)gst_pipeline, GST_STATE_NULL);
gst_object_unref(GST_OBJECT(gst_pipeline));
g_main_loop_unref(main_loop);
eglTerminate(egl_display);
g_print("going to exit \n");
return 0;
}
I have edited the code, but I am still facing an error at run time.
Hi, one more thing: I am not using DeepStream. The given code extracts the frame captured by nvarguscamerasrc and wraps it in a cv::cuda::GpuMat directly from NVMM memory.
@DaneLLL Sir can you please give me some input…Thanks in advance
Thanks very much. But when I tried to run it, it did not show any output.
@DaveYYY
Thanks in advance
Hi,
Sorry that I missed one thing here.
`nvoverlaysink` is deprecated on JetPack 5; please use `nv3dsink` instead.
launch_stream
<< "nvarguscamerasrc name=mysource ! "
<< "video/x-raw(memory:NVMM),width="<< w <<",height="<< h <<",framerate=30/1,format=NV12 ! "
<< "nvvidconv name=myconv ! "
<< "video/x-raw(memory:NVMM),format=RGBA ! "
<< "nv3dsink ";
gst_cv_gpumat.tbz2 (3.4 KB)
THANKZZZZZZZZZZZZZZZZZZZZZ…
you are an ultra saver…
I am trying to capture from the camera at 60 FPS, so I made a tweak to the code:
<< "nvarguscamerasrc name=mysource ! "
<< "video/x-raw(memory:NVMM),width="<< w <<",height="<< h <<",framerate=60/1,format=NV12 ! "
<< "nvvidconv name=myconv ! "
<< "video/x-raw(memory:NVMM),format=RGBA ! "
<< "nv3dsink ";
But it ends up in an error. Can you please give me some input on that? Thanks in advance @DaveYYY
Hi,
Please run your pipeline in gst-launch-1.0 to make sure it is valid first. And then apply to C code.
I have followed all of your suggestions and it works well.
Some more questions:
1. I am trying to run the NVMM code in one thread and need to place the GpuMat into a queue so that I can access it from another thread. How can I put the GpuMat into a normal queue? Is that possible, or is there a way to keep a queue of GpuMat objects on the GPU?
2. Which is faster for processing? In my case I am using 1080p RGB video. Is it a queue, shared memory, or is some other faster method available?
3. I want to blend an RGBA image over this GpuMat image. From the documentation I can see that the VIC (Video Image Compositor) can blend images. Can you please help me blend my RGBA image onto the normal image using the VIC engine, or point me to any available sample code?
Thanks in advance…
Hi,
The demonstration is to map NvBufSurface to a GpuMat. So a possible solution should be pass NvBufSurface to another thread, and map it to GpuMat. Please try this. Or see if other users can share experience in OpenCV programming.
There may be no existing implementation for this. Probably you can implement the function in CUDA code so that it can utilize GPU engine
You can try NvBufSurfTransformMultiInputBufCompositeBlend(). There is demonstration in
/usr/src/jetson_multimedia_api/samples/13_argus_multi_camera
Also can refer to source code of nvcompositor plugin.
Thanks for the quick update ,
I have an RGBA image that I need to blend onto a video (consider the background to be the video and the dice to be the RGBA image), like below.
Is that possible with the mentioned sample app, Dane?
Thanks in advance…
@DaveYYY @DaneLLL
Hi,
Transparent pixels are not supported. Please refer to discussion in the topic:
Nvcompositor plugs alpha is not work and alpha plugs no work
The whole image can only be blended with a single alpha value. In this use case the dice pixels have alpha=1 and all other pixels have alpha=0 (transparent); per-pixel alpha like this is not supported by the hardware converter.
Many Many thanks ,
Can you please suggest any other, faster method?
/**
 * Composites the file-level image `transImg` over `face_img` on the GPU and
 * returns the blended frame as a host cv::Mat.
 *
 * `transImg` is resized (nearest neighbour) to the size of `face_img`; its
 * first three channels are used as the overlay colour and its fourth channel
 * as the blend mask. The result is
 *     face * (255 - mask)/255 + overlay * mask/255
 * computed in float and converted back to 8-bit.
 *
 * NOTE(review): the multiplications assume `face_img` is a 3-channel 8-bit
 * image and `transImg` has at least 4 channels — confirm against the caller.
 *
 * @param face_img  background frame in device memory
 * @return blended frame downloaded to host memory
 */
cv::Mat overlayImages_cuda(const cv::cuda::GpuMat& face_img)
{
    // Bring the overlay to the same size as the background frame.
    cv::cuda::GpuMat scaledOverlay;
    // cv::resize(image_torric, resized_image_torric, face_img.size(), 0, 0, cv::INTER_NEAREST);
    cv::cuda::resize(transImg, scaledOverlay, face_img.size(), 0, 0, cv::INTER_NEAREST);

    // Separate colour planes from the mask plane (channel 3).
    std::vector<cv::cuda::GpuMat> planes;
    cv::cuda::split(scaledOverlay, planes);

    cv::cuda::GpuMat overlayColor;
    const std::vector<cv::cuda::GpuMat> colorPlanes(planes.begin(), planes.begin() + 3);
    cv::cuda::merge(colorPlanes, overlayColor);

    cv::cuda::GpuMat overlayWeight;
    planes[3].copyTo(overlayWeight);

    // The background weight is the complement of the overlay mask.
    cv::cuda::GpuMat backgroundWeight;
    cv::cuda::subtract(cv::Scalar(255), overlayWeight, backgroundWeight);

    // Background layer: normalise to [0,1] and weight by the inverted mask.
    cv::cuda::GpuMat backgroundLayer;
    face_img.convertTo(backgroundLayer, CV_32F, 1.0 / 255.0);
    backgroundWeight.convertTo(backgroundWeight, CV_32F, 1.0 / 255.0);
    cv::cuda::cvtColor(backgroundWeight, backgroundWeight, cv::COLOR_GRAY2BGR);
    cv::cuda::multiply(backgroundLayer, backgroundWeight, backgroundLayer);

    // Overlay layer: normalise to [0,1] and weight by the mask.
    cv::cuda::GpuMat overlayLayer;
    overlayColor.convertTo(overlayLayer, CV_32F, 1.0 / 255.0);
    overlayWeight.convertTo(overlayWeight, CV_32F, 1.0 / 255.0);
    cv::cuda::cvtColor(overlayWeight, overlayWeight, cv::COLOR_GRAY2BGR);
    cv::cuda::multiply(overlayLayer, overlayWeight, overlayLayer);

    // Sum both layers, scaling back to the 0..255 range, then go to 8-bit.
    cv::cuda::GpuMat blended;
    cv::cuda::addWeighted(backgroundLayer, 255.0, overlayLayer, 255.0, 0.0, blended);
    blended.convertTo(blended, CV_8U);

    // Copy the finished frame from device to host memory.
    cv::Mat hostFrame;
    blended.download(hostFrame);
    // cv::imshow("Object Detection", temp_frame_downloaded);
    return hostFrame;
}
I tried this method, but it takes around 50 ms to process a single frame (1920x1080 @ 30 fps). I want to reduce that to around 10 ms. Can you please give me some suggestions on how to do the blending?
Hi,
We don’t know if there is other existing solution for the use-case. Would need other users to share experience.
One more thing you may try is to run sudo jetson_clocks. This enables GPU engine at maximum clock. Please try and see if it brings enhancement.
Thanks for your kind response ,
As per your suggestion I tried maximizing the clock performance, but it did not make much difference.
Can you suggest any method other than the cv2.cuda.addWeighted API for blending an RGBA image with a normal image on NVIDIA hardware?
I have gone through a lot of articles, but in vain…
Your suggestions are really appreciated…
@DaveYYY @DaneLLL