Kernels not running concurrently in different dedicated streams

I am having an issue getting different kernels to run concurrently on dedicated non-blocking CUDA streams. However, multiple instances of the same kernel do run in parallel when scheduled on multiple CUDA streams.

The code below has 4 persistent kernels (Rx0PktHandlerKernel, Rx1PktHandlerKernel, Rx2PktHandlerKernel and Rx3PktHandlerKernel) which get the data to process from a 5th kernel, SimulatePktTicksKernel, at a regular interval of 8.33 us.
All 5 kernels are scheduled on separate, dedicated, non-blocking CUDA streams. I need all 5 kernels to run concurrently but am having trouble getting that behaviour.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>
#include <Windows.h>

using namespace std;
#define NUM_PKTS  (60)
static const int s_NumRxAnts = 4;
static const int s_NumSlots = 1;

cudaStream_t aPktStream[s_NumRxAnts];
cudaStream_t PktTickStream;
volatile bool* apbPktHandlerPolled_h[s_NumRxAnts];
__device__ volatile bool abPktDataRdy_d[s_NumRxAnts] = { false };

__global__ void Rx0PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[0] if data is available
   }
}

__global__ void Rx1PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[1] if data is available
   }
}

__global__ void Rx2PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[2] if data is available
   }
}

__global__ void Rx3PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[3] if data is available
   }
}

__global__ void SimulatePktTicksKernel(const int NumSlots)
{
   for (int pkt = 0; pkt < NUM_PKTS * NumSlots; pkt++)
   {
      for (int rx = 0; rx < s_NumRxAnts; rx++)
      {
         abPktDataRdy_d[rx] = true;
      }

      __nanosleep(8333);
   }
}

void AllocDeviceMemory()
{
   cudaError_t CudaStatus;

   int AllocFlags = cudaHostAllocMapped | cudaHostAllocWriteCombined;
   cout << "AllocFlags=" << AllocFlags << endl;

   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      cudaStreamCreateWithFlags(&aPktStream[rx], cudaStreamNonBlocking);

      CudaStatus = cudaHostAlloc(&apbPktHandlerPolled_h[rx], sizeof(bool), AllocFlags);
      if (CudaStatus != cudaSuccess)
      {
         cout << "Pinned memory alloc of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
      }
      *apbPktHandlerPolled_h[rx] = false;
   }

   cudaStreamCreateWithFlags(&PktTickStream, cudaStreamNonBlocking);
}

void FreeDeviceMemory()
{
   cudaError_t CudaStatus;

   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      cudaStreamDestroy(aPktStream[rx]);

      CudaStatus = cudaFreeHost((void*)apbPktHandlerPolled_h[rx]);
      if (CudaStatus != cudaSuccess)
      {
         cout << "cudaFreeHost of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
      }
   }

   cudaStreamDestroy(PktTickStream);
}

void SpawnPktHandlerKernels()
{
   // switch on the control signals to keep the persistent kernels going
   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
       *apbPktHandlerPolled_h[rx] = true;
   }

   // launch the persistent kernels on dedicated streams
   dim3 gridDim(1, 1, 1);
   dim3 blockDim(1, 1, 1);
   cudaError_t CudaStatus;
   volatile bool* apbPktHandlerPolled_d[s_NumRxAnts];
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[0], (void*)apbPktHandlerPolled_h[0], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[1], (void*)apbPktHandlerPolled_h[1], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[2], (void*)apbPktHandlerPolled_h[2], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[3], (void*)apbPktHandlerPolled_h[3], 0);
   if (CudaStatus != cudaSuccess)
   {
      cout << "cudaHostGetDevicePointer() failed! CUDA error=" << cudaGetErrorName(CudaStatus) << endl;
   }

   Rx0PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[0]>>>(apbPktHandlerPolled_d[0]);
   Rx1PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[1]>>>(apbPktHandlerPolled_d[1]);
   Rx2PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[2]>>>(apbPktHandlerPolled_d[2]);
   Rx3PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[3]>>>(apbPktHandlerPolled_d[3]);

   SimulatePktTicksKernel<<<gridDim, blockDim, 0, PktTickStream>>>(s_NumSlots);
}

void HaltPktHandlerKernels()
{
   // switch off the control signals to halt the persistent kernels
   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      *apbPktHandlerPolled_h[rx] = false;
   }

   cudaError_t CudaStatus = cudaDeviceSynchronize();
   if (CudaStatus != cudaSuccess)
   {
      cout << "Device synchronization failed" << endl;
   }
}

int main()
{
   AllocDeviceMemory();

   SpawnPktHandlerKernels();

   Sleep(100); // Let the empty persistent kernels run for a bit

   HaltPktHandlerKernels();

   FreeDeviceMemory();

   return 1;
}

With the above piece of code, all 5 kernels run sequentially. See the NSYS snippet below.

But if I launch the same kernel Rx0PktHandlerKernel in all 4 streams, those instances run in parallel, while the 5th kernel SimulatePktTicksKernel, which has a different name, only starts after the first four kernels finish. See the NSYS snippet below.

So it appears the kernels need to be identical in order to run concurrently, which is a bit odd.
What am I missing here? I have seen posts on related topics, but the kernels running in parallel in all those examples are the same, which ties in with my observation.
My question is: is there really a restriction in CUDA that the kernels must be the same for them to run in parallel?

I am using an RTX A4000 with the flag --default-stream per-thread.

The first time you call those kernels, they are going to run sequentially because of CUDA lazy initialization/lazy module loading. This sort of question comes up from time to time, so you can find various examples. Here is one. Also see here.

Lazy module loading is in effect by default on Windows on CUDA versions 12.3 and newer.
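If you want to confirm which mode is actually in effect for your setup, one option (a minimal sketch, assuming a toolkit recent enough to expose the driver-API query) is cuModuleGetLoadingMode(); the mode can also be overridden per process with the CUDA_MODULE_LOADING environment variable (e.g. set to EAGER) before the application starts.

#include <cuda.h>     // driver API; link against the CUDA driver library
#include <iostream>

void PrintModuleLoadingMode()
{
   cuInit(0); // make sure the driver is initialized before querying

   CUmoduleLoadingMode Mode;
   if (cuModuleGetLoadingMode(&Mode) == CUDA_SUCCESS)
   {
      std::cout << "Module loading mode: "
                << (Mode == CU_MODULE_LAZY_LOADING ? "LAZY" : "EAGER") << std::endl;
   }
}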

Even after you address that, WDDM behavior can sometimes make the desired concurrency somewhat difficult to witness. You might also try both settings of Windows Hardware-Accelerated GPU Scheduling. But if it were me, I would address the lazy loading topic first. A simple way to do that is to add some code to each kernel that checks a flag and just exits, and to call each kernel once with that exit flag set. That gets each kernel's module loaded. Later, when you call the kernels with the flag not set, you can/may witness “normal” behavior.
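A minimal sketch of that warm-up idea (the extra bWarmUpOnly parameter is illustrative, not part of the original code):

// Persistent kernel with an extra exit flag; a first launch with the flag set
// does nothing except force the kernel's module to be loaded.
__global__ void Rx0PktHandlerKernel(volatile bool* pControlSig, bool bWarmUpOnly)
{
   if (bWarmUpOnly)
   {
      return; // warm-up launch: the module is now loaded, nothing else to do
   }

   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[0] if data is available
   }
}

// At startup, before the real launches (repeat for the other four kernels):
//    Rx0PktHandlerKernel<<<1, 1>>>(nullptr, true);
//    cudaDeviceSynchronize();
// The later persistent launches pass bWarmUpOnly = false and can then run
// concurrently, since their modules are already loaded.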

Many thanks Robert, that was quite useful. I see how lazy module loading is affecting my persistent kernels now.

Checking a global flag in the first call, before actually launching the kernels, has fixed my issue and I can see the expected behaviour now.

Alternatively, the same behaviour can also be achieved by using the cudaFuncGetAttributes() API, as recommended here. I have verified it.
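For reference, a minimal sketch of that alternative (the helper name is illustrative; the kernels are the ones from the code above):

void PreloadKernelModules()
{
   cudaFuncAttributes Attr;

   // Querying each kernel's attributes once at startup forces its module to be
   // loaded, so the persistent launches are no longer serialized by lazy loading.
   cudaFuncGetAttributes(&Attr, Rx0PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx1PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx2PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx3PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, SimulatePktTicksKernel);
}

Calling something like this once, before SpawnPktHandlerKernels(), should be enough.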
