Kernels not running concurrently in different dedicated streams

I am having an issue getting different kernels to run concurrently on dedicated non-blocking CUDA streams. However, multiple instances of the same kernel do run in parallel when scheduled on multiple CUDA streams.

The code below has 4 persistent kernels (Rx0PktHandlerKernel, Rx1PktHandlerKernel, Rx2PktHandlerKernel and Rx3PktHandlerKernel) which get the data to process from a 5th kernel, SimulatePktTicksKernel, at a regular interval of 8.33 us.
All 5 kernels are scheduled on separate, dedicated, non-blocking CUDA streams. I need all 5 kernels to run concurrently but am having trouble getting that behaviour.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>
#include <Windows.h>

using namespace std;
#define NUM_PKTS  (60)
static const int s_NumRxAnts = 4;
static const int s_NumSlots = 1;

cudaStream_t aPktStream[s_NumRxAnts];
cudaStream_t PktTickStream;
volatile bool* apbPktHandlerPolled_h[s_NumRxAnts];
__device__ volatile bool abPktDataRdy_d[s_NumRxAnts] = { false };

__global__ void Rx0PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[0] if data is available
   }
}

__global__ void Rx1PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[1] if data is available
   }
}

__global__ void Rx2PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[2] if data is available
   }
}

__global__ void Rx3PktHandlerKernel(volatile bool* pControlSig)
{
   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[3] if data is available
   }
}

__global__ void SimulatePktTicksKernel(const int NumSlots)
{
   for (int pkt = 0; pkt < NUM_PKTS * NumSlots; pkt++)
   {
      for (int rx = 0; rx < s_NumRxAnts; rx++)
      {
         abPktDataRdy_d[rx] = true;
      }

      __nanosleep(8333);
   }
}

void AllocDeviceMemory()
{
   cudaError_t CudaStatus;

   int AllocFlags = cudaHostAllocMapped | cudaHostAllocWriteCombined;
   cout << "AllocFlags=" << AllocFlags << endl;

   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      cudaStreamCreateWithFlags(&aPktStream[rx], cudaStreamNonBlocking);

      CudaStatus = cudaHostAlloc(&apbPktHandlerPolled_h[rx], sizeof(bool), AllocFlags);
      if (CudaStatus != cudaSuccess)
      {
         cout << "Pinned memory alloc of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
      }
      *apbPktHandlerPolled_h[rx] = false;
   }

   cudaStreamCreateWithFlags(&PktTickStream, cudaStreamNonBlocking);
}

void FreeDeviceMemory()
{
   cudaError_t CudaStatus;

   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      cudaStreamDestroy(aPktStream[rx]);

      CudaStatus = cudaFreeHost((void*)apbPktHandlerPolled_h[rx]);
      if (CudaStatus != cudaSuccess)
      {
         cout << "cudaFreeHost of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
      }
   }

   cudaStreamDestroy(PktTickStream);
}

void SpawnPktHandlerKernels()
{
   // switch on the control signals to keep the persistent kernels going
   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
       *apbPktHandlerPolled_h[rx] = true;
   }

   // launch the persistent kernels on dedicated streams
   dim3 gridDim(1, 1, 1);
   dim3 blockDim(1, 1, 1);
   cudaError_t CudaStatus;
   volatile bool* apbPktHandlerPolled_d[s_NumRxAnts];
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[0], (void*)apbPktHandlerPolled_h[0], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[1], (void*)apbPktHandlerPolled_h[1], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[2], (void*)apbPktHandlerPolled_h[2], 0);
   CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[3], (void*)apbPktHandlerPolled_h[3], 0);
   if (CudaStatus != cudaSuccess)
   {
      cout << "cudaHostGetDevicePointer() failed! CUDA error=" << cudaGetErrorName(CudaStatus) << endl;
   }

   Rx0PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[0]>>>(apbPktHandlerPolled_d[0]);
   Rx1PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[1]>>>(apbPktHandlerPolled_d[1]);
   Rx2PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[2]>>>(apbPktHandlerPolled_d[2]);
   Rx3PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[3]>>>(apbPktHandlerPolled_d[3]);

   SimulatePktTicksKernel<<<gridDim, blockDim, 0, PktTickStream>>>(s_NumSlots);
}

void HaltPktHandlerKernels()
{
   // switch off the control signals to halt the persistent kernels
   for (int rx = 0; rx < s_NumRxAnts; rx++)
   {
      *apbPktHandlerPolled_h[rx] = false;
   }

   cudaError_t CudaStatus = cudaDeviceSynchronize();
   if (CudaStatus != cudaSuccess)
   {
      cout << "Device synchronization failed" << endl;
   }
}

int main()
{
   AllocDeviceMemory();

   SpawnPktHandlerKernels();

   Sleep(100); // Let the empty persistent kernels run for a bit

   HaltPktHandlerKernels();

   FreeDeviceMemory();

   return 1;
}

With the above piece of code, all 5 kernels run sequentially. See the NSYS snippet below.

But if I launch the same kernel Rx0PktHandlerKernel in all 4 streams, those instances run in parallel, while the 5th kernel SimulatePktTicksKernel, which has a different name, only starts after the first four kernels finish. See the NSYS snippet below.

So it appears the kernels need to be identical in order to run concurrently, which is a bit odd.
What am I missing here? I have seen posts on related topics, but the kernels running in parallel in all those examples are the same, which ties in with my observation.
My question is: is there really a restriction in CUDA that the kernels must be the same for them to run in parallel?

I am using an RTX A4000 with the flag --default-stream per-thread.

The first time you call those kernels, they are going to run sequentially because of CUDA lazy initialization/lazy module loading. This sort of question comes up from time to time, so you can find various examples. Here is one. Also see here.

Lazy module loading is in effect by default on Windows on CUDA versions 12.3 and newer.
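If you want to confirm which mode is actually in effect for your setup, one option (a minimal sketch, assuming a toolkit recent enough to expose the driver-API query) is cuModuleGetLoadingMode(); the mode can also be overridden per process with the CUDA_MODULE_LOADING environment variable (e.g. set to EAGER) before the application starts.

#include <cuda.h>     // driver API; link against the CUDA driver library
#include <iostream>

void PrintModuleLoadingMode()
{
   cuInit(0); // make sure the driver is initialized before querying

   CUmoduleLoadingMode Mode;
   if (cuModuleGetLoadingMode(&Mode) == CUDA_SUCCESS)
   {
      std::cout << "Module loading mode: "
                << (Mode == CU_MODULE_LAZY_LOADING ? "LAZY" : "EAGER") << std::endl;
   }
}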

Even after you address that, WDDM behavior can sometimes make the desired concurrency somewhat difficult to witness. You might also try both settings of Windows Hardware-Accelerated GPU Scheduling. But if it were me, I would address the lazy loading topic first. A simple way to do that is to add some code to each kernel that checks a flag and just exits, and to call each kernel once with that exit flag set. That gets each kernel's module loaded. Later, when you call the kernels with the flag not set, you can/may witness “normal” behavior.
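A minimal sketch of that warm-up idea (the extra bWarmUpOnly parameter is illustrative, not part of the original code):

// Persistent kernel with an extra exit flag; a first launch with the flag set
// does nothing except force the kernel's module to be loaded.
__global__ void Rx0PktHandlerKernel(volatile bool* pControlSig, bool bWarmUpOnly)
{
   if (bWarmUpOnly)
   {
      return; // warm-up launch: the module is now loaded, nothing else to do
   }

   while (*pControlSig)
   {
      // @todo check abPktDataRdy_d[0] if data is available
   }
}

// At startup, before the real launches (repeat for the other four kernels):
//    Rx0PktHandlerKernel<<<1, 1>>>(nullptr, true);
//    cudaDeviceSynchronize();
// The later persistent launches pass bWarmUpOnly = false and can then run
// concurrently, since their modules are already loaded.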

Many thanks Robert, that was quite useful. I see how lazy module loading is affecting my persistent kernels now.

Checking a global flag in the first call, before actually launching the kernels, has fixed my issue and I can see the expected behaviour now.

Alternatively, the same behaviour can also be achieved by using the cudaFuncGetAttributes() API, as recommended here. I have verified it.
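For reference, a minimal sketch of that alternative (the helper name is illustrative; the kernels are the ones from the code above):

void PreloadKernelModules()
{
   cudaFuncAttributes Attr;

   // Querying each kernel's attributes once at startup forces its module to be
   // loaded, so the persistent launches are no longer serialized by lazy loading.
   cudaFuncGetAttributes(&Attr, Rx0PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx1PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx2PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, Rx3PktHandlerKernel);
   cudaFuncGetAttributes(&Attr, SimulatePktTicksKernel);
}

Calling something like this once, before SpawnPktHandlerKernels(), should be enough.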
