I am having an issue getting different kernels to run concurrently on dedicated non-blocking CUDA streams. The kernels do run in parallel if multiple instances of the same kernel are scheduled on multiple CUDA streams.
The code below has 4 persistent kernels (Rx0PktHandlerKernel, Rx1PktHandlerKernel, Rx2PktHandlerKernel and Rx3PktHandlerKernel) which get the data to process from a 5th kernel, SimulatePktTicksKernel, at a regular rate of 8.33 us.
All 5 kernels are scheduled on separate dedicated non-blocking CUDA streams. I need all 5 kernels to run concurrently, but I am having trouble getting that behaviour.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
#include <Windows.h>
using namespace std;
#define NUM_PKTS (60)
static const int s_NumRxAnts = 4;
static const int s_NumSlots = 1;
cudaStream_t aPktStream[s_NumRxAnts]; // one dedicated stream per Rx packet handler
cudaStream_t PktTickStream;           // dedicated stream for the tick-simulator kernel
volatile bool* apbPktHandlerPolled_h[s_NumRxAnts];                // host-mapped control flags polled by the handlers
__device__ volatile bool abPktDataRdy_d[s_NumRxAnts] = { false }; // set by SimulatePktTicksKernel, read by the handlers
__global__ void Rx0PktHandlerKernel(volatile bool* pControlSig)
{
    while (*pControlSig)
    {
        // @todo check abPktDataRdy_d[0] if data is available
    }
}
__global__ void Rx1PktHandlerKernel(volatile bool* pControlSig)
{
    while (*pControlSig)
    {
        // @todo check abPktDataRdy_d[1] if data is available
    }
}
__global__ void Rx2PktHandlerKernel(volatile bool* pControlSig)
{
    while (*pControlSig)
    {
        // @todo check abPktDataRdy_d[2] if data is available
    }
}
__global__ void Rx3PktHandlerKernel(volatile bool* pControlSig)
{
    while (*pControlSig)
    {
        // @todo check abPktDataRdy_d[3] if data is available
    }
}
__global__ void SimulatePktTicksKernel(const int NumSlots)
{
    for (int pkt = 0; pkt < NUM_PKTS * NumSlots; pkt++)
    {
        for (int rx = 0; rx < s_NumRxAnts; rx++)
        {
            abPktDataRdy_d[rx] = true;
        }
        __nanosleep(8333); // ~8.33 us packet tick
    }
}
void AllocDeviceMemory()
{
    cudaError_t CudaStatus;
    int AllocFlags = cudaHostAllocMapped | cudaHostAllocWriteCombined;
    cout << "AllocFlags=" << AllocFlags << endl;
    for (int rx = 0; rx < s_NumRxAnts; rx++)
    {
        cudaStreamCreateWithFlags(&aPktStream[rx], cudaStreamNonBlocking);
        CudaStatus = cudaHostAlloc(&apbPktHandlerPolled_h[rx], sizeof(bool), AllocFlags);
        if (CudaStatus != cudaSuccess)
        {
            cout << "Pinned memory alloc of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
        }
        *apbPktHandlerPolled_h[rx] = false;
    }
    cudaStreamCreateWithFlags(&PktTickStream, cudaStreamNonBlocking);
}
void FreeDeviceMemory()
{
    cudaError_t CudaStatus;
    for (int rx = 0; rx < s_NumRxAnts; rx++)
    {
        cudaStreamDestroy(aPktStream[rx]);
        CudaStatus = cudaFreeHost((void*)apbPktHandlerPolled_h[rx]);
        if (CudaStatus != cudaSuccess)
        {
            cout << "cudaFreeHost of control signals failed! RxAnt=" << rx << ", CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
        }
    }
    cudaStreamDestroy(PktTickStream);
}
void SpawnPktHandlerKernels()
{
    // switch on the control signals to keep the persistent kernels going
    for (int rx = 0; rx < s_NumRxAnts; rx++)
    {
        *apbPktHandlerPolled_h[rx] = true;
    }
    // launch the persistent kernels on dedicated streams
    dim3 gridDim(1, 1, 1);
    dim3 blockDim(1, 1, 1);
    cudaError_t CudaStatus;
    volatile bool* apbPktHandlerPolled_d[s_NumRxAnts];
    // fetch the device-side view of each host-mapped control flag, checking every call
    for (int rx = 0; rx < s_NumRxAnts; rx++)
    {
        CudaStatus = cudaHostGetDevicePointer((void**)&apbPktHandlerPolled_d[rx], (void*)apbPktHandlerPolled_h[rx], 0);
        if (CudaStatus != cudaSuccess)
        {
            cout << "cudaHostGetDevicePointer() failed! RxAnt=" << rx << ", CUDA error=" << cudaGetErrorName(CudaStatus) << endl;
        }
    }
    Rx0PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[0]>>>(apbPktHandlerPolled_d[0]);
    Rx1PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[1]>>>(apbPktHandlerPolled_d[1]);
    Rx2PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[2]>>>(apbPktHandlerPolled_d[2]);
    Rx3PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[3]>>>(apbPktHandlerPolled_d[3]);
    SimulatePktTicksKernel<<<gridDim, blockDim, 0, PktTickStream>>>(s_NumSlots);
}
void HaltPktHandlerKernels()
{
    // switch off the control signals to halt the persistent kernels
    for (int rx = 0; rx < s_NumRxAnts; rx++)
    {
        *apbPktHandlerPolled_h[rx] = false;
    }
    cudaError_t CudaStatus = cudaDeviceSynchronize();
    if (CudaStatus != cudaSuccess)
    {
        cout << "Device synchronization failed! CUDA error: " << cudaGetErrorName(CudaStatus) << endl;
    }
}
int main()
{
    AllocDeviceMemory();
    SpawnPktHandlerKernels();
    Sleep(100); // let the empty persistent kernels run for a bit
    HaltPktHandlerKernels();
    FreeDeviceMemory();
    return 0;
}
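For what it's worth, launch errors can be checked right after the five launches with something like this (a minimal sketch, not part of the listing above):
cudaError_t LaunchStatus = cudaGetLastError();
if (LaunchStatus != cudaSuccess)
{
    cout << "Kernel launch failed! CUDA error: " << cudaGetErrorName(LaunchStatus) << endl;
}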
With the above code, all 5 kernels run sequentially. See the NSYS snippet below.
But if I launch the same kernel, Rx0PktHandlerKernel, on all 4 streams, those instances run in parallel, while the 5th kernel SimulatePktTicksKernel starts only after the first four finish, since it is a different kernel. See the NSYS snippet below.
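For reference, that variant simply replaces the five launches above with the following (gridDim, blockDim, streams and arguments unchanged):
for (int rx = 0; rx < s_NumRxAnts; rx++)
{
    // same kernel on every stream: these four instances do overlap
    Rx0PktHandlerKernel<<<gridDim, blockDim, 0, aPktStream[rx]>>>(apbPktHandlerPolled_d[rx]);
}
SimulatePktTicksKernel<<<gridDim, blockDim, 0, PktTickStream>>>(s_NumSlots);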
So it appears the kernels need to be identical for them to run concurrently, which is a bit odd.
What am I missing here? I have seen posts on related topics, but the kernels running in parallel in all those examples are identical, which ties in with my observations.
My question is: is there really a restriction in CUDA that the kernels must be identical for them to run in parallel?
I am using an RTX A4000 and compiling with the flag --default-stream per-thread.
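For completeness, concurrent-kernel support can be queried with a minimal check like this (device 0 assumed):
int ConcurrentKernels = 0;
cudaDeviceGetAttribute(&ConcurrentKernels, cudaDevAttrConcurrentKernels, 0);
cout << "concurrentKernels=" << ConcurrentKernels << endl; // 1 means the device supports concurrent kernel execution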