I have a large multi-GPU application that mixes OpenACC kernels with CUDA kernels, and I'm running into errors related to the CUDA context.
I've been able to reduce the original code to the following test case:
#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

#define SIZE (128*1024*1024)

void init(int* v, const int nelems, int streamId)
{
#pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        v[i] = i;
}

void vecadd(const int* restrict a, const int* restrict b, int* restrict c, const int nelems, int streamId)
{
#pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        c[i] = a[i] + b[i];
}

int main(int argc, char* argv[])
{
    if (argc != 2) return -1;

#pragma acc init device_type(acc_device_nvidia)

    const int inp_gpus = atoi(argv[1]);
    const int max_gpus = acc_get_num_devices(acc_device_nvidia);
    const int ngpus = (inp_gpus > max_gpus) ? max_gpus : inp_gpus;
    const int nelems = SIZE;
    printf("NGPUS %d\n", ngpus);

    /* heap-allocate: at 512 MB per row these are far too large for the stack */
    int (*a)[nelems] = malloc(sizeof(int[ngpus][nelems]));
    int (*b)[nelems] = malloc(sizeof(int[ngpus][nelems]));
    int (*c)[nelems] = malloc(sizeof(int[ngpus][nelems]));
    cudaStream_t stream[ngpus];

    /* one stream per device, created through the CUDA runtime */
#pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        //acc_set_device_num(gpuId, acc_device_nvidia);
        cudaSetDevice(gpuId);
        cudaStreamCreate(&stream[gpuId]);
    }

    /* initialize a and b on each device, on that device's stream */
#pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu - init a & b %d\n", gpuId);
        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]); /* map async queue gpuId onto the CUDA stream */
        init(a[gpuId], nelems, gpuId);
        init(b[gpuId], nelems, gpuId);
    }

#pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu vecadd %d\n", gpuId);
        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]);
        vecadd(a[gpuId], b[gpuId], c[gpuId], nelems, gpuId);
    }

    /* a bare "acc wait" only drains the calling thread's current device,
       so wait on each device in turn */
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        acc_set_device_num(gpuId, acc_device_nvidia);
#pragma acc wait
    }

    for (int gpuId = 0; gpuId < ngpus; gpuId++)
        for (int i = 0; i < 2; i++)
            printf("c[%d][%d] %d\n", gpuId, i, c[gpuId][i]);

#pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
        cudaStreamDestroy(stream[gpuId]);

    free(a); free(b); free(c);
    return 0;
}
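While reducing the code I started to suspect that the context cudaSetDevice makes current is not the same one the OpenACC runtime uses. Here is a small probe I put together to compare the two (a sketch, not verified on my setup; it assumes the acc_get_current_cuda_context interop routine from the OpenACC spec's CUDA appendix, and it links against the driver API for cuCtxGetCurrent):

/* probe.c: does cudaSetDevice leave the same context current as the
   one the OpenACC runtime uses? (sketch) */
#include <openacc.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

int main(void)
{
    CUcontext rt_ctx;

    cudaSetDevice(0);
    cudaFree(0);                /* force lazy CUDA runtime initialization */
    cuCtxGetCurrent(&rt_ctx);   /* the runtime's primary context */

    acc_set_device_num(0, acc_device_nvidia);
    void *acc_ctx = acc_get_current_cuda_context(); /* OpenACC's context */

    printf("runtime ctx %p, openacc ctx %p -> %s\n",
           (void *)rt_ctx, acc_ctx,
           (void *)rt_ctx == acc_ctx ? "same" : "different");
    return 0;
}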
The error I'm getting (running under cuda-memcheck):
========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxDestroy_v2.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so (cuCtxDestroy_v2 + 0x143) [0x1e96b3]
========= Host Frame:/usr/local/pgi/linux86-64/16.10/lib/libaccnmp.so (__pgi_uacc_cuda_release_buffer + 0xdc) [0x5459]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x35910]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x3596a]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf8) [0x202b8]
========= Host Frame:./openacc-test [0x13fa]
=========
Failing in Thread:1
call to cuCtxDestroy returned error 201: Invalid context
========= ERROR SUMMARY: 1 error
My question is: is it safe to use cudaSetDevice instead of acc_set_device_num?
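If the answer is no, I assume the fallback is to let the OpenACC runtime own the contexts and queues, and to pull the underlying CUDA stream out with acc_get_cuda_stream whenever a native CUDA kernel has to share it. A minimal sketch of the pattern I have in mind (acc_get_cuda_stream is from the same interop appendix; the fill loop is just a stand-in for my real kernels):

#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h>

#define N (1<<20)

int main(void)
{
    const int ngpus = acc_get_num_devices(acc_device_nvidia);
    float *x = malloc((size_t)ngpus * N * sizeof *x);

#pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        float *v = x + (size_t)gpuId * N;
        acc_set_device_num(gpuId, acc_device_nvidia); /* OpenACC owns the context */

#pragma acc kernels async(gpuId) copyout(v[0:N])
        for (int i = 0; i < N; i++)
            v[i] = 2.0f * i;

        /* the CUDA stream backing async queue gpuId; native CUDA kernels
           would be launched on this stream instead of a private one */
        cudaStream_t s = (cudaStream_t)acc_get_cuda_stream(gpuId);
        (void)s;

#pragma acc wait(gpuId)   /* drain this device's queue */
    }

    printf("x[1] = %.1f (expect 2.0)\n", x[1]);
    free(x);
    return 0;
}

That would keep every context creation inside the OpenACC runtime, which I'd hope avoids the cuCtxDestroy failure at exit, but I'd still like to know whether mixing in cudaSetDevice, as in the test case above, is defined behavior.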