CUDA & OpenACC interoperability: Device selection

I have a large multi-GPU application that mixes OpenACC kernels with other CUDA kernels, and I'm running into errors related to the CUDA context.

I've been able to reduce the original code to a test case:


#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include <stdlib.h>   /* for atoi */

#define SIZE   128*1024*1024

void init(int* v, const int nelems, int streamId)
{
   #pragma acc kernels async(streamId)
   for (int i = 0; i < nelems; i++)
      v[i] = i;
}

void vecadd(const int* restrict a, const int* restrict b, int* restrict c, const int nelems, int streamId)
{
    #pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        c[i] = a[i] + b[i];
}

int main(int argc, char* argv[])
{
    if (argc != 2) return -1;

    #pragma acc init device_type(acc_device_nvidia)

    const int inp_gpus = atoi(argv[1]);
    const int max_gpus = acc_get_num_devices(acc_device_nvidia);

    const int ngpus  = (inp_gpus > max_gpus) ? max_gpus : inp_gpus;
    const int nelems = SIZE;

    printf("NGPUS    %d\n", ngpus);

    int a[ngpus][nelems], b[ngpus][nelems], c[ngpus][nelems];

    cudaStream_t stream[ngpus];

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
       //acc_set_device_num(gpuId, acc_device_nvidia);
        cudaSetDevice(gpuId);
        cudaStreamCreate(&stream[gpuId]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu - int a & b %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        init(a[gpuId], nelems, gpuId);
        init(b[gpuId], nelems, gpuId);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu vecadd %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        vecadd(a[gpuId], b[gpuId], c[gpuId], nelems, gpuId);
    }

    #pragma acc wait

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        for (int i = 0; i < 2; i++)
            printf("c[%d][%d] %d\n", gpuId, i, c[gpuId][i]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        cudaStreamDestroy(stream[gpuId]);
    }

    return 0;
}

The error I'm getting:

========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxDestroy_v2. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so (cuCtxDestroy_v2 + 0x143) [0x1e96b3]
Failing in Thread:1
=========     Host Frame:/usr/local/pgi/linux86-64/16.10/lib/libaccnmp.so (__pgi_uacc_cuda_release_buffer + 0xdc) [0x5459]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x35910]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x3596a]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf8) [0x202b8]
=========     Host Frame:./openacc-test [0x13fa]
=========
call to cuCtxDestroy returned error 201: Invalid context
========= ERROR SUMMARY: 1 error

My question is: is it safe to use cudaSetDevice instead of acc_set_device_num?

Hi pfarre83876,

This isn't really an error, or at least not an unexpected one. This is the call the PGI runtime uses to test whether a CUDA context has already been created, and cuda-memcheck flags it as an error. The error is caught by the PGI runtime and does not affect execution.
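
As a side note, here's a minimal sketch of the general pattern (an illustration only, not the actual PGI runtime source): a driver API call returns an error code, the caller checks and handles it, and execution continues normally, yet cuda-memcheck still reports the failing call.

#include <cuda.h>
#include <stdio.h>

int main(void)
{
    cuInit(0);
    CUcontext ctx = NULL;             /* no context was ever created */
    CUresult rc = cuCtxDestroy(ctx);  /* fails, but the error code is returned to the caller */
    if (rc != CUDA_SUCCESS)
        printf("cuCtxDestroy returned %d -- handled, execution continues\n", (int)rc);
    return 0;  /* the program succeeds; cuda-memcheck still logs the failed driver call */
}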

However, the initial call to cudaSetDevice isn't needed, since acc_set_device_num calls cudaSetDevice as well. Nor are the later calls to acc_set_device_num needed, since the OpenMP threads are persistent between OpenMP regions and retain which GPU they are using.
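
If you want to convince yourself of both points, a quick per-thread check along these lines can be dropped into a later OpenMP region (just a sketch; acc_get_current_cuda_context is the PGI OpenACC interoperability routine):

#include <openacc.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

/* Report which device the CUDA runtime thinks this thread is bound to and
 * which CUDA context the OpenACC runtime is using. After calling
 * acc_set_device_num(gpuId, acc_device_nvidia) in an earlier OpenMP region,
 * cudaGetDevice should report gpuId here, since each OpenMP thread keeps
 * its device binding across regions. */
void check_binding(int gpuId)
{
    int dev = -1;
    cudaGetDevice(&dev);
    void* ctx = acc_get_current_cuda_context();
    printf("thread %d: cudaGetDevice=%d, OpenACC CUDA context=%p\n", gpuId, dev, ctx);
}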

I cleaned up the code with these changes (see below).

Hope this helps,
Mat

% cat test.c
#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>

#include <stdio.h>

#define SIZE   128*1024*1024

void init(int* v, const int nelems, int streamId)
{
   #pragma acc kernels async(streamId)
   for (int i = 0; i < nelems; i++)
      v[i] = i;
}

void vecadd(const int* restrict a, const int* restrict b, int* restrict c, const int nelems, int streamId)
{
    #pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        c[i] = a[i] + b[i];
}

int main(int argc, char* argv[])
{
    if (argc != 2) return -1;

    const int inp_gpus = atoi(argv[1]);
    const int max_gpus = acc_get_num_devices(acc_device_nvidia);

    const int ngpus  = (inp_gpus > max_gpus) ? max_gpus : inp_gpus;
    const int nelems = SIZE;

    printf("NGPUS    %d\n", ngpus);

    int a[ngpus][nelems], b[ngpus][nelems], c[ngpus][nelems];

    cudaStream_t stream[ngpus];

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu - int a & b %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        cudaStreamCreate(&stream[gpuId]);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        init(a[gpuId], nelems, gpuId);
        init(b[gpuId], nelems, gpuId);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu vecadd %d\n", gpuId);
        vecadd(a[gpuId], b[gpuId], c[gpuId], nelems, gpuId);
    }

    #pragma acc wait

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        for (int i = 0; i < 2; i++)
            printf("c[%d][%d] %d\n", gpuId, i, c[gpuId][i]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        cudaStreamDestroy(stream[gpuId]);
    }

    return 0;
}

% pgcc -mp -acc -Minfo -fast test.c -ta=tesla:cc60 -Mcuda -V16.10
init:
     11, Generating implicit copyout(v[:nelems])
     12, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
vecadd:
     18, Generating implicit copyin(a[:nelems],b[:nelems])
         Generating implicit copyout(c[:nelems])
     19, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
     40, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     52, Parallel region terminated
         Barrier
     53, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     59, Parallel region terminated
         Barrier
     62, Parallel region activated
         Parallel loop activated with static cyclic schedule
     64, Loop not vectorized/parallelized: contains call
     68, Parallel region terminated
         Barrier
     69, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     74, Parallel region terminated
         Barrier

% cuda-memcheck a.out 2
========= CUDA-MEMCHECK
NGPUS    2
-- gpu - int a & b 0
-- gpu - int a & b 1
-- gpu vecadd 0
-- gpu vecadd 1
c[0][0] 0
c[0][1] 2
c[1][0] 0
c[1][1] 2
========= ERROR SUMMARY: 0 errors