CUDA & OpenACC interoperability: Device selection

I have a large multi-GPU application that mixes OpenACC kernels with other CUDA kernels, and I'm running into errors related to the CUDA context.

I've been able to reduce the original code to a test case:


#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include <stdlib.h>   /* for atoi */

#define SIZE   128*1024*1024

void init(int* v, const int nelems, int streamId)
{
   #pragma acc kernels async(streamId)
   for (int i = 0; i < nelems; i++)
      v[i] = i;
}

void vecadd(const int* restrict a, const int* restrict b, int* restrict c, const int nelems, int streamId)
{
    #pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        c[i] = a[i] + b[i];
}

int main(int argc, char* argv[])
{
    if (argc != 2) return -1;

    #pragma acc init device_type(acc_device_nvidia)

    const int inp_gpus = atoi(argv[1]);
    const int max_gpus = acc_get_num_devices(acc_device_nvidia);

    const int ngpus  = (inp_gpus > max_gpus) ? max_gpus : inp_gpus;
    const int nelems = SIZE;

    printf("NGPUS    %d\n", ngpus);

    int a[ngpus][nelems], b[ngpus][nelems], c[ngpus][nelems];

    cudaStream_t stream[ngpus];

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
       //acc_set_device_num(gpuId, acc_device_nvidia);
        cudaSetDevice(gpuId);
        cudaStreamCreate(&stream[gpuId]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu - int a & b %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        init(a[gpuId], nelems, gpuId);
        init(b[gpuId], nelems, gpuId);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu vecadd %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        vecadd(a[gpuId], b[gpuId], c[gpuId], nelems, gpuId);
    }

    #pragma acc wait

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        for (int i = 0; i < 2; i++)
            printf("c[%d][%d] %d\n", gpuId, i, c[gpuId][i]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        cudaStreamDestroy(stream[gpuId]);
    }

    return 0;
}

The error I'm getting:

========= Program hit CUDA_ERROR_INVALID_CONTEXT (error 201) due to "invalid device context" on CUDA API call to cuCtxDestroy_v2. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so (cuCtxDestroy_v2 + 0x143) [0x1e96b3]
Failing in Thread:1
=========     Host Frame:/usr/local/pgi/linux86-64/16.10/lib/libaccnmp.so (__pgi_uacc_cuda_release_buffer + 0xdc) [0x5459]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x35910]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 [0x3596a]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf8) [0x202b8]
=========     Host Frame:./openacc-test [0x13fa]
=========
call to cuCtxDestroy returned error 201: Invalid context
========= ERROR SUMMARY: 1 error

My question is: is it safe to use cudaSetDevice instead of acc_set_device_num?

Hi pfarre83876,

This isn't really an error, or at least not an unexpected one. This is the call the PGI runtime uses to test whether a CUDA context has already been created, and cuda-memcheck flags it as an error. The error is caught by the PGI runtime and does not affect execution.
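
As a side note, here's a minimal sketch of the general pattern (an illustration only, not the actual PGI runtime source): a driver API call returns an error code, the caller checks and handles it, and execution continues normally, yet cuda-memcheck still reports the failing call.

#include <cuda.h>
#include <stdio.h>

int main(void)
{
    cuInit(0);
    CUcontext ctx = NULL;             /* no context was ever created */
    CUresult rc = cuCtxDestroy(ctx);  /* fails, but the error code is returned to the caller */
    if (rc != CUDA_SUCCESS)
        printf("cuCtxDestroy returned %d -- handled, execution continues\n", (int)rc);
    return 0;  /* the program succeeds; cuda-memcheck still logs the failed driver call */
}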

However, the initial call to cudaSetDevice isn't needed, since acc_set_device_num calls cudaSetDevice as well. Nor are the later calls to acc_set_device_num needed, since the OpenMP threads are persistent between OpenMP regions and retain which GPU they are using.
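
If you want to convince yourself of both points, a quick per-thread check along these lines can be dropped into a later OpenMP region (just a sketch; acc_get_current_cuda_context is the PGI OpenACC interoperability routine):

#include <openacc.h>
#include <cuda_runtime_api.h>
#include <stdio.h>

/* Report which device the CUDA runtime thinks this thread is bound to and
 * which CUDA context the OpenACC runtime is using. After calling
 * acc_set_device_num(gpuId, acc_device_nvidia) in an earlier OpenMP region,
 * cudaGetDevice should report gpuId here, since each OpenMP thread keeps
 * its device binding across regions. */
void check_binding(int gpuId)
{
    int dev = -1;
    cudaGetDevice(&dev);
    void* ctx = acc_get_current_cuda_context();
    printf("thread %d: cudaGetDevice=%d, OpenACC CUDA context=%p\n", gpuId, dev, ctx);
}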

I cleaned up the code with these changes (see below).

Hope this helps,
Mat

% cat test.c
#include <openacc.h>
#include <omp.h>
#include <cuda_runtime_api.h>

#include <stdio.h>

#define SIZE   128*1024*1024

void init(int* v, const int nelems, int streamId)
{
   #pragma acc kernels async(streamId)
   for (int i = 0; i < nelems; i++)
      v[i] = i;
}

void vecadd(const int* restrict a, const int* restrict b, int* restrict c, const int nelems, int streamId)
{
    #pragma acc kernels async(streamId)
    for (int i = 0; i < nelems; i++)
        c[i] = a[i] + b[i];
}

int main(int argc, char* argv[])
{
    if (argc != 2) return -1;

    const int inp_gpus = atoi(argv[1]);
    const int max_gpus = acc_get_num_devices(acc_device_nvidia);

    const int ngpus  = (inp_gpus > max_gpus) ? max_gpus : inp_gpus;
    const int nelems = SIZE;

    printf("NGPUS    %d\n", ngpus);

    int a[ngpus][nelems], b[ngpus][nelems], c[ngpus][nelems];

    cudaStream_t stream[ngpus];

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu - int a & b %d\n", gpuId);

        acc_set_device_num(gpuId, acc_device_nvidia);
        cudaStreamCreate(&stream[gpuId]);
        acc_set_cuda_stream(gpuId, stream[gpuId]);

        init(a[gpuId], nelems, gpuId);
        init(b[gpuId], nelems, gpuId);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        printf("-- gpu vecadd %d\n", gpuId);
        vecadd(a[gpuId], b[gpuId], c[gpuId], nelems, gpuId);
    }

    #pragma acc wait

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        for (int i = 0; i < 2; i++)
            printf("c[%d][%d] %d\n", gpuId, i, c[gpuId][i]);
    }

    #pragma omp parallel for schedule(static,1)
    for (int gpuId = 0; gpuId < ngpus; gpuId++)
    {
        cudaStreamDestroy(stream[gpuId]);
    }

    return 0;
}

% pgcc -mp -acc -Minfo -fast test.c -ta=tesla:cc60 -Mcuda -V16.10
init:
     11, Generating implicit copyout(v[:nelems])
     12, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         12, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
vecadd:
     18, Generating implicit copyin(a[:nelems],b[:nelems])
         Generating implicit copyout(c[:nelems])
     19, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
main:
     40, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     52, Parallel region terminated
         Barrier
     53, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     59, Parallel region terminated
         Barrier
     62, Parallel region activated
         Parallel loop activated with static cyclic schedule
     64, Loop not vectorized/parallelized: contains call
     68, Parallel region terminated
         Barrier
     69, Parallel region activated
         Parallel loop activated with static cyclic schedule
         Loop not vectorized/parallelized: contains call
     74, Parallel region terminated
         Barrier

% cuda-memcheck a.out 2
========= CUDA-MEMCHECK
NGPUS    2
-- gpu - int a & b 0
-- gpu - int a & b 1
-- gpu vecadd 0
-- gpu vecadd 1
c[0][0] 0
c[0][1] 2
c[1][0] 0
c[1][1] 2
========= ERROR SUMMARY: 0 errors