MPI + Peer2Peer: combining MPI and CUDA Peer2Peer

Hello,

I can run a multithreaded code that uses cudaMemcpyPeerAsync with excellent performance (pthreads primitives for synchronization).
Now I want to use MPI instead of pthreads, because I don't want to rewrite a larger existing code in a mixed pthreads/MPI style.
The idea is to use MPI for synchronization and cudaMemcpyPeerAsync for the actual communication, like a one-sided communication setup.
I can enable Peer2Peer between the processes (each process holds one GPU).
I send the device pointers to the other processes with MPI, but cudaMemcpyPeerAsync then fails.
Do I need a unified address space to do that?
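
For reference, the pattern that works for me inside a single process is roughly this (a condensed sketch with placeholder names, not my actual code, and with no error checking). In the MPI version I tried to reproduce exactly this, only with the device pointers received from the other rank.

#include <cuda_runtime.h>

/* Condensed single-process sketch (placeholder names, no error checking):
   with both GPUs owned by one process, raw device pointers are valid in
   the whole process and cudaMemcpyPeerAsync works directly. */
void enable_p2p(int devA, int devB)
{
    cudaSetDevice(devA);
    cudaDeviceEnablePeerAccess(devB, 0);   /* devA may access devB's memory */
    cudaSetDevice(devB);
    cudaDeviceEnablePeerAccess(devA, 0);   /* and the other way round */
}

void copy_from_B_to_A(void* dstOnA, int devA, const void* srcOnB, int devB,
                      size_t bytes, cudaStream_t stream)
{
    cudaMemcpyPeerAsync(dstOnA, devA, srcOnB, devB, bytes, stream);
    cudaStreamSynchronize(stream);         /* here the pthreads version synchronizes */
}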

Thanks,
Jonas.

Are you using the cudaIpc interface available in CUDA 4.1? It is the only way to get P2P access between separate processes.
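
The basic flow looks roughly like this (a minimal two-rank sketch, assuming the GPU id equals the rank and omitting error checks, so it is not a drop-in replacement for your code):

/* Minimal two-rank sketch (not a drop-in fix): rank 1 exports its device
   allocation through an IPC handle, rank 0 maps it and reads from it.
   Assumes GPU id == rank; error checking is omitted for brevity. */
#include <mpi.h>
#include <cuda_runtime.h>
#include <stdio.h>

int main(int argc, char* argv[])
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (nprocs != 2) MPI_Abort(MPI_COMM_WORLD, 1);

    cudaSetDevice(rank);                       /* one GPU per rank */

    int value = rank, *d_buf = NULL;
    cudaMalloc((void**)&d_buf, sizeof(int));
    cudaMemcpy(d_buf, &value, sizeof(int), cudaMemcpyHostToDevice);

    /* Rank 1 creates a handle for its allocation; everybody receives it.
       Only the opaque handle travels over MPI, never the raw pointer. */
    cudaIpcMemHandle_t handle;
    if (rank == 1) cudaIpcGetMemHandle(&handle, d_buf);
    MPI_Bcast(&handle, sizeof(handle), MPI_BYTE, 1, MPI_COMM_WORLD);

    if (rank == 0) {
        /* Map rank 1's allocation into rank 0's address space and read it. */
        void* peer_ptr = NULL;
        cudaIpcOpenMemHandle(&peer_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
        cudaMemcpy(&value, peer_ptr, sizeof(int), cudaMemcpyDefault);
        printf("rank 0 read %d from rank 1's GPU buffer\n", value);
        cudaIpcCloseMemHandle(peer_ptr);
    }

    MPI_Barrier(MPI_COMM_WORLD);               /* keep rank 1's buffer alive */
    cudaFree(d_buf);
    MPI_Finalize();
    return 0;
}

The key point is that cudaIpcOpenMemHandle returns a pointer that is only valid inside the importing process; the pointer value from the other rank means nothing there, which is why sending raw device pointers over MPI does not work.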

Thanks a lot, that looks like the solution, but I cannot make it work. The error is:

simple_text.cu(125) : cudaSafeCall() Runtime API error 11: invalid argument.

I'm posting the code here; perhaps somebody can help me:

#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include
#include <cuda_runtime.h>
//#include <cutil.h>
#include <cutil_inline.h>

#define BUFFSIZE 10000

#define MPI_SAFE_CALL( call ) {                                   \
    int errloc = call;                                            \
    if( errloc != MPI_SUCCESS ) {                                 \
        fprintf(stderr, "MPI error in file '%s' in line %i\n",    \
                __FILE__, __LINE__);                              \
        MPI_Finalize();                                           \
        exit(-1);                                                 \
    } }

/* Collect one message per rank on rank 0 and print them in rank order. */
void write2stdout(int myid, char* messg, MPI_Comm comm_cart, int numprocs){
    MPI_Status  status;
    MPI_Request request;
    int  i;
    char mess1[BUFFSIZE];

    MPI_SAFE_CALL( MPI_Isend( messg, BUFFSIZE, MPI_CHAR, 0, 0, comm_cart, &request) );
    if(myid==0) {
        for(i=0 ; i<numprocs ; i++){
            MPI_SAFE_CALL( MPI_Recv( mess1, BUFFSIZE, MPI_CHAR, i, 0, comm_cart, &status) );
            printf("%s", mess1);
        }
    }
    MPI_SAFE_CALL( MPI_Wait(&request, &status) );
}

int main( int argc, char* argv[] ){
    char buff[BUFFSIZE];
    int  i;
    int  pid=-1, np=-1;

    MPI_SAFE_CALL( MPI_Init(&argc, &argv) );
    MPI_SAFE_CALL( MPI_Comm_rank(MPI_COMM_WORLD, &pid) );
    MPI_SAFE_CALL( MPI_Comm_size(MPI_COMM_WORLD, &np) );

    if( np != 2 ) {
        printf("%s: exactly 2 processes required\n", argv[0]);
        MPI_Abort( MPI_COMM_WORLD, 1 );
        return 1;
    }
    if( argc != np+1 ) {
        printf("usage: %s device0 device1\n", argv[0]);
        MPI_Abort( MPI_COMM_WORLD, 1 );
        return 1;
    }

    int gpuID[2];
    for(i=0;i<2;i++){
        gpuID[i] = atoi(argv[i+1]);
    }

    int* memD[2];
    int  mem      = pid;
    int  mem_size = sizeof(int);

    /* Select the GPU for this rank (skip cudaSetDevice in exclusive mode). */
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, gpuID[pid]);
    if(deviceProp.computeMode==cudaComputeModeDefault) cutilSafeCall( cudaSetDevice(gpuID[pid]) );
    else{
        sprintf(buff,"pid %d, gpuId: %d:  Exclusive Mode, reassign gpuId\n", pid, gpuID[pid]);
        write2stdout(pid, buff, MPI_COMM_WORLD, np);
    }

    /* Allocate one int on the local GPU and initialise it with the rank. */
    cutilSafeCall( cudaMalloc((void**) &memD[pid], mem_size) );
    cutilSafeCall( cudaMemcpy( memD[pid], &mem, mem_size, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaGetDevice(&gpuID[pid]) );
    cudaGetDeviceProperties(&deviceProp, gpuID[pid]);
    sprintf(buff,"pid %d, gpuId: %d: %s, %d multiprocessors, %d warp size\n",
            pid, gpuID[pid], deviceProp.name, deviceProp.multiProcessorCount, deviceProp.warpSize);
    write2stdout(pid, buff, MPI_COMM_WORLD, np);

    /* Exchange device IDs and IPC memory handles between the two ranks. */
    cudaIpcMemHandle_t memHandle[2];
    cudaIpcGetMemHandle( &memHandle[pid], memD[pid] );
    for (i=0 ; i<2 ; i++){
        MPI_Bcast( &gpuID[i], 1, MPI_INT, i, MPI_COMM_WORLD );
        MPI_Bcast( &memHandle[i], sizeof(memHandle[i]), MPI_CHAR, i, MPI_COMM_WORLD );
    }

    /* Map the other rank's allocation into this process's address space. */
    for (i=0 ; i<2 ; i++){
        //if(pid!=i){
        cudaIpcOpenMemHandle( (void**) &memD[i], memHandle[i], cudaIpcMemLazyEnablePeerAccess );
        //}
        sprintf(buff,"pid %d: gpuId[i=%d]=%d: dat1[i=%d]=%p\n", pid, i, gpuID[i], i, memD[i]);
        write2stdout(pid, buff, MPI_COMM_WORLD, np);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Rank 0 enables peer access from its GPU to the other GPU. */
    int peerI=0;
    if(pid == 0) {
        cutilSafeCall( cudaDeviceCanAccessPeer(&peerI, gpuID[0], gpuID[1]) );
        if(peerI==1){
            printf("pid %d: Enable peer to %d\n", pid, pid+1);
            cutilSafeCall( cudaDeviceEnablePeerAccess(gpuID[1], 0) );
        }
        else{
            printf("pid %d: *** Can not enable peer to %d\n", gpuID[0], gpuID[1]);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Rank 0 copies the value from the remote GPU buffer into its own buffer.
       This is the call that fails with "invalid argument". */
    if(pid==0) {
        cutilSafeCall( cudaMemcpyPeer( memD[0], gpuID[0], memD[1], gpuID[1], mem_size ) );
    }

    cutilSafeCall( cudaMemcpy(&mem, memD[pid], mem_size, cudaMemcpyDeviceToHost) );
    sprintf(buff, "pid %d: mem=%d\n", pid, mem);
    write2stdout(pid, buff, MPI_COMM_WORLD, np);

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}

It works with

cutilSafeCall( cudaMemcpy( memD[0], memD[1], mem_size, cudaMemcpyDefault ) );

but not with

cudaMemcpyPeer( memD[0], gpuID[0], memD[1], gpuID[1], mem_size );

Why is that?

By the way, it only works if the two GPUs are attached to the same socket.
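
A small standalone check like the following (a hypothetical helper, not part of the program above) shows which device pairs report peer capability; pairs hanging off different sockets/IOHs typically report 0 here, which matches what I see.

/* Standalone topology check: print which device pairs report P2P capability. */
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    int n = 0;
    cudaGetDeviceCount(&n);
    for (int a = 0; a < n; ++a) {
        for (int b = 0; b < n; ++b) {
            if (a == b) continue;
            int can = 0;
            cudaDeviceCanAccessPeer(&can, a, b);   /* can device a access device b? */
            printf("GPU %d -> GPU %d : %s\n", a, b,
                   can ? "peer capable" : "not peer capable");
        }
    }
    return 0;
}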

OK, here is more information:

I had a similar program to yours, but in my case cudaIpcOpenMemHandle(…) returns cudaErrorMapBufferObjectFailed whenever

cutilSafeCall( cudaDeviceCanAccessPeer(&peerI, gpuID[0], gpuID[1]) )

returns false. Therefore I have to do the cudaDeviceCanAccessPeer test before opening the handle.

Has anyone else seen this problem?
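
In code, the guard I mean looks roughly like this (a sketch only; the function name and the host-staging fallback are placeholders I made up):

#include <cuda_runtime.h>
#include <stdio.h>

/* Sketch: only try to map the remote allocation if the two devices report
   peer capability; otherwise report it and let the caller stage via host.
   dev_local, dev_remote and the handle come from the caller. */
void* open_peer_mapping(cudaIpcMemHandle_t handle, int dev_local, int dev_remote)
{
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, dev_local, dev_remote);
    if (!can_access) {
        fprintf(stderr, "GPU %d cannot access GPU %d directly; "
                        "falling back to host staging\n", dev_local, dev_remote);
        return NULL;
    }
    void* peer_ptr = NULL;
    cudaError_t err = cudaIpcOpenMemHandle(&peer_ptr, handle,
                                           cudaIpcMemLazyEnablePeerAccess);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaIpcOpenMemHandle: %s\n", cudaGetErrorString(err));
        return NULL;
    }
    return peer_ptr;
}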