Thanks a lot, that looks like the solution but I can not make it work, it is
simple_text.cu(125) : cudaSafeCall() Runtime API error 11: invalid argument.
I post the code here, perhaps somebody can help me:
include <mpi.h>
include <stdlib.h>
include <stdio.h>
include <string.h>
include <math.h>
include
include <cuda_runtime.h>
//include <cutil.h>
include <cutil_inline.h>
define BUFFSIZE 10000
define MPI_SAFE_CALL( call) { \
int errloc = call; \
if( errloc != MPI_SUCCESS) { \
fprintf(stderr, "MPI error in file '%s' in line %i\n", \
__FILE__, __LINE__); \
MPI_Finalize(); \
exit(-1); \
} }
void write2stdout(int myid, char* messg, MPI_Comm comm_cart, int numprocs){
MPI_Status status;
MPI_Request request;
int i;
char mess1[BUFFSIZE];
MPI_SAFE_CALL( MPI_Isend( messg, BUFFSIZE, MPI_CHARACTER, 0, 0, comm_cart, &request) );
if(myid==0) {
for(i=0 ; i<numprocs ; i++){
MPI_SAFE_CALL( MPI_Recv( mess1 , BUFFSIZE, MPI_CHARACTER, i, 0, comm_cart, &status));
printf("%s",mess1);
}
}
MPI_SAFE_CALL( MPI_Wait(&request,&status) );
}
int main( int argc, char* argv ){
char buff[BUFFSIZE];
int i;
int pid=-1, np=-1;
MPI_SAFE_CALL( MPI_Init(&argc, &argv) );
MPI_SAFE_CALL( MPI_Comm_rank(MPI_COMM_WORLD, &pid));
MPI_SAFE_CALL( MPI_Comm_size(MPI_COMM_WORLD, &np));
if( np!=2) {
printf("exactly 2 processes\n",argv[0]);
MPI_Abort( MPI_COMM_WORLD, 1 );
return 1;
}
if( argc != np+1) {
printf("usage: %s device0 device1\n",argv[0]);
MPI_Abort( MPI_COMM_WORLD, 1 );
return 1;
}
int gpuID[2];
for(i=0;i<2;i++){
gpuID[i] = atoi(argv[i+1]);
}
int* memD[2];
int mem=pid;
int mem_size = sizeof(int);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpuID[pid]);
if(deviceProp.computeMode==cudaComputeModeDefault) cutilSafeCall(cudaSetDevice(gpuID[pid]));
else{
sprintf(buff,"pid %d, gpuId: %d: Exclusive Mode, reassign gpuId\n",pid, gpuID[pid]);
write2stdout(pid,buff,MPI_COMM_WORLD,np);
}
cutilSafeCall( cudaMalloc((void**) &memD[pid], mem_size) );
cutilSafeCall( cudaMemcpy( memD[pid], &mem, mem_size, cudaMemcpyHostToDevice) );
cutilSafeCall(cudaGetDevice(&gpuID[pid]));
cudaGetDeviceProperties(&deviceProp, gpuID[pid]);
sprintf(buff,“pid %d, gpuId: %d: %s, %d multiprocessors, %d warp size\n”,pid, gpuID[pid], deviceProp.name, deviceProp.multiProcessorCount, deviceProp.warpSize);
write2stdout(pid,buff,MPI_COMM_WORLD,np);
cudaIpcMemHandle_t memHandle[2];
cudaIpcGetMemHandle ( &memHandle[pid], memD[pid]);
for (i=0 ; i<2 ; i++){
MPI_Bcast( &gpuID[i], 1, MPI_INT, i , MPI_COMM_WORLD);
MPI_Bcast( &memHandle[i], sizeof(memHandle[i]), MPI_CHAR, i , MPI_COMM_WORLD);
}
for (i=0 ; i<2 ; i++){
//if(pid!=i){
cudaIpcOpenMemHandle ((void**) &memD[i], memHandle[i], cudaIpcMemLazyEnablePeerAccess);
//}
sprintf(buff,"pid %d: gpuId[i=%d]=%d: dat1[i=%d]=%p\n",pid, i, gpuID[i], i, memD[i]);
write2stdout(pid,buff,MPI_COMM_WORLD,np);
}
MPI_Barrier (MPI_COMM_WORLD);
int peerI=0;
if(pid == 0) {
cutilSafeCall( cudaDeviceCanAccessPeer(&peerI,gpuID[0],gpuID[1]));
if(peerI==1){
printf("pid %d: Enable peer to %d\n",pid,pid+1);
cutilSafeCall( cudaDeviceEnablePeerAccess(gpuID[1],0) );
}
else{
printf("pid %d: *** Can not Enable peer to %d\n",gpuID[0],gpuID[1]);
}
}
MPI_Barrier (MPI_COMM_WORLD);
if(pid==0) {
cutilSafeCall( cudaMemcpyPeer( memD[0], gpuID[0], memD[1], gpuID[1], mem_size ) );
}
cutilSafeCall( cudaMemcpy(&mem, memD[pid], mem_size,cudaMemcpyDeviceToHost) );
sprintf(buff, “pid %d: mem=%d\n”, pid, mem);
write2stdout(pid,buff,MPI_COMM_WORLD,np);
MPI_Barrier (MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}