This page helped me a lot, so I am adding a simple example that shows clearly how to use pointers to pointers in CUDA.
//////////////////////////////P2P.cu//////////////
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stddef.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math_functions.h>
// Aborts the program with a diagnostic if a CUDA runtime call failed.
//
//   err  - status returned by a CUDA runtime API call
//   file - source file of the failing call (normally __FILE__)
//   line - source line of the failing call (normally __LINE__)
//
// Returns normally only when err == cudaSuccess; otherwise prints the CUDA
// error string with its location and terminates with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        // Diagnostics go to stderr so they are not mixed into regular output.
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ),
                 file, line );
        exit( EXIT_FAILURE );
    }
}
// Wraps a CUDA runtime call so a failure is reported with the call site's
// file and line (the predefined __FILE__ / __LINE__ macros) and aborts.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Draws the next value from rand() and scales it by RAND_MAX, yielding a
// float in the closed range [0, 1]. Seeding is the caller's responsibility.
float Randomizer() {
    const float sample = (float)rand();
    const float scale  = (float)RAND_MAX;
    return sample / scale;
}
// Applies sinf() element-wise to a 2-D array addressed through a table of
// row pointers (pointer-to-pointer).
//
// Launch contract:
//   - grid : 1-D; blockIdx.x selects the row, so gridDim.x must equal the
//            number of row pointers stored in `in` and `out`.
//   - block: 1-D; threadIdx.x selects the element within the row, so
//            blockDim.x must not exceed the length of any row.
//   - dynamic shared memory: at least blockDim.x * sizeof(float) bytes.
//
// `in` and `out` are dereferenced inside the kernel, so both the pointer
// tables and the rows they point to must be device-accessible memory.
__global__ void kernelTestP2P(float **in, float **out)
{
    unsigned int bid = blockIdx.x;
    unsigned int tid = threadIdx.x;

    // Stage this thread block's row in dynamically sized shared memory.
    extern __shared__ float shmem[];
    shmem[tid] = in[bid][tid];
    __syncthreads();

    out[bid][tid] = sinf(shmem[tid]);
}
// Allocates `bytes` of host memory, terminating the program with a message on
// failure so callers never have to handle a NULL result.
static void *CheckedMalloc(size_t bytes) {
    void *p = malloc(bytes);
    if (p == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

// Demonstrates passing a pointer-to-pointer (a table of row pointers) to a
// CUDA kernel.
//
// Each 2-D array exists in three views:
//   *_hh : host table of HOST row pointers   (the data as seen by the CPU)
//   *_hd : host table of DEVICE row pointers (staging copy of the table)
//   *_dd : device table of DEVICE row pointers (what the kernel receives)
//
// B rows of N random floats are uploaded, the kernel writes sinf() of every
// element, and the result is written to "output.txt", one row per line.
// Returns 0 on success, EXIT_FAILURE if the output file cannot be opened;
// any CUDA or allocation failure terminates the program via exit().
int main(){
    const int B = 14;   // number of rows    == number of thread blocks
    const int N = 32;   // elements per row  == threads per block
    srand( (unsigned)time(NULL) );

    float **in_dd, **in_hd, **in_hh;
    // Allocate the pointer tables: B entries of type float*.
    in_hd = (float**)CheckedMalloc(sizeof(float*) * B);
    in_hh = (float**)CheckedMalloc(sizeof(float*) * B);
    // The device table holds pointers, so it is sized with sizeof(float*).
    HANDLE_ERROR(cudaMalloc((void**)&in_dd, sizeof(float*) * B));
    std::cout << "OK\n";

    // Allocate the rows: N floats each, on both host and device.
    for (int i = 0; i < B; ++i) {
        HANDLE_ERROR(cudaMalloc((void**)&in_hd[i], N * sizeof(float)));
        in_hh[i] = (float*)CheckedMalloc(N * sizeof(float)); // allocate or connect the input data to it
        std::generate_n(in_hh[i], N, Randomizer);
        HANDLE_ERROR(cudaMemcpy(in_hd[i], in_hh[i], sizeof(float) * N, cudaMemcpyHostToDevice));
    }
    // Upload the table of device row pointers so the kernel can dereference it.
    HANDLE_ERROR(cudaMemcpy(in_dd, in_hd, sizeof(float*) * B, cudaMemcpyHostToDevice));

    float **out_dd, **out_hd, **out_hh;
    out_hh = (float**)CheckedMalloc(sizeof(float*) * B);
    out_hd = (float**)CheckedMalloc(sizeof(float*) * B);
    HANDLE_ERROR(cudaMalloc((void**)&out_dd, sizeof(float*) * B));
    for (int i = 0; i < B; i++) {
        HANDLE_ERROR(cudaMalloc((void**)&out_hd[i], N * sizeof(float)));
        out_hh[i] = (float*)CheckedMalloc(N * sizeof(float)); // just allocate, NO input
    }
    HANDLE_ERROR(cudaMemcpy(out_dd, out_hd, B * sizeof(float*), cudaMemcpyHostToDevice));

    // One block per row, one thread per element, one shared float per thread.
    size_t shMemSize = N * sizeof(float);
    dim3 GridDim = dim3(B, 1, 1), BlockDim = dim3(N, 1, 1);
    kernelTestP2P<<<GridDim, BlockDim, shMemSize>>>(in_dd, out_dd);
    // A launch returns no status: query launch errors explicitly, then wait
    // for completion so execution errors are reported here.
    HANDLE_ERROR(cudaGetLastError());
    HANDLE_ERROR(cudaDeviceSynchronize());

    // Download every result row, then release the device rows.
    for (int i = 0; i < B; i++) {
        HANDLE_ERROR(cudaMemcpy(out_hh[i], out_hd[i], sizeof(float) * N, cudaMemcpyDeviceToHost));
        HANDLE_ERROR(cudaFree(in_hd[i]));
        HANDLE_ERROR(cudaFree(out_hd[i]));
    }
    HANDLE_ERROR(cudaFree(in_dd));
    HANDLE_ERROR(cudaFree(out_dd));

    int status = 0;
    std::ofstream f_out;
    f_out.open("output.txt");
    if (!f_out.is_open()) {
        std::cerr << "could not open output.txt for writing\n";
        status = EXIT_FAILURE;
    }
    for (int i = 0; i < B; i++) {
        if (f_out.is_open()) {
            for (int j = 0; j < N; j++) f_out << out_hh[i][j] << " ";
            f_out << "\n";
        }
        free(out_hh[i]);
        free(in_hh[i]);
    }
    free(out_hh);
    free(in_hh);
    // The host-side staging tables of device pointers are plain malloc memory.
    free(out_hd);
    free(in_hd);
    return status;
}