I have the below code.
I tried in “normal” C first, and then I’m trying to transfer it to CUDA.
When i is 1 (the second pass through the loop), the program segfaults. I don’t understand why.
Perhaps I could gain more information from cuda-gdb, but I’m not sure how.
[codebox]int main() {
// Tries to build an array of 3 device pointers, each meant to refer to a 10-int device buffer,
// and hand that array to TehKernel.
int **ptrArray;
//ptrArray = (int**)malloc(3 * sizeof(int*));
// NOTE(review): ptrArray is still uninitialized here, and its value (not its address) is cast to
// void**, so cudaMalloc is asked to store the new device address at an indeterminate location.
// A later reply in this thread gives the corrected call, which passes &ptrArray.
cudaError_t error = cudaMalloc( (void**) ptrArray, 3 * sizeof(int*));
for(int i = 0; i < 3; i++) {
//ptrArray[i] = (int*)malloc(10 * sizeof(int));
// NOTE(review): ptrArray[i] is evaluated by the host. As a reply in this thread points out,
// ptrArray is meant to hold a device address, so reading through it on the CPU is not valid.
error = cudaMalloc( (void**) ptrArray[i], 10 * sizeof(int));
}
// error is overwritten by each cudaMalloc call and never inspected.
TehKernel<<<1, 1>>>(ptrArray);
}[/codebox]
cuda-gdb output:
[codebox][Thread debugging using libthread_db enabled]
[New process 30521]
[New Thread -1211263280 (LWP 30521)]
[New Thread -1215595632 (LWP 30524)]
[New Thread -1215861872 (LWP 30525)]
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread -1211263280 (LWP 30521)]
0xb7f76773 in cudaMalloc () from /opt/cuda/lib/libcudart.so.2[/codebox]
I have the below code.
I tried in “normal” C first, and then I’m trying to transfer it to CUDA.
When i is 1 (the second pass through the loop), the program segfaults. I don’t understand why.
Perhaps I could gain more information from cuda-gdb, but I’m not sure how.
[codebox]int main() {
// Tries to build an array of 3 device pointers, each meant to refer to a 10-int device buffer,
// and hand that array to TehKernel.
int **ptrArray;
//ptrArray = (int**)malloc(3 * sizeof(int*));
// NOTE(review): ptrArray is still uninitialized here, and its value (not its address) is cast to
// void**, so cudaMalloc is asked to store the new device address at an indeterminate location.
// A later reply in this thread gives the corrected call, which passes &ptrArray.
cudaError_t error = cudaMalloc( (void**) ptrArray, 3 * sizeof(int*));
for(int i = 0; i < 3; i++) {
//ptrArray[i] = (int*)malloc(10 * sizeof(int));
// NOTE(review): ptrArray[i] is evaluated by the host. As a reply in this thread points out,
// ptrArray is meant to hold a device address, so reading through it on the CPU is not valid.
error = cudaMalloc( (void**) ptrArray[i], 10 * sizeof(int));
}
// error is overwritten by each cudaMalloc call and never inspected.
TehKernel<<<1, 1>>>(ptrArray);
}[/codebox]
Hi!
ptrArray points to device memory. This means that the value of ptrArray must be interpreted as an address in device memory. But
error = cudaMalloc( (void**) ptrArray[i], 10 * sizeof(int));
is executed in host code. So the dereference in ptrArray[i] is done on the CPU, and the contents of host memory at address (ptrArray + i) are undefined.
[quote name=‘navier-stokes’ post=‘531319’ date=‘Apr 17 2009, 01:44 PM’]
Hi!
ptrArray points to device memory. This means that the value of ptrArray must be interpreted as an address in device memory. But …[/quote]
[codebox] int **ptrArray;
// Second attempt: allocate three 10-int device buffers separately, gather their addresses in a
// host-side table (ptrS), then copy that table into a device-side pointer array (ptrArray).
// NOTE(review): ptrArray is passed by value while still uninitialized; the corrected call posted
// later in this thread passes &ptrArray.
cudaError_t error = cudaMalloc( (void**) ptrArray, 3 * sizeof(int*));
printf(“%d\n”, error);
int *ptrA, *ptrB, *ptrC;
// NOTE(review): the same pattern repeats for ptrA, ptrB and ptrC. Each is uninitialized and is
// passed by value rather than as &ptrA / &ptrB / &ptrC, so cudaMalloc cannot update them.
error = cudaMalloc( (void**) ptrA, 10 * sizeof(int));
printf(“%d\n”, error);
error = cudaMalloc( (void**) ptrB, 10 * sizeof(int));
printf(“%d\n”, error);
error = cudaMalloc( (void**) ptrC, 10 * sizeof(int));
printf(“%d\n”, error);
// Host-side table holding the three buffer addresses.
int *ptrS[3];
ptrS[0] = ptrA;
ptrS[1] = ptrB;
ptrS[2] = ptrC;
// Copies the 3-entry table from the host (ptrS, source) to the device (ptrArray, destination).
// The return value of cudaMemcpy is not checked.
cudaMemcpy(ptrArray, ptrS, 3 * sizeof(int*), cudaMemcpyHostToDevice);[/codebox]
My thinking was that if I allocate four arrays and then copy the addresses of three of them into the fourth, I could achieve what I wanted (which is passing several arrays to the kernel without ending up with a huge parameter list).
However, I still segfault on the second cudaMalloc.
Something else I can’t quite figure out is why gdb claims there is no variable ptrA in the current context.
Never mind, my mallocs were wrong >_<
[codebox]// Debug kernel: prints the three pointer values stored in DArr, one per line.
// DArr must be readable from device code and hold at least 3 int* entries.
// Every launched thread runs the whole loop, so a <<<1, 1>>> launch gives one copy of the output.
// The __global__ qualifier (reduced to "global" by the forum markup) is what makes this a kernel.
__global__ static void TehKernel(int **DArr) {
    for (int i = 0; i < 3; i++)
        printf("%p\n", DArr[i]);
}
// Allocates device memory for `size` elements of type T and returns the device pointer.
//
// T cannot be deduced from the argument, so callers name it explicitly, e.g.
// SimpleCudaMalloc<int>(1024). (The "<typename T>" parameter list had been stripped from the
// posted snippet by the forum markup.)
//
// Any cudaMalloc failure trips the assert, not only cudaErrorMemoryAllocation: an earlier
// sticky error or an invalid request would otherwise go unnoticed. In builds where assert is
// compiled out (NDEBUG), NULL is returned on failure.
template <typename T>
T* SimpleCudaMalloc(const size_t size) {
    T* devicePtr = NULL;
    // cudaMalloc stores the new device address through its first argument, hence &devicePtr.
    const cudaError_t error = cudaMalloc((void**)&devicePtr, size * sizeof(T));
    if (error != cudaSuccess) {
        //cout << "CudaMalloc error: " << error << " (" << cudaGetErrorString(error) << ")" << ". Exiting" << endl;
        assert(false);
        return NULL;
    }
    return devicePtr;
}
// Third attempt: allocate three device buffers (T, V, A), record their addresses in a host-side
// table (HArr), print them, copy the table into a device-side pointer array (DArr), and launch
// TehKernel on DArr. Nothing allocated here is released before main returns.
int main() {
const int popSize = 1024;
int *T, *V, *A;
// NOTE(review): SimpleCudaMalloc's element type cannot be deduced from a size argument; the
// explicit template argument (presumably <int>) looks to have been stripped by the forum markup.
T = SimpleCudaMalloc(popSize);
V = SimpleCudaMalloc(popSize);
A = SimpleCudaMalloc(popSize);
int **DArr, **HArr;
// DArr: device-side array of 3 int* entries. HArr: host-side staging table of the same size.
DArr = SimpleCudaMalloc<int*>(3);
HArr = (int**)malloc(3 * sizeof(int*));
HArr[0] = T;
HArr[1] = V;
HArr[2] = A;
for(int i = 0; i < 3; i++)
// Host-side print of the device addresses, for comparison with what the kernel prints.
printf("%p\n", HArr[i]);
// NOTE(review): cudaMemcpy takes the destination first. With cudaMemcpyHostToDevice this call
// names the host table HArr as destination and the device array DArr as source, so DArr never
// receives the three addresses. That likely explains the (nil) output the poster reports.
// The return value is not checked, so the failure is silent.
cudaMemcpy(HArr, DArr, 3 * sizeof(int*), cudaMemcpyHostToDevice);
TehKernel<<<1, 1>>>(DArr);
// NOTE(review): the typographic quotes below were probably introduced by the forum software;
// a compiler needs plain double quotes.
printf(“\n”);
}[/codebox]
The output when printing from the Kernel is just (nil).
Never mind, my mallocs were wrong >_<
[codebox]// Debug kernel: prints the three pointer values stored in DArr, one per line.
// DArr must be readable from device code and hold at least 3 int* entries.
// Every launched thread runs the whole loop, so a <<<1, 1>>> launch gives one copy of the output.
// The __global__ qualifier (reduced to "global" by the forum markup) is what makes this a kernel.
__global__ static void TehKernel(int **DArr) {
    for (int i = 0; i < 3; i++)
        printf("%p\n", DArr[i]);
}
// Allocates device memory for `size` elements of type T and returns the device pointer.
//
// T cannot be deduced from the argument, so callers name it explicitly, e.g.
// SimpleCudaMalloc<int>(1024). (The "<typename T>" parameter list had been stripped from the
// posted snippet by the forum markup.)
//
// Any cudaMalloc failure trips the assert, not only cudaErrorMemoryAllocation: an earlier
// sticky error or an invalid request would otherwise go unnoticed. In builds where assert is
// compiled out (NDEBUG), NULL is returned on failure.
template <typename T>
T* SimpleCudaMalloc(const size_t size) {
    T* devicePtr = NULL;
    // cudaMalloc stores the new device address through its first argument, hence &devicePtr.
    const cudaError_t error = cudaMalloc((void**)&devicePtr, size * sizeof(T));
    if (error != cudaSuccess) {
        //cout << "CudaMalloc error: " << error << " (" << cudaGetErrorString(error) << ")" << ". Exiting" << endl;
        assert(false);
        return NULL;
    }
    return devicePtr;
}
// Third attempt: allocate three device buffers (T, V, A), record their addresses in a host-side
// table (HArr), print them, copy the table into a device-side pointer array (DArr), and launch
// TehKernel on DArr. Nothing allocated here is released before main returns.
int main() {
const int popSize = 1024;
int *T, *V, *A;
// NOTE(review): SimpleCudaMalloc's element type cannot be deduced from a size argument; the
// explicit template argument (presumably <int>) looks to have been stripped by the forum markup.
T = SimpleCudaMalloc(popSize);
V = SimpleCudaMalloc(popSize);
A = SimpleCudaMalloc(popSize);
int **DArr, **HArr;
// DArr: device-side array of 3 int* entries. HArr: host-side staging table of the same size.
DArr = SimpleCudaMalloc<int*>(3);
HArr = (int**)malloc(3 * sizeof(int*));
HArr[0] = T;
HArr[1] = V;
HArr[2] = A;
for(int i = 0; i < 3; i++)
// Host-side print of the device addresses, for comparison with what the kernel prints.
printf("%p\n", HArr[i]);
// NOTE(review): cudaMemcpy takes the destination first. With cudaMemcpyHostToDevice this call
// names the host table HArr as destination and the device array DArr as source, so DArr never
// receives the three addresses. That likely explains the (nil) output the poster reports.
// The return value is not checked, so the failure is silent.
cudaMemcpy(HArr, DArr, 3 * sizeof(int*), cudaMemcpyHostToDevice);
TehKernel<<<1, 1>>>(DArr);
// NOTE(review): the typographic quotes below were probably introduced by the forum software;
// a compiler needs plain double quotes.
printf(“\n”);
}[/codebox]
The output when printing from the Kernel is just (nil).
And I got that figured out too ^^
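Again my copying was wrong: cudaMemcpy takes the destination first, so the call has to be cudaMemcpy(DArr, HArr, 3 * sizeof(int*), cudaMemcpyHostToDevice). Gonna stick to my templates next time ;)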
Must be:
// Corrected allocation: cudaMalloc receives the address of the pointer (&ptrArray), so it can
// store the newly allocated device address in ptrArray.
int **ptrArray;
cudaError_t error = cudaMalloc( (void**)&ptrArray, 3 * sizeof(int*));
// Prints the numeric cudaError_t code returned by cudaMalloc.
printf("%d\n", error);
hdinh
April 21, 2009, 10:08pm
7
Hi Letharion,
Would you mind sharing your working code? I am also trying to copy 2D arrays using cudaMemcpy2D, but my code is not working. It’s very frustrating.
Dinh
EDIT: Got it to work… Why didn’t you use cudaMemcpy2D like this?
// NOTE(review): this copies 3 rows of popSize*sizeof(int) bytes from HArr to DArr, with a pitch of
// popSize*sizeof(int) on both sides. That requires DArr and HArr to be contiguous buffers of at
// least 3*popSize ints. It would not fit the 3-entry pointer tables named DArr/HArr earlier in
// this thread; confirm which definitions this poster is using.
cudaMemcpy2D(DArr, popSize*sizeof(int), HArr, popSize*sizeof(int), popSize*sizeof(int), 3, cudaMemcpyHostToDevice);