Hi all
For a school project I am trying to use CUDA to implement a DSP algorithm. I have been developing my program in emulation mode and everything was working as expected, but when I run it in non-emulation mode, the values in the memory I am trying to process remain unchanged. I tried a bunch of things and none of them seemed to work, so I decided to try something simple and see if it works, but to my surprise, even the simplest program won't work the way I think it should.
Here is my test program. In emulation mode the program works correctly and prints ten 1s. However, in non-emulation mode the values of the array are unchanged: it prints ten 0s.
[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG -deviceemu
[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out
1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,
[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG
[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out
0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,
How could this be? What am I missing here?
/////////////test.cu/////////////
#include <stdio.h>
#include <cutil.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
// Number of elements in the test array; used by both the device loop in
// test_kernel and the host-side allocation/copy code in main.
#define SIZE 10
// Device helper: stores 1.0 into each of the first SIZE elements of x.
// The loop is purely sequential; it does not use threadIdx/blockIdx, so every
// thread that calls it writes the same SIZE elements.
__device__ void test_kernel(double* x){
    double* cursor = x;
    double* const stop = x + SIZE;
    while (cursor != stop) {
        *cursor = 1.0;
        ++cursor;
    }
}
// Kernel entry point: forwards the device pointer x to test_kernel, which
// fills the first SIZE elements with 1.0. x must point to device memory
// holding at least SIZE doubles.
__global__ void testD(double* x)
{
    test_kernel(x);
}
// Prints the first `size` elements of the host array x to stdout on a single
// line, each formatted with "%f" and followed by a comma, then a newline.
void print(double* x, int size){
    int idx = 0;
    while (idx < size) {
        printf("%f,", x[idx]);
        ++idx;
    }
    printf("\n");
}
// Host entry point: zero-fills a SIZE-element host array, copies it to the
// device, launches testD with a single thread in a single block, copies the
// result back to the host and prints it.
//
// NOTE(review): the buffer and kernel use double. GPUs below compute
// capability 1.3 (which includes the G80-based 8800 GTX) have no
// double-precision support, and nvcc demotes double to float in device code
// for such targets, so the device-side stores will not match the host's
// 8-byte element layout. Confirm the target hardware supports double, or
// switch the buffer and kernel to float.
__host__ int main(int argc, char **argv) {
    // SIZE elements are written and copied below, so allocate an array of
    // SIZE doubles; this also pairs correctly with the delete[] at the end.
    double* x = new double[SIZE];
    for (int i = 0; i < SIZE; i++) {
        x[i] = 0;
    }

    double* xD;
    unsigned int mem_size = SIZE * sizeof(double);
    CUDA_SAFE_CALL(cudaMalloc((void**) &xD, mem_size));
    CUDA_SAFE_CALL(cudaMemcpy(xD, x, mem_size, cudaMemcpyHostToDevice));

    dim3 grid(1, 1, 1);
    dim3 thread(1, 1, 1);
    testD<<<grid, thread>>>(xD);

    // Wait for the kernel to finish using the runtime API (the rest of this
    // program uses the runtime API, not the driver API) and check the result
    // instead of discarding it.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("Kernel execution failed");

    CUDA_SAFE_CALL(cudaMemcpy(x, xD, mem_size, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(xD));

    print(x, SIZE);
    delete[] x;
    return 0;
}
I am on a 32-bit Ubuntu box with CUDA SDK 1.0 and two 8800 GTX cards in SLI.