This is driving me nuts! memory access problem..

Hi all

For a school project I am trying to use CUDA to implement a DSP algorithm. I’ve been developing my program in emulation mode and everything was working as expected, but when I run it in non-emulation mode, the values in the memory I am trying to process remain unchanged. I tried a bunch of things and none seemed to work, so I decided to try something simple to see if it works, but to my surprise, even the simplest program won’t work the way I think it should.

Here is my test program. In emulation mode the program works correctly and prints ten 1s. However, in non-emulation mode the values of the array are unchanged: it prints ten 0s.

[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG -deviceemu

[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out

1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,

[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG

[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out

0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,

How could this be? What am I missing here?

/////////////test.cu/////////////

#include <stdio.h>

#include <cutil.h>

#include <cuda.h>

#include <cuda_runtime_api.h>

#define SIZE 10

// Device helper: stores 1.0 into each of the first SIZE elements of x.
// No thread or block index is consulted, so every thread that calls this
// writes the same SIZE elements.
__device__ void test_kernel(double* x)
{
  int idx = 0;
  while (idx < SIZE) {
    x[idx] = 1.0;
    ++idx;
  }
}

// Kernel entry point: hands the pointer x straight to test_kernel, which
// performs the writes. Uses no thread/block indices itself.
__global__ void testD(double* x)
{
  test_kernel(x);
}

// Writes the first `size` elements of x to stdout on a single line, each
// formatted with "%f" and followed by a comma, then ends the line.
void print(double* x, int size)
{
  int k = 0;
  while (k < size) {
    printf("%f,", x[k]);
    ++k;
  }
  printf("\n");
}

// Host driver: zero-fills a SIZE-element host array, copies it to the device,
// launches testD with a single thread, copies the buffer back and prints it.
// argc/argv are accepted but not used.
__host__ int main(int argc, char **argv) {
    // Array-new with SIZE elements: the loop below writes SIZE entries and the
    // buffer is released with delete[], so a scalar `new double` would
    // overflow the heap and mismatch the deallocation.
    double* x = new double[SIZE];
    for (int i = 0; i < SIZE; i++) {
        x[i] = 0;
    }

    // NOTE(review): the element type is double on both host and device here.
    // On GPUs without native double support the device compiler may treat
    // double as float, so host and device can disagree on element size --
    // confirm the target hardware or switch the whole program to float.
    double* xD;
    const size_t mem_size = SIZE * sizeof(double);

    CUDA_SAFE_CALL(cudaMalloc((void**) &xD, mem_size));
    CUDA_SAFE_CALL(cudaMemcpy(xD, x, mem_size, cudaMemcpyHostToDevice));

    // One block containing one thread.
    dim3 grid(1,1,1);
    dim3 thread(1,1,1);
    testD<<<grid, thread>>>(xD);

    // Wait for the kernel to finish. The rest of this program uses the runtime
    // API, so the runtime-API synchronize call is used (not the driver-API
    // cuCtxSynchronize) and its return value is checked.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("Kernel execution failed");

    CUDA_SAFE_CALL(cudaMemcpy(x, xD, mem_size, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(xD));

    print(x, SIZE);

    delete[] x;
    return 0;
}

I am on a 32-bit Ubuntu box with CUDA SDK 1.0 and two 8800 GTX cards in SLI.

I don’t see anything obviously wrong except that double is not yet supported in CUDA. The first thing to do is rework the app to be “float clean” and see if that fixes your problem.

Wow! Thank you, that’s exactly what the problem was. I had no idea that CUDA doesn’t support double. I changed everything to float and it works.

Damn it, if double is not supported then nvcc should give some warnings.

Double should be supported by nvcc, but computations are done with float since there is no hardware supporting double precision so far (that should come out soon though). So in theory nvcc should convert all your doubles into floats, I believe. At least, I thought that was how it worked.

I don’t know why it was causing you some issues… Maybe someone can answer that question.

Given the fact that the simple program above won’t work, I am going to guess that it does not.

nvcc will squash double to float on the device. On the host side, it will hand the code to the native host compiler, which will treat double as double.

So, you need to be careful with the allocation on the host side.

The following code works, but I strongly suggest that you write your code to be float safe on this generation of GPU.

/////////////test.cu/////////////

#include <stdio.h>

#include <cutil.h>

#define SIZE 10

// Device helper: assigns 1.0 to x[0] .. x[SIZE-1].
// The loop is independent of thread/block indices; each calling thread
// performs all SIZE stores.
__device__ void test_kernel(double* x)
{
  int remaining = SIZE;
  int pos = 0;
  while (remaining > 0) {
    x[pos] = 1.0;
    ++pos;
    --remaining;
  }
}

// Kernel entry point: delegates to test_kernel with the same pointer.
// Reads no thread/block indices of its own.
__global__ void testD(double* x)
{
  test_kernel(x);
}

// Prints x[0] .. x[size-1] to stdout as "%f," items on one line, followed by
// a newline.
void print(float* x, int size)
{
  int k = 0;
  while (k < size) {
    printf("%f,", x[k]);
    ++k;
  }
  printf("\n");
}

// Host driver: zero-fills a SIZE-element float array, copies it to the device,
// launches testD with a single thread on that buffer, copies the buffer back
// and prints it. argc/argv are accepted but not used.
__host__ int main(int argc, char **argv) {
    // Array-new with SIZE elements: the loop below writes SIZE entries and the
    // buffer is released with delete[], so a scalar `new float` would overflow
    // the heap and mismatch the deallocation.
    float* x = new float[SIZE];
    for (int i = 0; i < SIZE; i++) {
        x[i] = 0;
    }

    float* xD;
    const size_t mem_size = SIZE * sizeof(float);

    CUDA_SAFE_CALL(cudaMalloc((void**) &xD, mem_size));
    CUDA_SAFE_CALL(cudaMemcpy(xD, x, mem_size, cudaMemcpyHostToDevice));

    // One block containing one thread.
    dim3 grid(1,1,1);
    dim3 thread(1,1,1);

    // NOTE(review): the buffer holds SIZE floats but is passed as double*.
    // This only stays in bounds if the device compiler demotes double to
    // float; with native device-side double the kernel would write
    // SIZE * sizeof(double) bytes into a SIZE * sizeof(float) allocation.
    // Confirm the target hardware, or make the kernel take float*.
    testD<<<grid, thread>>>((double *)xD);

    // Wait for the kernel to finish and check the result of the wait.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    CUT_CHECK_ERROR("Kernel execution failed");

    CUDA_SAFE_CALL(cudaMemcpy(x, xD, mem_size, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(xD));

    print(x, SIZE);

    delete[] x;
    return 0;
}