This is driving me nuts! memory access problem..

zl25drexel · December 6, 2007, 3:26am

Hi all

For a school project I am trying to use CUDA to implement a DSP algorithm. I’ve been developing my program in emulation mode and everything was working as expected, but when I run it in non-emulation mode, the values in the memory I am trying to process remain unchanged. I tried a bunch of things and none seemed to work, so I decided to do something simple and see if it works, but to my surprise, even the simplest program won’t work the way I think it should.

Here is my test program. In emulation the program works correctly and prints 10 1s. However, in non-emulation mode the values of the array are unchanged and it prints 10 0s.

[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG -deviceemu

[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out

1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,

[~/NVIDIA_CUDA_SDK/projects/test]$ nvcc test.cu -I/usr/local/cuda/include -I../../common/inc -DUNIX -O3 -D_DEBUG

[~/NVIDIA_CUDA_SDK/projects/test]$ ./a.out

0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,

How could this be? What am I missing here?

/////////////test.cu/////////////

#include <stdio.h>

#include <cutil.h>

#include <cuda.h>

#include <cuda_runtime_api.h>

#define SIZE 10

// Device-side helper: stores 1.0 into each of the first SIZE elements of x.
// x must point to at least SIZE writable elements.
__device__ void test_kernel(double* x){
    double* const last = x + SIZE;
    for(double* cur = x; cur != last; ++cur){
        *cur = 1.0;
    }
}

// Kernel entry point: forwards the buffer to test_kernel. Every launched
// thread runs the full fill, so the host launches it with a single thread.
__global__ void testD(double* x)
{
    test_kernel(x);
}

// Prints the first `size` values of x on one line, each followed by a comma,
// then terminates the line.
void  print(double* x, int size){
    int idx = 0;
    while(idx < size){
        printf("%f,", x[idx]);
        ++idx;
    }
    printf("\n");
}

// Host driver: zero-fills a SIZE-element host array, copies it to the device,
// runs testD with a single thread, copies the result back and prints it.
__host__ int main(int argc, char **argv) {
    // Array form of new: the loop, both copies and delete[] below all operate
    // on SIZE elements, so a single-element `new double` would be overrun.
    double* x = new double[SIZE];
    for(int i=0; i< SIZE; i++){
      x[i] = 0;
    }

    double* xD;
    unsigned int mem_size =SIZE*sizeof(double);
    CUDA_SAFE_CALL(cudaMalloc((void**) &xD, mem_size));
    CUDA_SAFE_CALL(cudaMemcpy(xD, x, mem_size, cudaMemcpyHostToDevice) );

    // One block of one thread: the kernel itself loops over all SIZE elements.
    dim3 grid(1,1,1);
    dim3 thread(1,1,1);
    testD<<<grid, thread>>>(xD);
    CUT_CHECK_ERROR("Kernel execution failed");

    // Wait for the kernel through the runtime API, like every other cuda*
    // call in this program, and check the result instead of discarding it.
    CUDA_SAFE_CALL(cudaThreadSynchronize());

    CUDA_SAFE_CALL(cudaMemcpy(x, xD, mem_size, cudaMemcpyDeviceToHost) );
    CUDA_SAFE_CALL(cudaFree(xD));

    print(x, SIZE);

    delete[] x;
    return 0;
}

I am on a 32bits ubuntu box with CUDA_SDK 1.0 and 2 8800GTX cards in SLI

nwilt · December 6, 2007, 3:44am

I don’t see anything obviously wrong except that double is not yet supported in CUDA. The first thing to do is rework the app to be “float clean” and see if that fixes your problem.

zl25drexel · December 6, 2007, 3:57am

Wow! Thank you, that’s exactly what the problem was. I had no idea that CUDA doesn’t support double. I changed everything to float and it works.

damn it if double is not supported then the nvcc should give some warnings

Morph208 · December 6, 2007, 5:12am

Double should be supported by nvcc, but computations are done in float since there is no hardware supporting double precision so far (that should come out soon though). So in theory nvcc should convert all your doubles to floats, I believe. At least I thought it worked like this.

I don’t know why it was causing you some issues… Maybe someone can answer to that question.

zl25drexel · December 7, 2007, 2:22am

Given the fact that the simple program above won’t work, I am going to guess that it does not.

mfatica · December 7, 2007, 2:43am

nvcc will squash double to float on the device. On the host side, it will hand the code to the native host compiler, which will treat double as double.

So, you need to be careful with the allocation on the host side.

The following code works, but I strongly suggest that you write your code to be float safe on this generation of GPU.

/////////////test.cu/////////////

#include <stdio.h>

#include <cutil.h>

#define SIZE 10

// Device-side helper: sets each of the first SIZE entries of x to 1.0.
// NOTE(review): the host passes a float buffer cast to double*; per the
// accompanying post this relies on nvcc demoting double to float on the
// device for this GPU generation - confirm before building for hardware
// with native double support.
__device__ void test_kernel(double* x){
    int k = 0;
    while(k < SIZE){
        x[k] = 1.0;
        ++k;
    }
}

// Kernel entry point: hands the buffer straight to test_kernel. Each launched
// thread performs the whole fill; the host launches exactly one thread.
__global__ void testD(double* x)
{
    test_kernel(x);
}

// Writes the first `size` values of x to stdout, each followed by a comma,
// and finishes with a newline.
void  print(float* x, int size){
    const float* cur = x;
    for(int remaining = size; remaining > 0; --remaining){
        printf("%f,", *cur);
        ++cur;
    }
    printf("\n");
}

// Host driver: zero-fills a SIZE-element float array, copies it to the device,
// runs testD with a single thread, copies the result back and prints it.
__host__ int main(int argc, char **argv) {
   // Array form of new: SIZE elements are written, copied and later released
   // with delete[], so a single-element `new float` would be overrun.
   float* x = new float[SIZE];
   for(int i=0; i< SIZE; i++){
     x[i] = 0;
   }

   float* xD;
   unsigned int mem_size =SIZE*sizeof(float);
   CUDA_SAFE_CALL(cudaMalloc((void**) &xD, mem_size));
   CUDA_SAFE_CALL(cudaMemcpy(xD, x, mem_size, cudaMemcpyHostToDevice) );

   // One block of one thread: the kernel loops over all SIZE elements itself.
   dim3 grid(1,1,1);
   dim3 thread(1,1,1);

   // NOTE(review): the device buffer holds SIZE floats but is passed as
   // double*. This is only in bounds if device-side double is demoted to
   // float, as the accompanying post describes - confirm for the target GPU.
   testD<<<grid, thread>>>((double *)xD);
   CUT_CHECK_ERROR("Kernel execution failed");

   // Wait for the kernel to finish and check the result of the wait.
   CUDA_SAFE_CALL(cudaThreadSynchronize());

   CUDA_SAFE_CALL(cudaMemcpy(x, xD, mem_size, cudaMemcpyDeviceToHost) );
   CUDA_SAFE_CALL(cudaFree(xD));

   print(x, SIZE);

   delete[] x;
   return 0;
}

Topic		Replies	Views
problem with double precision unpredictable results Different run give differents errors or no error CUDA Programming and Performance	12	2815	September 10, 2010
Double precision in CUDA 2.3 CUDA Programming and Performance	5	38172	March 5, 2010
1080 does not support doubles? CUDA Programming and Performance	5	556	October 30, 2018
Strange change in behaviour between float and double CUDA Programming and Performance	6	1312	April 1, 2009
Help with strange error CUDA Programming and Performance	8	2102	February 25, 2010
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	362	December 8, 2020
Cuda code performance CUDA Programming and Performance	14	3167	December 16, 2014
float / double issue CUDA Programming and Performance	12	22120	December 31, 2010
Another "results different from emulation and GPU" Data-parallel, reading and writing to g CUDA Programming and Performance	6	3838	May 5, 2009
Switch oddities Compiler bug? CUDA Programming and Performance	16	4562	September 10, 2008

This is driving me nuts! memory access problem..

Related topics