CUDA Runtime API error 74: misaligned address

xav12358 · November 10, 2016, 9:47am

Hello,

I am working on shared memory and the program send me that error.
Here is my program:

////////////////////////////////////////////////////////////////////////////////////
    /// Local copy of the Image +size of the kernel and the kernel coordinate
    ///
    __shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ int16_t LocalKernel[kernelSize_X*kernelSize_Y];



    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[ptrk] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;


    if( xglobal < iMidX ||  yglobal < iMidY)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
    }

    if( xglobal >= w-kernelSize_X/2 ||  yglobal >= h-kernelSize_Y/2)
    {
        // Change the output pixel to 0
        u8_ImageOut[xglobal+yglobal*w] =  0;
        return;
    }

    if( xglobal > w-kernelSize_X ||  yglobal > h-kernelSize_Y)
    {
        return;
    }


    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 0)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  0;
        return;
    }



    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    int ptrl_Kernel = 0;//threadIdx.x + (threadIdx.y) *(kernelSize_X);

//        printf("ptrl_Image %d %d %d  \n",threadIdx.x,threadIdx.y,ptrl_Image);
    u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);



    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {
            tmpAND = *ptLocalImage & *ptLocalKernel;
            valMin = min(valMin,tmpAND);
        }
        ptrl_Image+=(BLOCK_SIZE_X+kernelSize_X);
//        / ptLocal = (u_int32_t *)(&LocalImage[ptrl2]);
    }

The error seems to come from these two lines:

u_int32_t *ptLocalImage     = (u_int32_t *)(&LocalImage[ptrl_Image]);
    u_int32_t *ptLocalKernel    = (u_int32_t *)(&LocalImage[ptrl_Kernel]);

Can someone explain me why?

cbuchner1 · November 10, 2016, 10:07am

Because u_int8_t and int16_t arrays like LocalImage and LocalKernel are not usually aligned at 4 byte boundaries.

You can try forcing the appropriate alignment. But even then ptrl_Image and ptrl_Kernel must be multiples of 4 for this to work.

Christian

xav12358 · November 10, 2016, 10:14am

Thank you for the answer.

Even the shared memory had to be aligned? And what about the simple local array?
If ptLocalKernel is a local array should it be align?

tera · November 10, 2016, 12:12pm

Yes, all memory accesses need alignment, regardless of memory space.

xav12358 · November 10, 2016, 1:25pm

Thank you.

So I modify my kernel like this:

__shared__ u_int8_t LocalImage[(BLOCK_SIZE_X+kernelSize_X)*(BLOCK_SIZE_Y+kernelSize_Y)];
    __shared__ u_int8_t LocalKernel[kernelSize_X][kernelSize_Y];


    int x = blockIdx.x*blockDim.x;
    int y = blockIdx.y*blockDim.y;

    if( (blockIdx.x*blockDim.x +threadIdx.x)  >= w  || (blockIdx.y*blockDim.y +threadIdx.y)  >= h)
    {
        return;
    }

    for(int i=threadIdx.x;i<BLOCK_SIZE_X+kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<BLOCK_SIZE_Y+kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            LocalImage[i+j*(BLOCK_SIZE_X+kernelSize_X)] = (u8_ImageIn)[x+i + (y+j)*w];
        }
    }


    for(int i=threadIdx.x;i<kernelSize_X && (x+i)<w;i=i+blockDim.x)
    {
        for(int j=threadIdx.y;j<kernelSize_Y && (y+j)<h;j=j+blockDim.y)
        {
            int ptrk =  i + j*kernelSize_X;
            LocalKernel[i][j] = u8_Kernel[ptrk];
        }
    }


    syncthreads();


    ////////////////////////////////////////////////////////////

    int iMidX = (kernelSize_X-1)/2;
    int iMidY = (kernelSize_Y-1)/2;

    int xglobal = x+threadIdx.x;
    int yglobal = y+threadIdx.y;
    int xglobal_offset = x+threadIdx.x+iMidX;
    int yglobal_offset = y+threadIdx.y+iMidY;

    int ptrl = threadIdx.x+iMidX + (threadIdx.y+iMidY) *(BLOCK_SIZE_X+kernelSize_X);
    if(LocalImage[ptrl] == 255)
    {
        u8_ImageOut[xglobal_offset+yglobal_offset*w] =  125;
        return;
    }


    u_int8_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i++)
        {
            LocalALignedMemory[i][j]  = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 0;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

    ptrl_Image  = 0;
    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        for(int i=0;i<kernelSize_X;i+=4)
        {

            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);
            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));
            tmpAND = tmpImage | tmpKernel;
            valMin = max(valMin,tmpAND);
        }
}

The problem send me the same misalign error on that line:

tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));

How could this be possible? I made the memory aligned.

Robert_Crovella · November 10, 2016, 2:51pm

You haven’t made it aligned. You attempted to “align” the i index, but based on C storage patterns you have to align the j index. Even that is problematic since there is no guarantee (AFAIK) that LocalKernel is aligned to a 32-bit boundary to begin with.

Anyway, if you are still getting the error, now or in the future, it means you are doing a misaligned access. If you get the error, the presence of the error indicates that you haven’t properly aligned things.

xav12358 · November 10, 2016, 3:23pm

How can I properly align it according i and j?

Robert_Crovella · November 10, 2016, 3:32pm

I’m not going to rewrite your kernel for you.

One possible approach would be to start out with LocalKernel defined as a uint_32 type. Then, every time you write to it, write 32 bits. Every time you read from it, you are reading 32 bits.

Problem solved. Obviously this requires a bunch of changes to the rest of your kernel. I’m not going to try and identify what all that should be.

I’m sure there are other approaches as well.

xav12358 · November 10, 2016, 4:11pm

It is not what I am asking. I just would like to know if there is a way to force a array to be align in cuda?

cbuchner1 · November 10, 2016, 4:32pm

Try the accepted answer on stackoverflow. It defines an alignment macro that works in all compiler environments (on host compilers as well).

http://stackoverflow.com/questions/12778949/cuda-memory-alignment

Christian

Robert_Crovella · November 10, 2016, 5:37pm

You can force an array declaration to be aligned in CUDA, certainly. But if you then go generating byte-level indexing into the array, things can still break.

The issue here is not how to align an array. That’s trivial. The issue is aligned access.

What you need to do is ensure your access mechanisms (index generation) will create aligned indicies into that array.

That will require rewriting of your code, not just ensuring that the base data declaration is aligned.

xav12358 · November 10, 2016, 6:05pm

As you adviced me I wrote à 32 bits version of m’y local copy:

u_int32_t LocalALignedMemory[kernelSize_X][kernelSize_Y];
    int ptrl_Image  = threadIdx.x + (threadIdx.y) *(BLOCK_SIZE_X+kernelSize_X);
    u_int8_t *ptLocalImage     = (u_int8_t *)(&LocalImage[ptrl_Image]);

    for(int j=0;j<kernelSize_Y;j++)
    {
        u_int8_t *u__localptr = (u_int8_t *)&(LocalALignedMemory[j][0]);
        for(int i=0;i<kernelSize_X;i++)
        {
//            (u_int8_t*)(&(LocalALignedMemory[0][j]))+i)  = *ptLocalImage;
            u__localptr[i] = *ptLocalImage;
            ptLocalImage++;
        }
        ptrl_Image += (BLOCK_SIZE_X+kernelSize_X);
        ptLocalImage = (u_int8_t *)(&LocalImage[ptrl_Image]);
    }


    u_int32_t valMin = 255;
    u_int32_t tmpAND;
    u_int32_t tmpImage;
    u_int32_t tmpKernel;

//    ptrl_Image  = 0;
//    u_int32_t *pt32LocalAlignedImage     = (u_int32_t *)(&LocalALignedMemory[0][0]);
//    u_int32_t *pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);


    for(int j=0;j<kernelSize_Y;j++)
    {
        int i = 0;
        int ip =4;
        for(;(i)<kernelSize_X;)
        {
            if(blockIdx.x == 0 && blockIdx.y ==0 &&threadIdx.x ==0 && threadIdx.y==0)
                printf("j %d i %d adress %d %d %d \n",j,i,(&LocalALignedMemory[j][i]),(&LocalALignedMemory[j][i])- (&LocalALignedMemory[j][i+1]),(&LocalALignedMemory[j+1][i])- (&LocalALignedMemory[j][i]));
            //tmpImage = *LocalImage <<24 | *(LocalImage+1)<<16| *(LocalImage+2)<<8| *(LocalImage+3);
            tmpImage    = *(u_int32_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
//            tmpKernel   = *(u_int32_t *)(&(LocalKernel[i][j]));//*pt32LocalKernel;//*ptLocalKernel <<24 | *(ptLocalKernel+1)<<16| *(ptLocalKernel+2)<<8| *(ptLocalKernel+3);
            tmpAND = tmpImage;
            valMin = min(valMin,tmpAND);
//            pt32LocalAlignedImage++;
//            pt32LocalKernel++;
            i+=4;
            ip+=4;
        }
//        pt32LocalKernel    = (u_int32_t *)(&LocalKernel[0][0]);
//        pt32LocalAlignedImage = (u_int32_t *)(&LocalALignedMemory[0][0]);

//        i-=4;
        for(;(i+2)<kernelSize_X;i+=2)
        {
            tmpImage    = (0x00000000) | *(u_int16_t *)(&LocalALignedMemory[i][j]);//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

//        i-=2;
        for(;i<kernelSize_X;i++)
        {
            tmpImage    = (0x00000000) | LocalALignedMemory[i][j];//*pt32LocalAlignedImage;
            tmpAND = tmpImage ;
            valMin = min(valMin,tmpAND);
        }

    }

There is no misa lignes error but the result is not correct.

Topic		Replies	Views
CL_INVALID_COMMAND_QUEUE error due to local memory byte alignment CUDA Programming and Performance	13	2207	July 29, 2021
Misaligned address of union CUDA Programming and Performance	12	1915	October 16, 2020
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3054	November 13, 2017
Global memory access problem CUDA Programming and Performance	9	3472	July 13, 2011
Using Shared Memory in CUDA C/C++ Technical Blog	36	2007	October 8, 2020
multi dimension array CUDA Programming and Performance	26	32792	February 12, 2010
CUDA 4.0: linux malloc for page-aligned memory and cudaHostRegister How to malloc page-aligned memor CUDA Programming and Performance	9	19346	March 11, 2011
help with kernel synchronization? CUDA Programming and Performance	22	13904	August 26, 2010
Shared Memory Error CUDA Programming and Performance	7	3502	April 7, 2009
How to fix and debug misalignment errors that pop up sporadically? CUDA Programming and Performance	7	1802	March 6, 2023

CUDA Runtime API error 74: misaligned address

Related topics