Incorrect result while using shared memory to get maximum value

Zhuoqian · November 20, 2021, 6:41am

The maximum value is set to 136 in the data.

#include <stdio.h>
#include <cuda.h>
#include <chrono>
#include <iostream>

#define THREADS_PER_BLOCK 128
int main(int argc, char** argv){
    int dataSize = 10211201;
    int * data = new int[dataSize];
    for (int i = 0; i < dataSize; i++)
        data[i] = 63;
   
    data[8472677] = 136;
    getMaxValue(data, size);
}

Then call the kernel and return the maximum value

void getMaxValue(int * data, int size){
    int* device_data;
    std::chrono::time_point<std::chrono::steady_clock> start0, end0;
    std::chrono::duration<double, std::milli> elapsed;   

    cudaMalloc((void **)&device_data,  sizeof(int)*size);
    cudaMemcpy(device_data, data, sizeof(int)*size, cudaMemcpyHostToDevice);

    dim3 dimBlock(THREADS_PER_BLOCK, 1);
    dim3 dimGrid(ceil((size + dimBlock.x - 1) / dimBlock.x), 1);

    start0  = std::chrono::steady_clock::now();
    max_value_shared<<<dimGrid, dimBlock>>>(device_data, size);
    cudaDeviceSynchronize();
    end0 = std::chrono::steady_clock::now();

    elapsed = end0 - start0;
    std::cout << "[GPU]the time of get max value: " << elapsed.count() << "ms" << std::endl;

    int max_value;
    cudaMemcpy(&max_value, device_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(device_data);
    std::cout << "[GPU]max value: " << max_value << std::endl;
}

The kernel is shown below:

__global__ void max_value_shared(int* data, int data_size){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int sdata[THREADS_PER_BLOCK];
    if (idx < data_size){
        sdata[threadIdx.x] = data[idx];
        __syncthreads();

        for(int stride=blockDim.x/2; stride > 0; stride /= 2) {
            if (threadIdx.x < stride) {
                int first = sdata[threadIdx.x];
                int second = sdata[threadIdx.x + stride];
                sdata[threadIdx.x] = first < second ? second : first;
            }
            __syncthreads();
        }
    }
    if (idx == 0) data[0] = sdata[0];
}

The result should be 136, but the calculation is 63, I don’t know what’s wrong, the kernel has added __syncthreads().

striker159 · November 20, 2021, 7:21am

Well, effectively you are only using the result of the first block, which is not correct.

Zhuoqian · November 20, 2021, 7:31am

Hi stricker,
How do I get the maximum of all the blocks？

Thank you!

striker159 · November 20, 2021, 8:01am

There is a lot of literature on parallel reductions. You may want to read some.
One approach would be to atomically apply the block results to the final result.

Topic		Replies	Views
One question regarding shared memory CUDA Programming and Performance	5	1241	April 24, 2013
How to find the maximum number in an array in GPU and CPU and calculate the time for the processes. CUDA Programming and Performance	0	577	February 13, 2016
Max shared memory CUDA Programming and Performance	0	1274	July 28, 2020
Parallel reduction problem CUDA Programming and Performance	1	5083	November 29, 2010
Finding max in array CUDA Programming and Performance	15	42342	November 26, 2017
Find maximum value from threads CUDA Programming and Performance	6	452	December 16, 2023
Cumpute Max of Vector or Matrix CUDA Programming and Performance	7	3776	June 6, 2011
CUDA code giving wrong result CUDA Programming and Performance	0	402	May 4, 2020
How to get larger data sets usable in CUDA Running out of room CUDA Programming and Performance	1	2747	October 2, 2009
Cuda giving wrong result CUDA Programming and Performance cuda	1	308	May 4, 2020

Incorrect result while using shared memory to get maximum value

Related topics