Incorrect result while using shared memory to get maximum value

The maximum value is set to 136 in the data.

#include <stdio.h>
#include <cuda.h>
#include <chrono>
#include <iostream>

#define THREADS_PER_BLOCK 128
int main(int argc, char** argv){
    int dataSize = 10211201;
    int * data = new int[dataSize];
    for (int i = 0; i < dataSize; i++)
        data[i] = 63;
   
    data[8472677] = 136;
    getMaxValue(data, size);
}

Then call the kernel and return the maximum value

void getMaxValue(int * data, int size){
    int* device_data;
    std::chrono::time_point<std::chrono::steady_clock> start0, end0;
    std::chrono::duration<double, std::milli> elapsed;   

    cudaMalloc((void **)&device_data,  sizeof(int)*size);
    cudaMemcpy(device_data, data, sizeof(int)*size, cudaMemcpyHostToDevice);

    dim3 dimBlock(THREADS_PER_BLOCK, 1);
    dim3 dimGrid(ceil((size + dimBlock.x - 1) / dimBlock.x), 1);

    start0  = std::chrono::steady_clock::now();
    max_value_shared<<<dimGrid, dimBlock>>>(device_data, size);
    cudaDeviceSynchronize();
    end0 = std::chrono::steady_clock::now();

    elapsed = end0 - start0;
    std::cout << "[GPU]the time of get max value: " << elapsed.count() << "ms" << std::endl;

    int max_value;
    cudaMemcpy(&max_value, device_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(device_data);
    std::cout << "[GPU]max value: " << max_value << std::endl;
}

The kernel is shown below:

__global__ void max_value_shared(int* data, int data_size){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    __shared__ int sdata[THREADS_PER_BLOCK];
    if (idx < data_size){
        sdata[threadIdx.x] = data[idx];
        __syncthreads();

        for(int stride=blockDim.x/2; stride > 0; stride /= 2) {
            if (threadIdx.x < stride) {
                int first = sdata[threadIdx.x];
                int second = sdata[threadIdx.x + stride];
                sdata[threadIdx.x] = first < second ? second : first;
            }
            __syncthreads();
        }
    }
    if (idx == 0) data[0] = sdata[0];
}

The result should be 136, but the calculation is 63, I don’t know what’s wrong, the kernel has added __syncthreads().

Well, effectively you are only using the result of the first block, which is not correct.

Hi stricker,
How do I get the maximum of all the blocks?

Thank you!

There is a lot of literature on parallel reductions. You may want to read some.
One approach would be to atomically apply the block results to the final result.