The maximum value is set to 136 in the data.
#include <stdio.h>
#include <cuda.h>
#include <chrono>
#include <iostream>
#define THREADS_PER_BLOCK 128
int main(int argc, char** argv){
int dataSize = 10211201;
int * data = new int[dataSize];
for (int i = 0; i < dataSize; i++)
data[i] = 63;
data[8472677] = 136;
getMaxValue(data, size);
}
Then call the kernel and return the maximum value
void getMaxValue(int * data, int size){
int* device_data;
std::chrono::time_point<std::chrono::steady_clock> start0, end0;
std::chrono::duration<double, std::milli> elapsed;
cudaMalloc((void **)&device_data, sizeof(int)*size);
cudaMemcpy(device_data, data, sizeof(int)*size, cudaMemcpyHostToDevice);
dim3 dimBlock(THREADS_PER_BLOCK, 1);
dim3 dimGrid(ceil((size + dimBlock.x - 1) / dimBlock.x), 1);
start0 = std::chrono::steady_clock::now();
max_value_shared<<<dimGrid, dimBlock>>>(device_data, size);
cudaDeviceSynchronize();
end0 = std::chrono::steady_clock::now();
elapsed = end0 - start0;
std::cout << "[GPU]the time of get max value: " << elapsed.count() << "ms" << std::endl;
int max_value;
cudaMemcpy(&max_value, device_data, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(device_data);
std::cout << "[GPU]max value: " << max_value << std::endl;
}
The kernel is shown below:
__global__ void max_value_shared(int* data, int data_size){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ int sdata[THREADS_PER_BLOCK];
if (idx < data_size){
sdata[threadIdx.x] = data[idx];
__syncthreads();
for(int stride=blockDim.x/2; stride > 0; stride /= 2) {
if (threadIdx.x < stride) {
int first = sdata[threadIdx.x];
int second = sdata[threadIdx.x + stride];
sdata[threadIdx.x] = first < second ? second : first;
}
__syncthreads();
}
}
if (idx == 0) data[0] = sdata[0];
}
The result should be 136, but the calculation is 63, I don’t know what’s wrong, the kernel has added __syncthreads().