Please help,
I have been working on optimizing a CUDA kernel that uses atomic adds, comparing its performance against the original. The optimized version runs roughly 2x faster, but as soon as the input size goes beyond 1025 (e.g., 1026), I get the following output:
i (0): original (0.000000,0.000000), modified (16416.000000,8208.000000)
Atomic and Optimized kernels do not match.
This is incorrect: the modified/optimized kernel should match the output of the original atomic-add kernel exactly, just with faster execution. I am hoping it is something simple I am doing wrong. Can anyone help?
The code is posted below. I am using CUDA 12.2 (driver 535.54.03) on an A100-SXM4-80GB:
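For reproduction, I build and run roughly as follows (assuming the source is saved as xTest.cu; sm_80 is the A100's architecture, exact flags may differ on your setup):

nvcc -O3 -arch=sm_80 xTest.cu -o xTest
./xTest 1026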
#include <cuda_runtime.h>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
// for easy gpu error checking
#define GPU_ERROR_CHECK(ans) do{ gpuAssert((ans), __FILE__, __LINE__); }while(0)
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__forceinline__ __host__ __device__ float dot(float3 a, float3 b)
{
    return a.x * b.x + a.y * b.y + a.z * b.z;
}
__forceinline__ __host__ __device__ float length(float3 v)
{
    return sqrtf(dot(v, v));
}
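// Placeholder for the real per-sample computation: returns (val, val/2)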
__forceinline__ __device__ float2 myKernel(float val) {
    return make_float2(val, val / 2);
}
/**
 * @brief Matches org_kernel only for input sizes up to and including 1025 samples.
 */
__global__ void optimized_org_kernel(const float3 * __restrict__ pos_a, const float3 * __restrict__ pos_b,
                                     const uint32_t input_size, const uint32_t output_size,
                                     float2 * __restrict__ result, const int region,
                                     const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    // Calculate thread indices
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3;
    float distance;
    uint32_t output_start, output_end;

    // Local accumulation variables to reduce the number of atomic operations
    float2 local_accum = make_float2(0.0f, 0.0f);

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        // Calculate the distance between points
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;
        distance = length(distance3);

        // Determine the output range for this thread
        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;

        // Clamp the values to ensure they stay within bounds
        output_start = max(0u, output_start);
        output_end = min(output_end, output_size);

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            // Only accumulate within the valid range
            if (y >= output_start && y < output_end) {
                float2 lval = myKernel(1.0f);
                local_accum.x += lval.x;
                local_accum.y += lval.y;
            }
        }
    }

    // Write back the accumulated values: one pair of atomics per thread
    // instead of one per (x, y) hit
    if (local_accum.x != 0.0f || local_accum.y != 0.0f) {
        atomicAdd(&result[thridx_y].x, local_accum.x);
        atomicAdd(&result[thridx_y].y, local_accum.y);
    }
}
__global__ void org_kernel(const float3 * pos_a, const float3 * pos_b,
                           const uint32_t input_size, const uint32_t output_size,
                           float2 * result, const int region,
                           const uint32_t first_idx_x, const uint32_t last_idx_x)
{
    uint32_t thridx_x = threadIdx.x + blockDim.x * blockIdx.x + first_idx_x;
    uint32_t thridx_y = threadIdx.y + blockDim.y * blockIdx.y;
    uint32_t stride_x = blockDim.x * gridDim.x;
    uint32_t stride_y = blockDim.y * gridDim.y;

    float3 distance3 = make_float3(0.0f, 0.0f, 0.0f);
    float distance = 0;
    uint32_t output_start, output_end;

    for (uint32_t x = thridx_x; x < last_idx_x; x += stride_x) {
        // distance calcs
        distance3.x = pos_a[x].x - pos_b[x].x;
        distance3.y = pos_a[x].y - pos_b[x].y;
        distance3.z = pos_a[x].z - pos_b[x].z;
        distance = length(distance3);

        output_start = __fdiv_rz(output_size, 2) + __fdiv_rz(distance, output_size) - region;
        output_end = output_start + region;

        for (uint32_t y = thridx_y; y < output_size; y += stride_y) {
            if ((y < output_end) && (y >= output_start)) {
                float2 lval = myKernel(1.0f);
                atomicAdd(&result[y].x, lval.x);
                atomicAdd(&result[y].y, lval.y);
            }
        }
    }
}
bool eval_arrays_equal(float2 * d_org, float2 * d_mod, uint32_t n) {
    if (d_org == nullptr || d_mod == nullptr) {
        throw std::invalid_argument("Arrays are NULL.");
    }
    if (n < 1) {
        throw std::invalid_argument("Invalid array length, less than 1.");
    }

    size_t sz = n * sizeof(float2);
    float2 * h_org = (float2*)malloc(sz);
    float2 * h_mod = (float2*)malloc(sz);

    GPU_ERROR_CHECK(cudaMemcpy(h_org, d_org, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaMemcpy(h_mod, d_mod, sz, cudaMemcpyDeviceToHost));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    // Report the first mismatch, if any; free host buffers on every path
    bool equal = true;
    for (uint32_t i = 0; i < n; ++i) {
        if (h_org[i].x != h_mod[i].x || h_org[i].y != h_mod[i].y) {
            printf("\ti (%u): original (%f,%f), modified (%f,%f)\n", i, h_org[i].x, h_org[i].y, h_mod[i].x, h_mod[i].y);
            equal = false;
            break;
        }
    }

    free(h_org);
    free(h_mod);
    return equal;
}
void printDeviceProperties() {
    int32_t device;
    cudaError_t error = cudaGetDevice(&device);
    if (error != cudaSuccess) {
        std::cerr << "Failed to get current device: " << cudaGetErrorString(error) << std::endl;
        return;
    }
    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, device);
    if (error != cudaSuccess) {
        std::cerr << "Failed to get device properties: " << cudaGetErrorString(error) << std::endl;
        return;
    }
    std::cout << "Device " << device << ": \"" << deviceProp.name << "\"" << std::endl;
    std::cout << " CUDA Capability: " << deviceProp.major << "." << deviceProp.minor << std::endl;
    std::cout << " Total Global Memory: " << deviceProp.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
    std::cout << " Shared Memory per Block: " << deviceProp.sharedMemPerBlock / 1024 << " KB" << std::endl;
    std::cout << " Registers per Block: " << deviceProp.regsPerBlock << std::endl;
    std::cout << " Warp Size: " << deviceProp.warpSize << std::endl;
    std::cout << " Max Threads per Block: " << deviceProp.maxThreadsPerBlock << std::endl;
    std::cout << " Max Threads Dim: [" << deviceProp.maxThreadsDim[0] << ", "
              << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << "]" << std::endl;
    std::cout << " Max Grid Size: [" << deviceProp.maxGridSize[0] << ", "
              << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << "]" << std::endl;
    std::cout << " Clock Rate: " << deviceProp.clockRate / 1000 << " MHz" << std::endl;
    std::cout << " Total Constant Memory: " << deviceProp.totalConstMem / 1024 << " KB" << std::endl;
    std::cout << " Multiprocessor Count: " << deviceProp.multiProcessorCount << std::endl;
    std::cout << " Compute Mode: " << deviceProp.computeMode << std::endl;
}
int main(int argc, char * argv[]) {
    if (argc != 2) {
        fprintf(stderr, "\nPass the number of array elements via command line as follows:\n");
        fprintf(stderr, "./xTest <num_elems>\n\n");
        return EXIT_FAILURE;
    }

    // Dimensions
    const uint32_t BLOCK_WIDTH = 512;
    dim3 nblks(BLOCK_WIDTH, 1, 1);
    dim3 nthreads(1, BLOCK_WIDTH, 1);
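    // With this launch configuration, gridDim = (512,1,1) and blockDim = (1,512,1),
    // so inside both kernels stride_x = stride_y = 512.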
    // Retrieve command-line argument
    uint32_t n_values = static_cast<uint32_t>(std::stoul(argv[1]));
    uint32_t region = 3;
    uint32_t n_float3s = n_values;
    uint32_t float3_sz = n_float3s * sizeof(float3);
    uint32_t output_sz = n_values * sizeof(float2);

    // Allocate device-side output buffers and zero them
    float2 *d_out_org;
    float2 *d_out_mod;
    GPU_ERROR_CHECK(cudaMalloc(&d_out_org, output_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_out_mod, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_org, 0, output_sz));
    GPU_ERROR_CHECK(cudaMemset(d_out_mod, 0, output_sz));

    // Float3 inputs
    float3 *pos_a, *pos_b;
    float3 *d_pos_a, *d_pos_b;
    pos_a = (float3*)malloc(float3_sz);
    pos_b = (float3*)malloc(float3_sz);
    for (size_t p = 0; p < n_float3s; ++p) {
        pos_a[p] = make_float3(1.0f, 1.0f, 1.0f);
        pos_b[p] = make_float3(0.1f, 0.1f, 0.1f);
    }
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_a, float3_sz));
    GPU_ERROR_CHECK(cudaMalloc(&d_pos_b, float3_sz));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_a, pos_a, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaMemcpy(d_pos_b, pos_b, float3_sz, cudaMemcpyHostToDevice));
    GPU_ERROR_CHECK(cudaDeviceSynchronize());
    float total_time_org = 0.0f;
    float total_time_mod = 0.0f;
    uint32_t first_idx_x = 0;
    uint32_t last_idx_x = n_values;
    const uint32_t n_passes = 16;
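    // NOTE: d_out_org/d_out_mod are zeroed only once (above), so the atomics
    // accumulate across all 16 passes -- identically for both kernels, so the
    // fidelity comparison still holds.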
    for (uint32_t pass = 0; pass < n_passes; ++pass) {
        auto start = std::chrono::high_resolution_clock::now();
        // Original atomic add kernel
        org_kernel<<<nblks, nthreads, 0, 0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_org, region, first_idx_x, last_idx_x);
        GPU_ERROR_CHECK(cudaDeviceSynchronize());
        auto stop = std::chrono::high_resolution_clock::now();
        total_time_org += static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());

        start = std::chrono::high_resolution_clock::now();
        // Optimized atomic add kernel
        optimized_org_kernel<<<nblks, nthreads, 0, 0>>>(d_pos_a, d_pos_b, n_values, n_values, d_out_mod, region, first_idx_x, last_idx_x);
        GPU_ERROR_CHECK(cudaDeviceSynchronize());
        stop = std::chrono::high_resolution_clock::now();
        total_time_mod += static_cast<float>(std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count());
    }
    // Check for fidelity
    if (eval_arrays_equal(d_out_org, d_out_mod, n_values)) {
        printf("\nFidelity achieved.\n");
        printf("\tTotal number of passes: %u\n", n_passes);
        float org_time = (total_time_org / n_passes);
        float mod_time = (total_time_mod / n_passes);
        printf("\t[ORIGINAL] Time: %8.9f (ns)\n", org_time);
        printf("\t[MODIFIED] Time: %8.9f (ns)\n", mod_time);
        printf("\tSpeedup Factor: %8.9f\n", (org_time / mod_time));
    } else {
        printf("\nAtomic and Optimized kernels do not match.\n");
        return EXIT_FAILURE;
    }
    GPU_ERROR_CHECK(cudaPeekAtLastError());
    GPU_ERROR_CHECK(cudaDeviceSynchronize());

    // Release host and device memory
    GPU_ERROR_CHECK(cudaFree(d_pos_a));
    GPU_ERROR_CHECK(cudaFree(d_pos_b));
    GPU_ERROR_CHECK(cudaFree(d_out_org));
    GPU_ERROR_CHECK(cudaFree(d_out_mod));
    free(pos_a);
    free(pos_b);

    GPU_ERROR_CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
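For reference, ./xTest 1025 prints "Fidelity achieved." with the roughly 2x speedup, while ./xTest 1026 produces the mismatch shown at the top of this post.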
Thanks to anyone who can point out the error or suggest a direction that might fix this issue.