Possible Rounding/Precision Errors in CUDA Math APIs?

Hi! Robert,
Thank you for your prompt reply. Below is the sample template we use to test each API. For the moment, we are focusing on floating-point precision.

We are in the process of developing a simple bug detector to detect floating-point errors. However, we would first like to verify whether some of our preliminary results are correct.

The detailed implementation (source code) of our bug detector can be found here:

Your reply in this regard is highly appreciated.

CUDA 10.1, g++ 9.4.0, NVIDIA 1080 Ti GPU

API Testing Template

# cat acos.c

#include <cmath>
#include <fenv.h>

// Host-side reference: evaluates acos(x0) and reports whether the evaluation
// raised a floating-point exception.
//
// On return, *flag is 1 if one of the checked exceptions was raised and 0
// otherwise. When *flag is 1 the return value is a code, not a result:
//   1.0 = FE_DIVBYZERO, 2.0 = FE_OVERFLOW, 3.0 = FE_INVALID, 4.0 = FE_UNDERFLOW.
// When *flag is 0 the return value is the computed acos(x0).
// FE_INEXACT is not checked.
extern "C" float cpp_kernel_1(float x0, int *flag) {
    // Checked exception classes, highest precedence first. The code returned
    // for an entry is its position in this table plus one.
    static const int kCheckedMasks[] = {
        FE_DIVBYZERO, FE_OVERFLOW, FE_INVALID, FE_UNDERFLOW
    };
    const int kCheckedCount = 4;

    // Clear any stale exception flags so only this evaluation is observed.
    feclearexcept(FE_ALL_EXCEPT);
    const float value = acos(x0);
    const int raised = fetestexcept(FE_ALL_EXCEPT);

    for (int i = 0; i < kCheckedCount; ++i) {
        if (raised & kCheckedMasks[i]) {
            *flag = 1;
            return static_cast<float>(i + 1);
        }
    }

    *flag = 0;
    return value;
}

# cat acos.cu


#include <math.h>
#include <stdio.h>

// Device kernel: evaluates acos(x0) and stores the value through ret.
__global__ void kernel_1(float x0, float *ret) {
  const float value = acos(x0);
  ret[0] = value;
}

extern "C" {
// Host entry point with C linkage: runs kernel_1 on x0 with a single thread
// and returns the value the kernel produced.
//
// Returns NaN (and prints a diagnostic to stderr) if any CUDA call fails, so
// a failed run cannot be mistaken for a computed result.
float kernel_wrapper_1(float x0) {
  float res = NAN;  // stays NaN unless the device result is copied back
  float *dev_p = NULL;

  cudaError_t err = cudaMalloc(&dev_p, sizeof(float));
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel_wrapper_1: cudaMalloc failed: %s\n",
            cudaGetErrorString(err));
    return res;
  }

  kernel_1<<<1,1>>>(x0, dev_p);
  err = cudaGetLastError();  // reports launch/configuration failures
  if (err == cudaSuccess) {
    // Copy the single result back to the host; an error from the kernel's
    // execution is also reported through this call's return code.
    err = cudaMemcpy(&res, dev_p, sizeof(float), cudaMemcpyDeviceToHost);
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "kernel_wrapper_1: CUDA error: %s\n",
            cudaGetErrorString(err));
    res = NAN;  // do not trust a partially written value
  }

  // Release the device buffer so repeated calls do not leak device memory.
  cudaFree(dev_p);
  return res;
  }
 }
Compile the code:
#CUDA: 
 nvcc -shared _tmp_csr-88307_10652/cuda_code_acos.cu -o _tmp_csr-88307_10652/cuda_code_acos.cu.so -Xcompiler -fPIC 
#C/C++ code (compiled with g++):
Running: g++ -shared _tmp_csr-88307_10652/c_code_acos.c -o _tmp_csr-88307_10652/c_code_acos.c.so -fPIC

# cat runner.py

def call_GPU_kernel_1(x0, shared_lib):
  """Call ``kernel_wrapper_1`` from the CUDA shared library and return its result.

  Args:
    x0: Input value; it is converted to a C ``float`` before the call.
    shared_lib: Path of the shared library, relative to this script's directory.

  Returns:
    The ``float`` returned by ``kernel_wrapper_1``.
  """
  script_dir = os.path.abspath(os.path.dirname(__file__))
  lib_path = os.path.join(script_dir, shared_lib)
  E = ctypes.cdll.LoadLibrary(lib_path)
  # Declare the full C signature, as the CPU loader does, so ctypes checks and
  # converts the argument instead of relying on the caller's wrapping alone.
  E.kernel_wrapper_1.argtypes = [ctypes.c_float]
  E.kernel_wrapper_1.restype = ctypes.c_float
  return E.kernel_wrapper_1(ctypes.c_float(x0))

def call_CPU_kernel_1(x0, shared_lib, flag):
  """Call ``cpp_kernel_1`` from the host shared library and return its result.

  Args:
    x0: Input value; it is converted to a C ``float`` before the call.
    shared_lib: Path of the shared library, relative to this script's directory.
    flag: Pointer-compatible ctypes argument for the ``int *`` out-parameter.

  Returns:
    The ``float`` returned by ``cpp_kernel_1``.
  """
  base_dir = os.path.abspath(os.path.dirname(__file__))
  library = ctypes.cdll.LoadLibrary(os.path.join(base_dir, shared_lib))
  kernel = library.cpp_kernel_1
  kernel.argtypes = [ctypes.c_float, ctypes.POINTER(ctypes.c_int)]
  kernel.restype = ctypes.c_float
  return kernel(ctypes.c_float(x0), flag)

x0 = 0.0001590810570633039   # change the input as necessary

# Use time.perf_counter() for elapsed-time measurement: it is monotonic and
# high-resolution, whereas time.time() can jump if the system clock changes.
# Each measurement also includes loading the shared library, because the
# call_* helpers load it on every call.
gpu_start_time = time.perf_counter()
gpu_result = call_GPU_kernel_1(x0, g_shared_lib)
gpu_end_time = time.perf_counter()
gpu_elapsed = gpu_end_time - gpu_start_time

cpu_start_time = time.perf_counter()
cpu_result = call_CPU_kernel_1(x0, c_shared_lib, ctypes.byref(flag))
cpu_end_time = time.perf_counter()
cpu_elapsed = cpu_end_time - cpu_start_time

print(gpu_result)  # 1.5706373453140259
print(cpu_result)  # 1.5706372261047363