[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

overmighty · 2024-07-16T13:23:43Z

No description provided.

…able

Remove support for `__builtin_frexpf16`, as it decreases performance.

llvmbot · 2024-07-16T13:24:13Z

@llvm/pr-subscribers-libc

Author: OverMighty (overmighty)

Changes

Patch is 26.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99037.diff

21 Files Affected:

(modified) libc/cmake/modules/CheckCompilerFeatures.cmake (+3)
(modified) libc/cmake/modules/LLVMLibCCompileOptionRules.cmake (+7)
(modified) libc/cmake/modules/LLVMLibCFlagRules.cmake (+3)
(added) libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp (+14)
(modified) libc/src/math/generic/CMakeLists.txt (+24)
(modified) libc/src/math/generic/copysign.cpp (+4)
(modified) libc/src/math/generic/copysignf.cpp (+4)
(modified) libc/src/math/generic/copysignf16.cpp (+4)
(modified) libc/src/math/generic/fabs.cpp (+7-1)
(modified) libc/src/math/generic/fabsf.cpp (+7-1)
(modified) libc/src/math/generic/fabsf16.cpp (+7-1)
(modified) libc/src/math/generic/fmaximum_num.cpp (+5-1)
(modified) libc/src/math/generic/fmaximum_numf.cpp (+5-1)
(modified) libc/src/math/generic/fmaximum_numf16.cpp (+4)
(modified) libc/src/math/generic/fminimum_num.cpp (+5-1)
(modified) libc/src/math/generic/fminimum_numf.cpp (+5-1)
(modified) libc/src/math/generic/fminimum_numf16.cpp (+4)
(modified) libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h (+4-3)
(modified) libc/test/src/math/performance_testing/CMakeLists.txt (+20)
(modified) libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h (+29-15)
(added) libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp (+153)

diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index a6d793d495c45..2bab968f901eb 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -5,6 +5,7 @@
 set(
   ALL_COMPILER_FEATURES
     "builtin_ceil_floor_rint_trunc"
+    "builtin_fmax_fmin"
     "builtin_round"
     "builtin_roundeven"
     "float16"
@@ -82,6 +83,8 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
       set(LIBC_COMPILER_HAS_FIXED_POINT TRUE)
     elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc")
       set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE)
+    elseif(${feature} STREQUAL "builtin_fmax_fmin")
+      set(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN TRUE)
     elseif(${feature} STREQUAL "builtin_round")
       set(LIBC_COMPILER_HAS_BUILTIN_ROUND TRUE)
     elseif(${feature} STREQUAL "builtin_roundeven")
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 253da4ae890e5..ead578f95ac72 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -6,6 +6,7 @@ function(_get_compile_options_from_flags output_var)
   endif()
   check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
   check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
+  check_flag(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG ${MISC_MATH_BASIC_OPS_OPT_FLAG} ${ARGN})
 
   if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
     if(ADD_FMA_FLAG)
@@ -37,6 +38,12 @@ function(_get_compile_options_from_flags output_var)
     if(ADD_EXPLICIT_SIMD_OPT_FLAG)
       list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
     endif()
+    if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
+      list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
+      if(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_FMAX_FMIN")
+      endif()
+    endif()
   elseif(MSVC)
     if(ADD_FMA_FLAG)
       list(APPEND compile_options "/arch:AVX2")
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index eca7ba8d183e6..4398fe55db5aa 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -263,6 +263,9 @@ set(FMA_OPT_FLAG "FMA_OPT")
 set(ROUND_OPT_FLAG "ROUND_OPT")
 # This flag controls whether we use explicit SIMD instructions or not.
 set(EXPLICIT_SIMD_OPT_FLAG "EXPLICIT_SIMD_OPT")
+# This flag controls whether we use compiler builtin functions to implement
+# various basic math operations or not.
+set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT")
 
 # Skip FMA_OPT flag for targets that don't support fma.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR
diff --git a/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp b/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp
new file mode 100644
index 0000000000000..a962df33e31c4
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp
@@ -0,0 +1,14 @@
+_Float16 try_builtin_fmaxf16(_Float16 x, _Float16 y) {
+  return __builtin_fmaxf16(x, y);
+}
+_Float16 try_builtin_fminf16(_Float16 x, _Float16 y) {
+  return __builtin_fminf16(x, y);
+}
+
+float try_builtin_fmaxf(float x, float y) { return __builtin_fmaxf(x, y); }
+float try_builtin_fminf(float x, float y) { return __builtin_fminf(x, y); }
+
+double try_builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
+double try_builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
+
+extern "C" void _start() {}
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index c2f58fb1a4f71..3d713368251f6 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -400,6 +400,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -412,6 +414,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -437,6 +441,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1406,6 +1412,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1418,6 +1426,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -1443,6 +1453,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.manipulation_functions
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2202,6 +2214,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2214,6 +2228,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2239,6 +2255,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2450,6 +2468,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2462,6 +2482,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O2
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
@@ -2487,6 +2509,8 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.basic_operations
   COMPILE_OPTIONS
     -O3
+  FLAGS
+    MISC_MATH_BASIC_OPS_OPT
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/copysign.cpp b/libc/src/math/generic/copysign.cpp
index 149d725af08e2..186bb2c5983f4 100644
--- a/libc/src/math/generic/copysign.cpp
+++ b/libc/src/math/generic/copysign.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysign(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf.cpp b/libc/src/math/generic/copysignf.cpp
index 17cd70d37c308..c79e50b61ebda 100644
--- a/libc/src/math/generic/copysignf.cpp
+++ b/libc/src/math/generic/copysignf.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysignf(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/copysignf16.cpp b/libc/src/math/generic/copysignf16.cpp
index 42695b3b4a6de..546622f049ebe 100644
--- a/libc/src/math/generic/copysignf16.cpp
+++ b/libc/src/math/generic/copysignf16.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, copysignf16, (float16 x, float16 y)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_copysignf16(x, y);
+#else
   return fputil::copysign(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabs.cpp b/libc/src/math/generic/fabs.cpp
index 472297aecb2f7..55fa958cd7c00 100644
--- a/libc/src/math/generic/fabs.cpp
+++ b/libc/src/math/generic/fabs.cpp
@@ -13,6 +13,12 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(double, fabs, (double x)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_fabs(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf.cpp b/libc/src/math/generic/fabsf.cpp
index ad4fcb30c795d..2ba18d09bbd5b 100644
--- a/libc/src/math/generic/fabsf.cpp
+++ b/libc/src/math/generic/fabsf.cpp
@@ -13,6 +13,12 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(float, fabsf, (float x)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_fabsf(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fabsf16.cpp b/libc/src/math/generic/fabsf16.cpp
index 57671fb6067e2..2f982517614c4 100644
--- a/libc/src/math/generic/fabsf16.cpp
+++ b/libc/src/math/generic/fabsf16.cpp
@@ -13,6 +13,12 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) { return fputil::abs(x); }
+LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
+#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
+  return __builtin_fabsf16(x);
+#else
+  return fputil::abs(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fmaximum_num.cpp b/libc/src/math/generic/fmaximum_num.cpp
index 33df7daa380df..1bfc1514393ee 100644
--- a/libc/src/math/generic/fmaximum_num.cpp
+++ b/libc/src/math/generic/fmaximum_num.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fmaximum_num function----------------------------===//
+//===-- Implementation of fmaximum_num function ---------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(double, fmaximum_num, (double x, double y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fmax(x, y);
+#else
   return fputil::fmaximum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fmaximum_numf.cpp b/libc/src/math/generic/fmaximum_numf.cpp
index 1577080ba2c25..f8c69fa78be3d 100644
--- a/libc/src/math/generic/fmaximum_numf.cpp
+++ b/libc/src/math/generic/fmaximum_numf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fmaximum_numf function---------------------------===//
+//===-- Implementation of fmaximum_numf function --------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, fmaximum_numf, (float x, float y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fmaxf(x, y);
+#else
   return fputil::fmaximum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fmaximum_numf16.cpp b/libc/src/math/generic/fmaximum_numf16.cpp
index 394ce8b5fe4f3..6a012d38abea4 100644
--- a/libc/src/math/generic/fmaximum_numf16.cpp
+++ b/libc/src/math/generic/fmaximum_numf16.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, fmaximum_numf16, (float16 x, float16 y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fmaxf16(x, y);
+#else
   return fputil::fmaximum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fminimum_num.cpp b/libc/src/math/generic/fminimum_num.cpp
index 3ff79def58075..5b9c426ca50c2 100644
--- a/libc/src/math/generic/fminimum_num.cpp
+++ b/libc/src/math/generic/fminimum_num.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fminimum_num function----------------------------===//
+//===-- Implementation of fminimum_num function ---------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(double, fminimum_num, (double x, double y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fmin(x, y);
+#else
   return fputil::fminimum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fminimum_numf.cpp b/libc/src/math/generic/fminimum_numf.cpp
index c7ac99b14bd5a..6b6f905e63de3 100644
--- a/libc/src/math/generic/fminimum_numf.cpp
+++ b/libc/src/math/generic/fminimum_numf.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of fminimum_numf function---------------------------===//
+//===-- Implementation of fminimum_numf function --------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float, fminimum_numf, (float x, float y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fminf(x, y);
+#else
   return fputil::fminimum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/fminimum_numf16.cpp b/libc/src/math/generic/fminimum_numf16.cpp
index 0af7205713c10..8e48aaf27070f 100644
--- a/libc/src/math/generic/fminimum_numf16.cpp
+++ b/libc/src/math/generic/fminimum_numf16.cpp
@@ -14,7 +14,11 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, fminimum_numf16, (float16 x, float16 y)) {
+#ifdef __LIBC_USE_BUILTIN_FMAX_FMIN
+  return __builtin_fminf16(x, y);
+#else
   return fputil::fminimum_num(x, y);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 995e41ba84b03..1ab0afbc9cbe8 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -10,6 +10,7 @@
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
 
+#include <algorithm>
 #include <cstddef>
 #include <fstream>
 
@@ -28,11 +29,11 @@ template <typename T> class BinaryOpSingleOutputPerf {
   static void run_perf_in_range(Func myFunc, Func otherFunc,
                                 StorageType startingBit, StorageType endingBit,
                                 size_t N, size_t rounds, std::ofstream &log) {
-    if (endingBit - startingBit < N)
-      N = endingBit - startingBit;
+    if (sizeof(StorageType) <= sizeof(size_t))
+      N = std::min(N, static_cast<size_t>(endingBit - startingBit));
 
     auto runner = [=](Func func) {
-      volatile T result;
+      [[maybe_unused]] volatile T result;
       if (endingBit < startingBit) {
         return;
       }
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index a75becba04d07..a4059c8ff4dd8 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -402,3 +402,23 @@ add_perf_binary(
   LINK_LIBRARIES
     LibcFPTestHelpers
 )
+
+add_perf_binary(
+  misc_basic_ops_perf
+  SRCS
+    misc_basic_ops_perf.cpp
+  DEPENDS
+    .binary_op_single_output_diff
+    .single_input_single_output_diff
+    libc.src.math.copysignf
+    libc.src.math.copysignf16
+    libc.src.math.fabsf
+    libc.src.math.fabsf16
+    libc.src.math.fmaximum_numf
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fminimum_numf
+    libc.src.math.fminimum_numf16
+    libc.src.math.frexpf16
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
index 48ae43d6315e3..e0beb729cb9f5 100644
--- a/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h
@@ -10,6 +10,7 @@
 #include "src/__support/macros/config.h"
 #include "test/src/math/performance_testing/Timer.h"
 
+#include <algorithm>
 #include <fstream>
 
 namespace LIBC_NAMESPACE_DECL {
@@ -26,16 +27,21 @@ template <typename T> class SingleInputSingleOutputPerf {
 
   static void runPerfInRange(Func myFunc, Func otherFunc,
                              StorageType startingBit, StorageType endingBit,
-                             std::ofstream &log) {
+                             size_t rounds, std::ofstream &log) {
+    size_t n = 10'010'001;
+    if (sizeof(StorageType) <= sizeof(size_t))
+      n = std::min(n, static_cast<size_t>(endingBit - startingBit));
+
     auto runner = [=](Func func) {
-      constexpr StorageType N = 10'010'001;
-      StorageType step = (endingBit - startingBit) / N;
+      StorageType step = (endingBit - startingBit) / n;
       if (step == 0)
         step = 1;
-      volatile T result;
-      for (StorageType bits = startingBit; bits < endingBit; bits += step) {
-        T x = FPBits(bits).get_val();
-        result = func(x);
+      [[maybe_unused]] volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = startingBit; bits < endingBit; bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
       }
     };
 
@@ -44,8 +50,7 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(myFunc);
     timer.stop();
 
-    StorageType numberOfRuns = endingBit - startingBit + 1;
-    double myAverage = static_cast<double>(timer.nanoseconds()) / numberOfRuns;
+    double myAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
     log << "-- My function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << myAverage << " ns/op \n";
@@ -56,8 +61,7 @@ template <typename T> class SingleInputSingleOutputPerf {
     runner(otherFunc);
     timer.stop();
 
-    double otherAverage =
-        static_cast<double>(timer.nanoseconds()) / numberOfRuns;
+    double otherAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
     log << "-- Other function --\n";
     log << "     Total time      : " << timer.nanoseconds() << " ns \n";
     log << "     Average runtime : " << otherAverage << " ns/op \n";
@@ -68,15 +72,18 @@ template <typename T> class SingleInputSingleOutputPerf {
     log << "     Mine / Other's  : " << myAverage / otherAverage << " \n";
   }
 
-  static void runPerf(Func myFunc, Func otherFunc, const char *logFile) {
+  static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
+                      const char *logFile) {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                   /* endingBit= */ FPBits::max_subnormal().uintval(), log);
+                   /* endingBit= */ FPBits::max_subnormal().uintval(), rounds,
+                   log);
     log << "\n Performance tests with inputs in normal range:\n";
     runPerfInRange(myFunc, otherFunc,
                    /* startingBit= */ FPBits::min_normal().uintval(),
-                   /* endingBit= */ FPBits::max_normal().uintval(), log);
+                   /* endingBit= */ FPBits::max_normal().uintval(), rounds,
+                   log);
   }
 };
 
@@ -86,6 +93,13 @@ template <typename T> class SingleInputSingleOutputPerf {
 #define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename)        \
   int main() {                                                                 \
     LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
-        &myFunc, &otherFunc, filename);                                        \
+        &myFunc, &otherFunc, 1, filename);                                     \
     return 0;                                                                  \
   }
+
+#define SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds,       \
+                                           filename)                           \
+  {                                                                            \
+    LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf(          \
+        &myFunc, &otherFunc, rounds, filename);                                \
+  }
diff --git a/libc/test/src/m...
[truncated]

overmighty · 2024-07-16T13:25:18Z

Relevant: https://gist.github.com/overmighty/a9a9de847eb11c667ba6b257375afe83.

overmighty · 2024-07-16T13:29:34Z

Before:

Intel Core i7-13700H, Clang 18

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 77722945 ns 
     Average runtime : 1.94307 ns/op 
     Ops per second  : 514649052 op/s 
-- Other function --
     Total time      : 55638398 ns 
     Average runtime : 1.39096 ns/op 
     Ops per second  : 718928679 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.39693 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 66004748 ns 
     Average runtime : 1.65012 ns/op 
     Ops per second  : 606017615 op/s 
-- Other function --
     Total time      : 52935088 ns 
     Average runtime : 1.32338 ns/op 
     Ops per second  : 755643213 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.2469 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 68139141 ns 
     Average runtime : 1.70348 ns/op 
     Ops per second  : 587034697 op/s 
-- Other function --
     Total time      : 53076741 ns 
     Average runtime : 1.32692 ns/op 
     Ops per second  : 753626527 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.28379 

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 1443135472 ns 
     Average runtime : 70.5345 ns/op 
     Ops per second  : 14177463 op/s 
-- Other function --
     Total time      : 21837100 ns 
     Average runtime : 1.06731 ns/op 
     Ops per second  : 936937597 op/s 
-- Average runtime ratio --
     Mine / Other's  : 66.0864 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 6334515834 ns 
     Average runtime : 10.3104 ns/op 
     Ops per second  : 96989259 op/s 
-- Other function --
     Total time      : 653122054 ns 
     Average runtime : 1.06306 ns/op 
     Ops per second  : 940681755 op/s 
-- Average runtime ratio --
     Mine / Other's  : 9.69882 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 4224846676 ns 
     Average runtime : 10.3146 ns/op 
     Ops per second  : 96950263 op/s 
-- Other function --
     Total time      : 434116263 ns 
     Average runtime : 1.05985 ns/op 
     Ops per second  : 943526043 op/s 
-- Average runtime ratio --
     Mine / Other's  : 9.73206 

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 355985096 ns 
     Average runtime : 1.06092 ns/op 
     Ops per second  : 942579573 op/s 
-- Other function --
     Total time      : 354381381 ns 
     Average runtime : 1.05614 ns/op 
     Ops per second  : 946845116 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00453 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 426755031 ns 
     Average runtime : 1.06582 ns/op 
     Ops per second  : 938243279 op/s 
-- Other function --
     Total time      : 426371198 ns 
     Average runtime : 1.06486 ns/op 
     Ops per second  : 939087916 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.0009 

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 1409393987 ns 
     Average runtime : 68.8853 ns/op 
     Ops per second  : 14516877 op/s 
-- Other function --
     Total time      : 27228862 ns 
     Average runtime : 1.33083 ns/op 
     Ops per second  : 751408560 op/s 
-- Average runtime ratio --
     Mine / Other's  : 51.761 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 4989140516 ns 
     Average runtime : 8.12061 ns/op 
     Ops per second  : 123143454 op/s 
-- Other function --
     Total time      : 811360244 ns 
     Average runtime : 1.32062 ns/op 
     Ops per second  : 757222213 op/s 
-- Average runtime ratio --
     Mine / Other's  : 6.14911 

frexpf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 79818927 ns 
     Average runtime : 3.90122 ns/op 
     Ops per second  : 256330180 op/s 
-- Other function --
     Total time      : 27108778 ns 
     Average runtime : 1.32496 ns/op 
     Ops per second  : 754737081 op/s 
-- Average runtime ratio --
     Mine / Other's  : 2.94439 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 2056091386 ns 
     Average runtime : 3.34661 ns/op 
     Ops per second  : 298809675 op/s 
-- Other function --
     Total time      : 810804852 ns 
     Average runtime : 1.31971 ns/op 
     Ops per second  : 757740902 op/s 
-- Average runtime ratio --
     Mine / Other's  : 2.53586 

Intel Core i7-13700H, Clang 18, -march=native

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 66412388 ns 
     Average runtime : 1.66031 ns/op 
     Ops per second  : 602297872 op/s 
-- Other function --
     Total time      : 44271736 ns 
     Average runtime : 1.10679 ns/op 
     Ops per second  : 903511893 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.50011 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 63347347 ns 
     Average runtime : 1.58368 ns/op 
     Ops per second  : 631439861 op/s 
-- Other function --
     Total time      : 42233468 ns 
     Average runtime : 1.05584 ns/op 
     Ops per second  : 947117106 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.49993 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 63621198 ns 
     Average runtime : 1.59053 ns/op 
     Ops per second  : 628721892 op/s 
-- Other function --
     Total time      : 42412209 ns 
     Average runtime : 1.0603 ns/op 
     Ops per second  : 943125598 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.50007 

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 145981237 ns 
     Average runtime : 7.13496 ns/op 
     Ops per second  : 140154998 op/s 
-- Other function --
     Total time      : 27214267 ns 
     Average runtime : 1.33012 ns/op 
     Ops per second  : 751811540 op/s 
-- Average runtime ratio --
     Mine / Other's  : 5.36414 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 4419541990 ns 
     Average runtime : 7.1935 ns/op 
     Ops per second  : 139014404 op/s 
-- Other function --
     Total time      : 810748674 ns 
     Average runtime : 1.31962 ns/op 
     Ops per second  : 757793407 op/s 
-- Average runtime ratio --
     Mine / Other's  : 5.45119 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 2947350719 ns 
     Average runtime : 7.19568 ns/op 
     Ops per second  : 138972263 op/s 
-- Other function --
     Total time      : 540643022 ns 
     Average runtime : 1.31993 ns/op 
     Ops per second  : 757616362 op/s 
-- Average runtime ratio --
     Mine / Other's  : 5.45157 

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 443653967 ns 
     Average runtime : 1.32219 ns/op 
     Ops per second  : 756319800 op/s 
-- Other function --
     Total time      : 355766964 ns 
     Average runtime : 1.06027 ns/op 
     Ops per second  : 943157499 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.24704 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 532591169 ns 
     Average runtime : 1.33015 ns/op 
     Ops per second  : 751796243 op/s 
-- Other function --
     Total time      : 426456661 ns 
     Average runtime : 1.06508 ns/op 
     Ops per second  : 938899720 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.24888 

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 138805491 ns 
     Average runtime : 6.78424 ns/op 
     Ops per second  : 147400508 op/s 
-- Other function --
     Total time      : 21842015 ns 
     Average runtime : 1.06755 ns/op 
     Ops per second  : 936726762 op/s 
-- Average runtime ratio --
     Mine / Other's  : 6.35498 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 4160219052 ns 
     Average runtime : 6.77141 ns/op 
     Ops per second  : 147679723 op/s 
-- Other function --
     Total time      : 652394189 ns 
     Average runtime : 1.06187 ns/op 
     Ops per second  : 941731257 op/s 
-- Average runtime ratio --
     Mine / Other's  : 6.37685 

frexpf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 79732480 ns 
     Average runtime : 3.89699 ns/op 
     Ops per second  : 256608097 op/s 
-- Other function --
     Total time      : 21778504 ns 
     Average runtime : 1.06444 ns/op 
     Ops per second  : 939458467 op/s 
-- Average runtime ratio --
     Mine / Other's  : 3.66106 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 2056153914 ns 
     Average runtime : 3.34671 ns/op 
     Ops per second  : 298800588 op/s 
-- Other function --
     Total time      : 648790447 ns 
     Average runtime : 1.05601 ns/op 
     Ops per second  : 946962155 op/s 
-- Average runtime ratio --
     Mine / Other's  : 3.16921 

Intel Core i7-13700H, GCC 14

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 66375785 ns 
     Average runtime : 1.65939 ns/op 
     Ops per second  : 602630010 op/s 
-- Other function --
     Total time      : 55458868 ns 
     Average runtime : 1.38647 ns/op 
     Ops per second  : 721255976 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.19685 
 Performance tests with inputs in normal range:
-- My function --
     Total time      : 63344285 ns 
     Average runtime : 1.58361 ns/op 
     Ops per second  : 631470384 op/s 
-- Other function --
     Total time      : 52964125 ns 
     Average runtime : 1.3241 ns/op 
     Ops per second  : 755228940 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.19598 

 Performance tests with inputs in normal range with exponents close to each other:
-- My function --
     Total time      : 63616444 ns 
     Average runtime : 1.59041 ns/op 
     Ops per second  : 628768876 op/s 
-- Other function --
     Total time      : 53014696 ns 
     Average runtime : 1.32537 ns/op 
     Ops per second  : 754508523 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.19998 

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 27206240 ns 
     Average runtime : 1.32973 ns/op 
     Ops per second  : 752033357 op/s 
-- Other function --
     Total time      : 27409561 ns 
     Average runtime : 1.33967 ns/op 
     Ops per second  : 746454859 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.992582 
Performance tests with inputs in normal range:

-- My function --

Total time      : 810957088 ns

Average runtime : 1.31996 ns/op

Ops per second  : 757598656 op/s

-- Other function --

Total time      : 810936231 ns

Average runtime : 1.31993 ns/op

Ops per second  : 757618141 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00003
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 540714147 ns

Average runtime : 1.3201 ns/op

Ops per second  : 757516706 op/s

-- Other function --

Total time      : 542636456 ns

Average runtime : 1.3248 ns/op

Ops per second  : 754833176 op/s

-- Average runtime ratio --

Mine / Other's  : 0.996457

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 444296071 ns 
     Average runtime : 1.32411 ns/op 
     Ops per second  : 755226755 op/s 
-- Other function --
     Total time      : 354560881 ns 
     Average runtime : 1.05667 ns/op 
     Ops per second  : 946365766 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.25309 
Performance tests with inputs in normal range:

-- My function --

Total time      : 555452123 ns

Average runtime : 1.38724 ns/op

Ops per second  : 720854279 op/s

-- Other function --

Total time      : 424959681 ns

Average runtime : 1.06134 ns/op

Ops per second  : 942207126 op/s

-- Average runtime ratio --

Mine / Other's  : 1.30707

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 22185765 ns 
     Average runtime : 1.08435 ns/op 
     Ops per second  : 922212959 op/s 
-- Other function --
     Total time      : 21731294 ns 
     Average runtime : 1.06214 ns/op 
     Ops per second  : 941499387 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.02091 
Performance tests with inputs in normal range:

-- My function --

Total time      : 648617719 ns

Average runtime : 1.05573 ns/op

Ops per second  : 947214332 op/s

-- Other function --

Total time      : 648645565 ns

Average runtime : 1.05577 ns/op

Ops per second  : 947173669 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999957

Intel Core i7-13700H, GCC 14, -march=native

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 65193230 ns 
     Average runtime : 1.62983 ns/op 
     Ops per second  : 613561254 op/s 
-- Other function --
     Total time      : 55469963 ns 
     Average runtime : 1.38675 ns/op 
     Ops per second  : 721111712 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.17529 
Performance tests with inputs in normal range:

-- My function --

Total time      : 63111912 ns

Average runtime : 1.5778 ns/op

Ops per second  : 633795407 op/s

-- Other function --

Total time      : 52816339 ns

Average runtime : 1.32041 ns/op

Ops per second  : 757342155 op/s

-- Average runtime ratio --

Mine / Other's  : 1.19493
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 63470652 ns

Average runtime : 1.58676 ns/op

Ops per second  : 630213157 op/s

-- Other function --

Total time      : 53013833 ns

Average runtime : 1.32534 ns/op

Ops per second  : 754520805 op/s

-- Average runtime ratio --

Mine / Other's  : 1.19725

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 27289917 ns 
     Average runtime : 1.33382 ns/op 
     Ops per second  : 749727454 op/s 
-- Other function --
     Total time      : 27247564 ns 
     Average runtime : 1.33175 ns/op 
     Ops per second  : 750892813 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00155 
Performance tests with inputs in normal range:

-- My function --

Total time      : 810879806 ns

Average runtime : 1.31983 ns/op

Ops per second  : 757670860 op/s

-- Other function --

Total time      : 811157214 ns

Average runtime : 1.32029 ns/op

Ops per second  : 757411743 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999658
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 540745309 ns

Average runtime : 1.32018 ns/op

Ops per second  : 757473052 op/s

-- Other function --

Total time      : 541892195 ns

Average runtime : 1.32298 ns/op

Ops per second  : 755869901 op/s

-- Average runtime ratio --

Mine / Other's  : 0.997884

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 443527538 ns 
     Average runtime : 1.32182 ns/op 
     Ops per second  : 756535392 op/s 
-- Other function --
     Total time      : 354791083 ns 
     Average runtime : 1.05736 ns/op 
     Ops per second  : 945751728 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.25011 
Performance tests with inputs in normal range:

-- My function --

Total time      : 532578356 ns

Average runtime : 1.33012 ns/op

Ops per second  : 751814330 op/s

-- Other function --

Total time      : 426566986 ns

Average runtime : 1.06535 ns/op

Ops per second  : 938656888 op/s

-- Average runtime ratio --

Mine / Other's  : 1.24852

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 21735283 ns 
     Average runtime : 1.06233 ns/op 
     Ops per second  : 941326597 op/s 
-- Other function --
     Total time      : 21743332 ns 
     Average runtime : 1.06272 ns/op 
     Ops per second  : 940978135 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.99963 
Performance tests with inputs in normal range:

-- My function --

Total time      : 648659340 ns

Average runtime : 1.0558 ns/op

Ops per second  : 947153555 op/s

-- Other function --

Total time      : 648616137 ns

Average runtime : 1.05572 ns/op

Ops per second  : 947216643 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00007

Google Tensor G3, Clang 17

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 68445191 ns 
     Average runtime : 1.71113 ns/op 
     Ops per second  : 584409794 op/s 
-- Other function --
     Total time      : 53314128 ns 
     Average runtime : 1.33285 ns/op 
     Ops per second  : 750270922 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.28381 
Performance tests with inputs in normal range:

-- My function --

Total time      : 65239543 ns

Average runtime : 1.63099 ns/op

Ops per second  : 613125692 op/s

-- Other function --

Total time      : 50825480 ns

Average runtime : 1.27064 ns/op

Ops per second  : 787007619 op/s

-- Average runtime ratio --

Mine / Other's  : 1.2836
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 65539714 ns

Average runtime : 1.63849 ns/op

Ops per second  : 610317585 op/s

-- Other function --

Total time      : 51133586 ns

Average runtime : 1.27834 ns/op

Ops per second  : 782265495 op/s

-- Average runtime ratio --

Mine / Other's  : 1.28174

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 22220418 ns 
     Average runtime : 1.08604 ns/op 
     Ops per second  : 920774757 op/s 
-- Other function --
     Total time      : 14645345 ns 
     Average runtime : 0.715804 ns/op 
     Ops per second  : 1397030933 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.51723 
Performance tests with inputs in normal range:

-- My function --

Total time      : 657264120 ns

Average runtime : 1.0698 ns/op

Ops per second  : 934753596 op/s

-- Other function --

Total time      : 429685628 ns

Average runtime : 0.699381 ns/op

Ops per second  : 1429836047 op/s

-- Average runtime ratio --

Mine / Other's  : 1.52964
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 438531861 ns

Average runtime : 1.07063 ns/op

Ops per second  : 934025635 op/s

-- Other function --

Total time      : 284638753 ns

Average runtime : 0.694919 ns/op

Ops per second  : 1439016984 op/s

-- Average runtime ratio --

Mine / Other's  : 1.54066

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 355370361 ns 
     Average runtime : 1.05909 ns/op 
     Ops per second  : 944210088 op/s 
-- Other function --
     Total time      : 426281454 ns 
     Average runtime : 1.27042 ns/op 
     Ops per second  : 787142571 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.833652 
Performance tests with inputs in normal range:

-- My function --

Total time      : 425710694 ns

Average runtime : 1.06321 ns/op

Ops per second  : 940544942 op/s

-- Other function --

Total time      : 510568848 ns

Average runtime : 1.27515 ns/op

Ops per second  : 784223404 op/s

-- Average runtime ratio --

Mine / Other's  : 0.833797

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 51414754 ns 
     Average runtime : 2.51294 ns/op 
     Ops per second  : 397940248 op/s 
-- Other function --
     Total time      : 35449137 ns 
     Average runtime : 1.73261 ns/op 
     Ops per second  : 577164967 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.45038 
Performance tests with inputs in normal range:

-- My function --

Total time      : 437795085 ns

Average runtime : 0.71258 ns/op

Ops per second  : 1403350610 op/s

-- Other function --

Total time      : 422209473 ns

Average runtime : 0.687212 ns/op

Ops per second  : 1455154465 op/s

-- Average runtime ratio --

Mine / Other's  : 1.03691

Google Tensor G3, Clang 17, -mcpu=cortex-x3

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 45316447 ns 
     Average runtime : 1.13291 ns/op 
     Ops per second  : 882682616 op/s 
-- Other function --
     Total time      : 30075318 ns 
     Average runtime : 0.751882 ns/op 
     Ops per second  : 1329995579 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.50677 
Performance tests with inputs in normal range:

-- My function --

Total time      : 44029663 ns

Average runtime : 1.10074 ns/op

Ops per second  : 908479358 op/s

-- Other function --

Total time      : 29356445 ns

Average runtime : 0.73391 ns/op

Ops per second  : 1362564166 op/s

-- Average runtime ratio --

Mine / Other's  : 1.49983
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 42547444 ns

Average runtime : 1.06369 ns/op

Ops per second  : 940127919 op/s

-- Other function --

Total time      : 28212362 ns

Average runtime : 0.705308 ns/op

Ops per second  : 1417819606 op/s

-- Average runtime ratio --

Mine / Other's  : 1.50811

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 21709229 ns 
     Average runtime : 1.06106 ns/op 
     Ops per second  : 942456316 op/s 
-- Other function --
     Total time      : 14792684 ns 
     Average runtime : 0.723005 ns/op 
     Ops per second  : 1383116140 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.46757 
Performance tests with inputs in normal range:

-- My function --

Total time      : 653066447 ns

Average runtime : 1.06297 ns/op

Ops per second  : 940761851 op/s

-- Other function --

Total time      : 429773966 ns

Average runtime : 0.699525 ns/op

Ops per second  : 1429542151 op/s

-- Average runtime ratio --

Mine / Other's  : 1.51956
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 435379435 ns

Average runtime : 1.06294 ns/op

Ops per second  : 940788579 op/s

-- Other function --

Total time      : 285299153 ns

Average runtime : 0.696531 ns/op

Ops per second  : 1435686000 op/s

-- Average runtime ratio --

Mine / Other's  : 1.52605

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 230689942 ns 
     Average runtime : 0.68751 ns/op 
     Ops per second  : 1454524965 op/s 
-- Other function --
     Total time      : 230424194 ns 
     Average runtime : 0.686718 ns/op 
     Ops per second  : 1456202468 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00115 
Performance tests with inputs in normal range:

-- My function --

Total time      : 276262817 ns

Average runtime : 0.689967 ns/op

Ops per second  : 1449344665 op/s

-- Other function --

Total time      : 276254435 ns

Average runtime : 0.689946 ns/op

Ops per second  : 1449388640 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00003

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 27173706 ns 
     Average runtime : 1.32814 ns/op 
     Ops per second  : 752933736 op/s 
-- Other function --
     Total time      : 19877848 ns 
     Average runtime : 0.971547 ns/op 
     Ops per second  : 1029286470 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.36703 
Performance tests with inputs in normal range:

-- My function --

Total time      : 429031495 ns

Average runtime : 0.698316 ns/op

Ops per second  : 1432016080 op/s

-- Other function --

Total time      : 421921916 ns

Average runtime : 0.686744 ns/op

Ops per second  : 1456146212 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01685

After:

Intel Core i7-13700H, Clang 18

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 62097141 ns 
     Average runtime : 1.55243 ns/op 
     Ops per second  : 644152683 op/s 
-- Other function --
     Total time      : 60779460 ns 
     Average runtime : 1.51948 ns/op 
     Ops per second  : 658117725 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.02168 
Performance tests with inputs in normal range:

-- My function --

Total time      : 59544025 ns

Average runtime : 1.4886 ns/op

Ops per second  : 671772524 op/s

-- Other function --

Total time      : 58644646 ns

Average runtime : 1.46611 ns/op

Ops per second  : 682074881 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01534
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 59527994 ns

Average runtime : 1.4882 ns/op

Ops per second  : 671953434 op/s

-- Other function --

Total time      : 59034515 ns

Average runtime : 1.47586 ns/op

Ops per second  : 677570400 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00836

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 56816731 ns 
     Average runtime : 2.77697 ns/op 
     Ops per second  : 360105195 op/s 
-- Other function --
     Total time      : 24017634 ns 
     Average runtime : 1.17388 ns/op 
     Ops per second  : 851874085 op/s 
-- Average runtime ratio --
     Mine / Other's  : 2.36563 
Performance tests with inputs in normal range:

-- My function --

Total time      : 1715490027 ns

Average runtime : 2.79223 ns/op

Ops per second  : 358136736 op/s

-- Other function --

Total time      : 724341250 ns

Average runtime : 1.17898 ns/op

Ops per second  : 848191373 op/s

-- Average runtime ratio --

Mine / Other's  : 2.36835
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 1144110540 ns

Average runtime : 2.79324 ns/op

Ops per second  : 358007365 op/s

-- Other function --

Total time      : 481854266 ns

Average runtime : 1.1764 ns/op

Ops per second  : 850049545 op/s

-- Average runtime ratio --

Mine / Other's  : 2.37439

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 395817357 ns 
     Average runtime : 1.17963 ns/op 
     Ops per second  : 847725027 op/s 
-- Other function --
     Total time      : 394913485 ns 
     Average runtime : 1.17693 ns/op 
     Ops per second  : 849665288 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00229 
Performance tests with inputs in normal range:

-- My function --

Total time      : 473284499 ns

Average runtime : 1.18203 ns/op

Ops per second  : 846002860 op/s

-- Other function --

Total time      : 473167526 ns

Average runtime : 1.18174 ns/op

Ops per second  : 846212003 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00025

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 1426902546 ns 
     Average runtime : 69.7411 ns/op 
     Ops per second  : 14338750 op/s 
-- Other function --
     Total time      : 30469837 ns 
     Average runtime : 1.48924 ns/op 
     Ops per second  : 671483736 op/s 
-- Average runtime ratio --
     Mine / Other's  : 46.83 
Performance tests with inputs in normal range:

-- My function --

Total time      : 5575118970 ns

Average runtime : 9.07438 ns/op

Ops per second  : 110200338 op/s

-- Other function --

Total time      : 903257114 ns

Average runtime : 1.47019 ns/op

Ops per second  : 680182852 op/s

-- Average runtime ratio --

Mine / Other's  : 6.17224

frexpf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 80000914 ns 
     Average runtime : 3.91011 ns/op 
     Ops per second  : 255747078 op/s 
-- Other function --
     Total time      : 27108900 ns 
     Average runtime : 1.32497 ns/op 
     Ops per second  : 754733685 op/s 
-- Average runtime ratio --
     Mine / Other's  : 2.95109 
Performance tests with inputs in normal range:

-- My function --

Total time      : 2056103899 ns

Average runtime : 3.34663 ns/op

Ops per second  : 298807857 op/s

-- Other function --

Total time      : 810701512 ns

Average runtime : 1.31954 ns/op

Ops per second  : 757837491 op/s

-- Average runtime ratio --

Mine / Other's  : 2.5362

Intel Core i7-13700H, Clang 18, -march=native

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 55348041 ns 
     Average runtime : 1.3837 ns/op 
     Ops per second  : 722700194 op/s 
-- Other function --
     Total time      : 44278352 ns 
     Average runtime : 1.10696 ns/op 
     Ops per second  : 903376891 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.25 
Performance tests with inputs in normal range:

-- My function --

Total time      : 52808434 ns

Average runtime : 1.32021 ns/op

Ops per second  : 757455523 op/s

-- Other function --

Total time      : 42239658 ns

Average runtime : 1.05599 ns/op

Ops per second  : 946978311 op/s

-- Average runtime ratio --

Mine / Other's  : 1.25021
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 53027828 ns

Average runtime : 1.32569 ns/op

Ops per second  : 754321674 op/s

-- Other function --

Total time      : 42427951 ns

Average runtime : 1.0607 ns/op

Ops per second  : 942775671 op/s

-- Average runtime ratio --

Mine / Other's  : 1.24983

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 50963855 ns 
     Average runtime : 2.4909 ns/op 
     Ops per second  : 401460996 op/s 
-- Other function --
     Total time      : 27248525 ns 
     Average runtime : 1.33179 ns/op 
     Ops per second  : 750866331 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.87033 
Performance tests with inputs in normal range:

-- My function --

Total time      : 1539221989 ns

Average runtime : 2.50533 ns/op

Ops per second  : 399149703 op/s

-- Other function --

Total time      : 811293318 ns

Average runtime : 1.32051 ns/op

Ops per second  : 757284679 op/s

-- Average runtime ratio --

Mine / Other's  : 1.89724
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 1025525545 ns

Average runtime : 2.50372 ns/op

Ops per second  : 399404970 op/s

-- Other function --

Total time      : 541923450 ns

Average runtime : 1.32306 ns/op

Ops per second  : 755826307 op/s

-- Average runtime ratio --

Mine / Other's  : 1.89238

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 452690316 ns 
     Average runtime : 1.34912 ns/op 
     Ops per second  : 741222571 op/s 
-- Other function --
     Total time      : 355827535 ns 
     Average runtime : 1.06045 ns/op 
     Ops per second  : 942996949 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.27222 
Performance tests with inputs in normal range:

-- My function --

Total time      : 531344414 ns

Average runtime : 1.32703 ns/op

Ops per second  : 753560269 op/s

-- Other function --

Total time      : 426556382 ns

Average runtime : 1.06533 ns/op

Ops per second  : 938680223 op/s

-- Average runtime ratio --

Mine / Other's  : 1.24566

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 139220211 ns 
     Average runtime : 6.80451 ns/op 
     Ops per second  : 146961420 op/s 
-- Other function --
     Total time      : 21800669 ns 
     Average runtime : 1.06553 ns/op 
     Ops per second  : 938503309 op/s 
-- Average runtime ratio --
     Mine / Other's  : 6.38605 
Performance tests with inputs in normal range:

-- My function --

Total time      : 4166112770 ns

Average runtime : 6.781 ns/op

Ops per second  : 147470804 op/s

-- Other function --

Total time      : 652684432 ns

Average runtime : 1.06235 ns/op

Ops per second  : 941312477 op/s

-- Average runtime ratio --

Mine / Other's  : 6.38304

frexpf16 (reverted due to performance regression)

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 187513075 ns 
     Average runtime : 9.16486 ns/op 
     Ops per second  : 109112391 op/s 
-- Other function --
     Total time      : 21780392 ns 
     Average runtime : 1.06454 ns/op 
     Ops per second  : 939377032 op/s 
-- Average runtime ratio --
     Mine / Other's  : 8.60926 
Performance tests with inputs in normal range:

-- My function --

Total time      : 5629542662 ns

Average runtime : 9.16297 ns/op

Ops per second  : 109134975 op/s

-- Other function --

Total time      : 651998500 ns

Average runtime : 1.06123 ns/op

Ops per second  : 942302781 op/s

-- Average runtime ratio --

Mine / Other's  : 8.63429

Intel Core i7-13700H, GCC 14

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 49416369 ns 
     Average runtime : 1.23541 ns/op 
     Ops per second  : 809449192 op/s 
-- Other function --
     Total time      : 49076624 ns 
     Average runtime : 1.22691 ns/op 
     Ops per second  : 815052803 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00692 
Performance tests with inputs in normal range:

-- My function --

Total time      : 47351820 ns

Average runtime : 1.18379 ns/op

Ops per second  : 844741342 op/s

-- Other function --

Total time      : 46668521 ns

Average runtime : 1.16671 ns/op

Ops per second  : 857109656 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01464
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 47605571 ns

Average runtime : 1.19014 ns/op

Ops per second  : 840238635 op/s

-- Other function --

Total time      : 47512878 ns

Average runtime : 1.18782 ns/op

Ops per second  : 841877858 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00195

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 30121132 ns 
     Average runtime : 1.4722 ns/op 
     Ops per second  : 679257339 op/s 
-- Other function --
     Total time      : 23940709 ns 
     Average runtime : 1.17012 ns/op 
     Ops per second  : 854611281 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.25816 
Performance tests with inputs in normal range:

-- My function --

Total time      : 905814740 ns

Average runtime : 1.47436 ns/op

Ops per second  : 678262312 op/s

-- Other function --

Total time      : 723516022 ns

Average runtime : 1.17764 ns/op

Ops per second  : 849158804 op/s

-- Average runtime ratio --

Mine / Other's  : 1.25196
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 603451496 ns

Average runtime : 1.47327 ns/op

Ops per second  : 678762092 op/s

-- Other function --

Total time      : 481690086 ns

Average runtime : 1.176 ns/op

Ops per second  : 850339278 op/s

-- Average runtime ratio --

Mine / Other's  : 1.25278

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 396575461 ns 
     Average runtime : 1.18189 ns/op 
     Ops per second  : 846104494 op/s 
-- Other function --
     Total time      : 492061257 ns 
     Average runtime : 1.46646 ns/op 
     Ops per second  : 681915666 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.805947 
Performance tests with inputs in normal range:

-- My function --

Total time      : 473030196 ns

Average runtime : 1.18139 ns/op

Ops per second  : 846457675 op/s

-- Other function --

Total time      : 589891786 ns

Average runtime : 1.47326 ns/op

Ops per second  : 678768630 op/s

-- Average runtime ratio --

Mine / Other's  : 0.801893

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 32154131 ns 
     Average runtime : 1.57156 ns/op 
     Ops per second  : 636310152 op/s 
-- Other function --
     Total time      : 31410305 ns 
     Average runtime : 1.53521 ns/op 
     Ops per second  : 651378584 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.02368 
Performance tests with inputs in normal range:

-- My function --

Total time      : 735449435 ns

Average runtime : 1.19706 ns/op

Ops per second  : 835380341 op/s

-- Other function --

Total time      : 736150324 ns

Average runtime : 1.1982 ns/op

Ops per second  : 834584975 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999048

Intel Core i7-13700H, GCC 14, -march=native

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 44267142 ns 
     Average runtime : 1.10668 ns/op 
     Ops per second  : 903605658 op/s 
-- Other function --
     Total time      : 44277859 ns 
     Average runtime : 1.10695 ns/op 
     Ops per second  : 903386950 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.999758 
Performance tests with inputs in normal range:

-- My function --

Total time      : 42239987 ns

Average runtime : 1.056 ns/op

Ops per second  : 946970935 op/s

-- Other function --

Total time      : 42232675 ns

Average runtime : 1.05582 ns/op

Ops per second  : 947134890 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00017
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 42411208 ns

Average runtime : 1.06028 ns/op

Ops per second  : 943147858 op/s

-- Other function --

Total time      : 42421727 ns

Average runtime : 1.06054 ns/op

Ops per second  : 942913993 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999752

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 27199063 ns 
     Average runtime : 1.32938 ns/op 
     Ops per second  : 752231795 op/s 
-- Other function --
     Total time      : 21738550 ns 
     Average runtime : 1.06249 ns/op 
     Ops per second  : 941185129 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.25119 
Performance tests with inputs in normal range:

-- My function --

Total time      : 811386500 ns

Average runtime : 1.32066 ns/op

Ops per second  : 757197710 op/s

-- Other function --

Total time      : 648659588 ns

Average runtime : 1.0558 ns/op

Ops per second  : 947153193 op/s

-- Average runtime ratio --

Mine / Other's  : 1.25087
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 540747255 ns

Average runtime : 1.32018 ns/op

Ops per second  : 757470326 op/s

-- Other function --

Total time      : 432491178 ns

Average runtime : 1.05589 ns/op

Ops per second  : 947071341 op/s

-- Average runtime ratio --

Mine / Other's  : 1.25031

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 355877642 ns 
     Average runtime : 1.0606 ns/op 
     Ops per second  : 942864176 op/s 
-- Other function --
     Total time      : 444613645 ns 
     Average runtime : 1.32505 ns/op 
     Ops per second  : 754687319 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.80042 
Performance tests with inputs in normal range:

-- My function --

Total time      : 426754590 ns

Average runtime : 1.06582 ns/op

Ops per second  : 938244249 op/s

-- Other function --

Total time      : 530960760 ns

Average runtime : 1.32608 ns/op

Ops per second  : 754104766 op/s

-- Average runtime ratio --

Mine / Other's  : 0.80374

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 27393273 ns 
     Average runtime : 1.33887 ns/op 
     Ops per second  : 746898700 op/s 
-- Other function --
     Total time      : 27358359 ns 
     Average runtime : 1.33716 ns/op 
     Ops per second  : 747851872 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00128 
Performance tests with inputs in normal range:

-- My function --

Total time      : 648843685 ns

Average runtime : 1.0561 ns/op

Ops per second  : 946884456 op/s

-- Other function --

Total time      : 648962910 ns

Average runtime : 1.05629 ns/op

Ops per second  : 946710498 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999816

Google Tensor G3, Clang 17

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 45736368 ns 
     Average runtime : 1.14341 ns/op 
     Ops per second  : 874578409 op/s 
-- Other function --
     Total time      : 54215739 ns 
     Average runtime : 1.35539 ns/op 
     Ops per second  : 737793871 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.843599 
Performance tests with inputs in normal range:

-- My function --

Total time      : 44450887 ns

Average runtime : 1.11127 ns/op

Ops per second  : 899870457 op/s

-- Other function --

Total time      : 50862101 ns

Average runtime : 1.27155 ns/op

Ops per second  : 786440969 op/s

-- Average runtime ratio --

Mine / Other's  : 0.873949
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 43714844 ns

Average runtime : 1.09287 ns/op

Ops per second  : 915021908 op/s

-- Other function --

Total time      : 50984497 ns

Average runtime : 1.27461 ns/op

Ops per second  : 784552998 op/s

-- Average runtime ratio --

Mine / Other's  : 0.857414

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 14234742 ns 
     Average runtime : 0.695735 ns/op 
     Ops per second  : 1437328474 op/s 
-- Other function --
     Total time      : 14231975 ns 
     Average runtime : 0.6956 ns/op 
     Ops per second  : 1437607921 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.00019 
Performance tests with inputs in normal range:

-- My function --

Total time      : 435857219 ns

Average runtime : 0.709426 ns/op

Ops per second  : 1409590052 op/s

-- Other function --

Total time      : 428307170 ns

Average runtime : 0.697137 ns/op

Ops per second  : 1434437812 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01763
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 291204752 ns

Average runtime : 0.710949 ns/op

Ops per second  : 1406570453 op/s

-- Other function --

Total time      : 284953044 ns

Average runtime : 0.695686 ns/op

Ops per second  : 1437429810 op/s

-- Average runtime ratio --

Mine / Other's  : 1.02194

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 356190145 ns 
     Average runtime : 1.06153 ns/op 
     Ops per second  : 942036956 op/s 
-- Other function --
     Total time      : 426026245 ns 
     Average runtime : 1.26966 ns/op 
     Ops per second  : 787614105 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.836076 
Performance tests with inputs in normal range:

-- My function --

Total time      : 427515218 ns

Average runtime : 1.06772 ns/op

Ops per second  : 936574940 op/s

-- Other function --

Total time      : 511079712 ns

Average runtime : 1.27642 ns/op

Ops per second  : 783439511 op/s

-- Average runtime ratio --

Mine / Other's  : 0.836494

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 28205566 ns 
     Average runtime : 1.37857 ns/op 
     Ops per second  : 725388740 op/s 
-- Other function --
     Total time      : 19414795 ns 
     Average runtime : 0.948915 ns/op 
     Ops per second  : 1053835489 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.45279 
Performance tests with inputs in normal range:

-- My function --

Total time      : 428392985 ns

Average runtime : 0.697277 ns/op

Ops per second  : 1434150468 op/s

-- Other function --

Total time      : 422012940 ns

Average runtime : 0.686892 ns/op

Ops per second  : 1455832136 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01512

Google Tensor G3, Clang 17, -mcpu=cortex-x3

copysignf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 29511556 ns 
     Average runtime : 0.737788 ns/op 
     Ops per second  : 1355402609 op/s 
-- Other function --
     Total time      : 29574626 ns 
     Average runtime : 0.739365 ns/op 
     Ops per second  : 1352512116 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.997867 
Performance tests with inputs in normal range:

-- My function --

Total time      : 28641398 ns

Average runtime : 0.716034 ns/op

Ops per second  : 1396581270 op/s

-- Other function --

Total time      : 28091267 ns

Average runtime : 0.702281 ns/op

Ops per second  : 1423931501 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01958
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 28293619 ns

Average runtime : 0.70734 ns/op

Ops per second  : 1413747742 op/s

-- Other function --

Total time      : 28212118 ns

Average runtime : 0.705302 ns/op

Ops per second  : 1417831869 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00289

copysignf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 14233439 ns 
     Average runtime : 0.695672 ns/op 
     Ops per second  : 1437460054 op/s 
-- Other function --
     Total time      : 14462281 ns 
     Average runtime : 0.706856 ns/op 
     Ops per second  : 1414714594 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.984177 
Performance tests with inputs in normal range:

-- My function --

Total time      : 432031413 ns

Average runtime : 0.703199 ns/op

Ops per second  : 1422072519 op/s

-- Other function --

Total time      : 431731568 ns

Average runtime : 0.702711 ns/op

Ops per second  : 1423060173 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00069
Performance tests with inputs in normal range with exponents close to each other:

-- My function --

Total time      : 287640909 ns

Average runtime : 0.702248 ns/op

Ops per second  : 1423997724 op/s

-- Other function --

Total time      : 287166260 ns

Average runtime : 0.70109 ns/op

Ops per second  : 1426351410 op/s

-- Average runtime ratio --

Mine / Other's  : 1.00165

fabsf

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 230337158 ns 
     Average runtime : 0.686458 ns/op 
     Ops per second  : 1456752713 op/s 
-- Other function --
     Total time      : 230388184 ns 
     Average runtime : 0.68661 ns/op 
     Ops per second  : 1456430074 op/s 
-- Average runtime ratio --
     Mine / Other's  : 0.999779 
Performance tests with inputs in normal range:

-- My function --

Total time      : 275919353 ns

Average runtime : 0.689109 ns/op

Ops per second  : 1451148807 op/s

-- Other function --

Total time      : 276016398 ns

Average runtime : 0.689352 ns/op

Ops per second  : 1450638595 op/s

-- Average runtime ratio --

Mine / Other's  : 0.999648

fabsf16

 Performance tests with inputs in denormal range:
-- My function --
     Total time      : 26601847 ns 
     Average runtime : 1.30019 ns/op 
     Ops per second  : 769119527 op/s 
-- Other function --
     Total time      : 20337443 ns 
     Average runtime : 0.99401 ns/op 
     Ops per second  : 1006026175 op/s 
-- Average runtime ratio --
     Mine / Other's  : 1.30802 
Performance tests with inputs in normal range:

-- My function --

Total time      : 429470174 ns

Average runtime : 0.69903 ns/op

Ops per second  : 1430553358 op/s

-- Other function --

Total time      : 422461303 ns

Average runtime : 0.687622 ns/op

Ops per second  : 1454287045 op/s

-- Average runtime ratio --

Mine / Other's  : 1.01659

libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp

jhuber6 · 2024-07-16T14:48:42Z

Just FYI, if we get this working in general we can probably remove a lot of GPU implementations that are just __builtin_thing.

…able Split checks for __builtin_{fmax,fmin}f16 into separate feature.

overmighty · 2024-07-17T11:20:53Z

For some reason, on x86, __builtin_{fmaximum_num,fminimum_num}{,f} return signaling NaNs in cases where we expect quiet NaNs.

[862/1780] Running unit test libc.test.src.math.smoke.fmaximum_numf_test.__unit__
FAILED: projects/libc/test/src/math/smoke/CMakeFiles/libc.test.src.math.smoke.fmaximum_numf_test.__unit__ /tmp/llvm-build-overlay-release-native/projects/libc/test/src/math/smoke/CMakeFiles/libc.test.src.math.smoke.fmaximum_numf_test.__unit__
cd /tmp/llvm-build-overlay-release-native/projects/libc/test/src/math/smoke && /tmp/llvm-build-overlay-release-native/projects/libc/test/src/math/smoke/libc.test.src.math.smoke.fmaximum_numf_test.__unit__.__build__
[==========] Running 5 tests from 1 test suite.
[ RUN      ] LlvmLibcFMaximumNumTest.NaN
/home/overmighty/projects/llvm-project/libc/test/src/math/smoke/FMaximumNumTest.h:42: FAILURE
      Expected: FPBits(aNaN).uintval()
      Which is: 2143289344
To be equal to: FPBits(func(aNaN, sNaN)).uintval()
      Which is: 2141192192
/home/overmighty/projects/llvm-project/libc/test/src/math/smoke/FMaximumNumTest.h:44: FAILURE
      Expected: FPBits(aNaN).uintval()
      Which is: 2143289344
To be equal to: FPBits(func(sNaN, sNaN)).uintval()
      Which is: 2141192192
[  FAILED  ] LlvmLibcFMaximumNumTest.NaN
[ RUN      ] LlvmLibcFMaximumNumTest.InfArg
[       OK ] LlvmLibcFMaximumNumTest.InfArg (0 ns)
[ RUN      ] LlvmLibcFMaximumNumTest.NegInfArg
[       OK ] LlvmLibcFMaximumNumTest.NegInfArg (0 ns)
[ RUN      ] LlvmLibcFMaximumNumTest.BothZero

>>> f"{2141192192:>032b}"
'01111111101000000000000000000000'
>>>

Code generated by Clang without -march=native:

0000000000004dc0 <__llvm_libc_19_0_0_git::fmaximum_numf(float, float)>:
    4dc0: 55                            push    rbp
    4dc1: 48 89 e5                      mov     rbp, rsp
    4dc4: 0f 28 d0                      movaps  xmm2, xmm0
    4dc7: f3 0f c2 d0 03                cmpunordss      xmm2, xmm0
    4dcc: 0f 28 da                      movaps  xmm3, xmm2
    4dcf: 0f 54 d9                      andps   xmm3, xmm1
    4dd2: f3 0f 5f c8                   maxss   xmm1, xmm0
    4dd6: 0f 55 d1                      andnps  xmm2, xmm1
    4dd9: 0f 56 d3                      orps    xmm2, xmm3
    4ddc: 0f 28 c2                      movaps  xmm0, xmm2
    4ddf: 5d                            pop     rbp
    4de0: c3                            ret

Code generated by Clang with -march=native on Intel Raptor Lake:

0000000000004d50 <__llvm_libc_19_0_0_git::fmaximum_numf(float, float)>:
    4d50: 55                            push    rbp
    4d51: 48 89 e5                      mov     rbp, rsp
    4d54: c5 f2 5f d0                   vmaxss  xmm2, xmm1, xmm0
    4d58: c5 fa c2 c0 03                vcmpunordss     xmm0, xmm0, xmm0
    4d5d: c4 e3 69 4a c1 00             vblendvps       xmm0, xmm2, xmm1, xmm0
    4d63: 5d                            pop     rbp
    4d64: c3                            ret

overmighty · 2024-07-17T11:33:46Z

The problem is that maxss and minss don't behave the same as IEEE 754 maximumNumber and minimumNumber. If one or both inputs are NaNs, maxss and minss just return the second input unchanged (so a signaling NaN stays signaling), whereas maximumNumber and minimumNumber return the non-NaN input if exactly one input is a NaN, and a quiet NaN if both inputs are NaNs, even when both are signaling NaNs.

lntue · 2024-07-18T16:46:28Z

The problem is that maxss and minss don't behave the same as IEEE 754 maximumNumber and minimumNumber. If one or both inputs are NaNs, maxss and minss just return the second input unchanged (so a signaling NaN stays signaling), whereas maximumNumber and minimumNumber return the non-NaN input if exactly one input is a NaN, and a quiet NaN if both inputs are NaNs, even when both are signaling NaNs.

Let's skip the builtin / hardware instructions for these functions.

…able Revert "Split checks for __builtin_{fmax,fmin}f16 into separate feature."

…able Remove support for __builtin_{fmax,fmin}* due to incorrect results on x86-64 and AArch64.

…able Remove benchmark for frexpf16.

…able Disable __builtin_fabsf16 on GCC for x86.

libc/src/math/generic/CMakeLists.txt

libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h

libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h

lntue

LGTM with a few nits.

… when available (#99037) Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251098

overmighty added 2 commits July 16, 2024 15:21

[libc][math] Optimize misc basic ops using builtins when available

9554a0b

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

c422169

…able Remove support for __builtin_frexpf16 as it decreases performance.

overmighty requested a review from lntue July 16, 2024 13:23

llvmbot added the libc label Jul 16, 2024

lntue reviewed Jul 16, 2024

View reviewed changes

libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp Outdated Show resolved Hide resolved

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

29b47b2

…able Split checks for __builtin_{fmax,fmin}f16 into separate feature.

overmighty added 4 commits July 18, 2024 19:54

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

dda59ea

…able Revert "Split checks for __builtin_{fmax,fmin}f16 into separate feature."

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

d405f8b

…able Remove support for __builtin_{fmax,fmin}* due to incorrect results on x86-64 and AArch64.

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

147b327

…able Remove benchmark for frexpf16.

fixup! [libc][math] Optimize misc basic ops using builtins when avail…

9b85659

…able Disable __builtin_fabsf16 on GCC for x86.

overmighty changed the title ~~[libc][math] Optimize misc basic math operations with builtins when available~~ [libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available Jul 22, 2024

overmighty requested a review from lntue July 22, 2024 15:18

lntue reviewed Jul 22, 2024

View reviewed changes

libc/src/math/generic/CMakeLists.txt Outdated Show resolved Hide resolved

lntue reviewed Jul 22, 2024

View reviewed changes

libc/src/math/generic/CMakeLists.txt Outdated Show resolved Hide resolved

lntue reviewed Jul 22, 2024

View reviewed changes

libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h Outdated Show resolved Hide resolved

overmighty added 2 commits July 22, 2024 17:26

[libc][math] Change fabs* functions' compile options from -O2 to -O3

2189647

[libc][math] Switch from <algorithm> to CPP/algorithm.h in perf tests

0a0818e

lntue reviewed Jul 22, 2024

View reviewed changes

libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h Outdated Show resolved Hide resolved

lntue approved these changes Jul 22, 2024

View reviewed changes

overmighty merged commit 70843bf into llvm:main Jul 22, 2024
6 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

Uh oh!

overmighty commented Jul 16, 2024

Uh oh!

llvmbot commented Jul 16, 2024

Uh oh!

overmighty commented Jul 16, 2024

Uh oh!

overmighty commented Jul 16, 2024 •

edited

Loading

Uh oh!

Uh oh!

jhuber6 commented Jul 16, 2024

Uh oh!

overmighty commented Jul 17, 2024

Uh oh!

overmighty commented Jul 17, 2024 •

edited

Loading

Uh oh!

lntue commented Jul 18, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

lntue left a comment

Uh oh!

Uh oh!

Uh oh!

[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

Uh oh!

Conversation

overmighty commented Jul 16, 2024

Uh oh!

llvmbot commented Jul 16, 2024

Uh oh!

overmighty commented Jul 16, 2024

Uh oh!

overmighty commented Jul 16, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

jhuber6 commented Jul 16, 2024

Uh oh!

overmighty commented Jul 17, 2024

Uh oh!

overmighty commented Jul 17, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

lntue commented Jul 18, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

lntue left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

overmighty commented Jul 16, 2024 •

edited

Loading

overmighty commented Jul 17, 2024 •

edited

Loading