From cfc5e9fe77f96225ec67a044377b10113c98ce0d Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Tue, 2 Apr 2024 15:54:49 -0500
Subject: [PATCH v22 1/3] inline pg_popcount for small numbers of bytes

---
 src/include/port/pg_bitutils.h | 34 ++++++++++++++++++++++++++++++++--
 src/port/pg_bitutils.c         | 12 ++++++------
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..1f487a4bc3 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -302,16 +302,46 @@ pg_ceil_log2_64(uint64 num)
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
-extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
 
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
-extern uint64 pg_popcount(const char *buf, int bytes);
+extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 
 #endif							/* TRY_POPCNT_FAST */
 
+/*
+ * Returns the number of 1-bits in buf.
+ *
+ * If there aren't many bytes to process, the function call overhead of the
+ * optimized versions isn't worth taking, so we inline a loop that consults
+ * pg_number_of_ones in that case.  If there are many bytes to process, we
+ * accept the function call overhead because the optimized versions are likely
+ * to be faster.
+ */
+static inline uint64
+pg_popcount(const char *buf, int bytes)
+{
+	/*
+	 * We use 8 bytes as the threshold because that's where we'll first use
+	 * special instructions on 64-bit systems.  A threshold of 4 bytes might
+	 * make more sense on 32-bit systems, but it seems unlikely to make a
+	 * tremendous difference.
+	 */
+	if (bytes < 8)
+	{
+		uint64		popcnt = 0;
+
+		while (bytes--)
+			popcnt += pg_number_of_ones[(unsigned char) *buf++];
+		return popcnt;
+	}
+
+	return pg_popcount_optimized(buf, bytes);
+}
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 28312f3dd9..6271acea60 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -118,7 +118,7 @@ static uint64 pg_popcount_fast(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -155,13 +155,13 @@ choose_popcount_functions(void)
 	{
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
+		pg_popcount_optimized = pg_popcount_fast;
 	}
 	else
 	{
 		pg_popcount32 = pg_popcount32_slow;
 		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
+		pg_popcount_optimized = pg_popcount_slow;
 	}
 }
 
@@ -183,7 +183,7 @@ static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
 	choose_popcount_functions();
-	return pg_popcount(buf, bytes);
+	return pg_popcount_optimized(buf, bytes);
 }
 
 /*
@@ -387,11 +387,11 @@ pg_popcount64(uint64 word)
 }
 
 /*
- * pg_popcount
+ * pg_popcount_optimized
  *		Returns the number of 1-bits in buf
  */
 uint64
-pg_popcount(const char *buf, int bytes)
+pg_popcount_optimized(const char *buf, int bytes)
 {
 	return pg_popcount_slow(buf, bytes);
 }
-- 
2.25.1

