Optimize visibilitymap_count() with AVX-512 instructions.
author: Nathan Bossart <[email protected]>
Sun, 7 Apr 2024 03:58:23 +0000 (22:58 -0500)
committer: Nathan Bossart <[email protected]>
Sun, 7 Apr 2024 03:58:23 +0000 (22:58 -0500)
Commit 792752af4e added infrastructure for using AVX-512 intrinsic
functions, and this commit uses that infrastructure to optimize
visibilitymap_count().  Specifically, a new pg_popcount_masked()
function is introduced that applies a bitmask to every byte in the
buffer prior to calculating the population count, which is used to
filter out the all-visible or all-frozen bits as needed.  Platforms
without AVX-512 support should also see a nice speedup due to the
reduced number of calls to a function pointer.

Co-authored-by: Ants Aasma
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com

src/backend/access/heap/visibilitymap.c
src/include/port/pg_bitutils.h
src/port/pg_bitutils.c
src/port/pg_popcount_avx512.c

index 1ab6c865e3c226fb1d4234a84fb03e8a86caf4ac..8b24e7bc33cf6153fffd88a65e2e4c2b00321258 100644 (file)
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each
-                                                        * bit pair */
-#define FROZEN_MASK64  UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
-                                                        * bit pair */
+#define VISIBLE_MASK8  (0x55)  /* The lower bit of each bit pair */
+#define FROZEN_MASK8   (0xaa)  /* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
    {
        Buffer      mapBuffer;
        uint64     *map;
-       int         i;
 
        /*
         * Read till we fall off the end of the map.  We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
         */
        map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-       StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-                        "unsupported MAPSIZE");
-       if (all_frozen == NULL)
-       {
-           for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-               nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-       }
-       else
-       {
-           for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-           {
-               nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-               nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-           }
-       }
+       nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+       if (all_frozen)
+           nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
        ReleaseBuffer(mapBuffer);
    }
index b453f84d8f154f0a602f3ccebea7012c7dc0e0e7..4d88478c9c24d78da7e3069d06837e0692cc679d 100644 (file)
@@ -303,6 +303,7 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX-512 popcount instruction on some systems.
@@ -313,6 +314,7 @@ extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
@@ -320,6 +322,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern int pg_popcount32(uint32 word);
 extern int pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
 
 #endif                         /* TRY_POPCNT_FAST */
 
@@ -357,6 +360,37 @@ pg_popcount(const char *buf, int bytes)
    return pg_popcount_optimized(buf, bytes);
 }
 
+/*
+ * Returns the number of 1-bits in buf after applying the mask to each byte.
+ *
+ * Similar to pg_popcount(), we only take on the function pointer overhead when
+ * it's likely to be faster.
+ */
+static inline uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+   /*
+    * We set the threshold to the point at which we'll first use special
+    * instructions in the optimized version.
+    */
+#if SIZEOF_VOID_P >= 8
+   int         threshold = 8;
+#else
+   int         threshold = 4;
+#endif
+
+   if (bytes < threshold)
+   {
+       uint64      popcnt = 0;
+
+       while (bytes--)
+           popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+       return popcnt;
+   }
+
+   return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */
index 411be90f734b12fdbf984792115068c19e8e1fdd..87f56e82b80105f95c3b392ce85020c18367e345 100644 (file)
@@ -106,19 +106,23 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int pg_popcount32_choose(uint32 word);
 static int pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 static uint64 pg_popcount_fast(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 int            (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int            (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64     (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64     (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif                         /* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -156,17 +160,22 @@ choose_popcount_functions(void)
        pg_popcount32 = pg_popcount32_fast;
        pg_popcount64 = pg_popcount64_fast;
        pg_popcount_optimized = pg_popcount_fast;
+       pg_popcount_masked_optimized = pg_popcount_masked_fast;
    }
    else
    {
        pg_popcount32 = pg_popcount32_slow;
        pg_popcount64 = pg_popcount64_slow;
        pg_popcount_optimized = pg_popcount_slow;
+       pg_popcount_masked_optimized = pg_popcount_masked_slow;
    }
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
    if (pg_popcount_avx512_available())
+   {
        pg_popcount_optimized = pg_popcount_avx512;
+       pg_popcount_masked_optimized = pg_popcount_masked_avx512;
+   }
 #endif
 }
 
@@ -191,6 +200,13 @@ pg_popcount_choose(const char *buf, int bytes)
    return pg_popcount_optimized(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+   choose_popcount_functions();
+   return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *     Return the number of 1 bits set in word
@@ -271,6 +287,56 @@ pg_popcount_fast(const char *buf, int bytes)
    return popcnt;
 }
 
+/*
+ * pg_popcount_masked_fast
+ *     Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+   uint64      popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+   /* Process in 64-bit chunks if the buffer is aligned */
+   uint64      maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+   if (buf == (const char *) TYPEALIGN(8, buf))
+   {
+       const uint64 *words = (const uint64 *) buf;
+
+       while (bytes >= 8)
+       {
+           popcnt += pg_popcount64_fast(*words++ & maskv);
+           bytes -= 8;
+       }
+
+       buf = (const char *) words;
+   }
+#else
+   /* Process in 32-bit chunks if the buffer is aligned. */
+   uint32      maskv = ~((uint32) 0) / 0xFF * mask;
+
+   if (buf == (const char *) TYPEALIGN(4, buf))
+   {
+       const uint32 *words = (const uint32 *) buf;
+
+       while (bytes >= 4)
+       {
+           popcnt += pg_popcount32_fast(*words++ & maskv);
+           bytes -= 4;
+       }
+
+       buf = (const char *) words;
+   }
+#endif
+
+   /* Process any remaining bytes */
+   while (bytes--)
+       popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+   return popcnt;
+}
+
 #endif                         /* TRY_POPCNT_FAST */
 
 
@@ -370,6 +436,56 @@ pg_popcount_slow(const char *buf, int bytes)
    return popcnt;
 }
 
+/*
+ * pg_popcount_masked_slow
+ *     Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+   uint64      popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+   /* Process in 64-bit chunks if the buffer is aligned */
+   uint64      maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+   if (buf == (const char *) TYPEALIGN(8, buf))
+   {
+       const uint64 *words = (const uint64 *) buf;
+
+       while (bytes >= 8)
+       {
+           popcnt += pg_popcount64_slow(*words++ & maskv);
+           bytes -= 8;
+       }
+
+       buf = (const char *) words;
+   }
+#else
+   /* Process in 32-bit chunks if the buffer is aligned. */
+   uint32      maskv = ~((uint32) 0) / 0xFF * mask;
+
+   if (buf == (const char *) TYPEALIGN(4, buf))
+   {
+       const uint32 *words = (const uint32 *) buf;
+
+       while (bytes >= 4)
+       {
+           popcnt += pg_popcount32_slow(*words++ & maskv);
+           bytes -= 4;
+       }
+
+       buf = (const char *) words;
+   }
+#endif
+
+   /* Process any remaining bytes */
+   while (bytes--)
+       popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+   return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -401,4 +517,14 @@ pg_popcount_optimized(const char *buf, int bytes)
    return pg_popcount_slow(buf, bytes);
 }
 
+/*
+ * pg_popcount_masked_optimized
+ *     Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+   return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif                         /* !TRY_POPCNT_FAST */
index 908817617ac2a4655f58c5753102ae837ae19d13..a67092b9c47e2cb9a975e8236ccab564aa7d7680 100644 (file)
@@ -78,4 +78,64 @@ pg_popcount_avx512(const char *buf, int bytes)
    return _mm512_reduce_add_epi64(accum);
 }
 
+/*
+ * pg_popcount_masked_avx512
+ *     Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+   __m512i     val,
+               vmasked,
+               cnt;
+   __m512i     accum = _mm512_setzero_si512();
+   const char *final;
+   int         tail_idx;
+   __mmask64   bmask = ~UINT64CONST(0);
+   const       __m512i maskv = _mm512_set1_epi8(mask);
+
+   /*
+    * Align buffer down to avoid double load overhead from unaligned access.
+    * Calculate a mask to ignore preceding bytes.  Find start offset of final
+    * iteration and ensure it is not empty.
+    */
+   bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
+   tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
+   final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
+   buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
+
+   /*
+    * Iterate through all but the final iteration.  Starting from the second
+    * iteration, the mask is ignored.
+    */
+   if (buf < final)
+   {
+       val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+       vmasked = _mm512_and_si512(val, maskv);
+       cnt = _mm512_popcnt_epi64(vmasked);
+       accum = _mm512_add_epi64(accum, cnt);
+
+       buf += sizeof(__m512i);
+       bmask = ~UINT64CONST(0);
+
+       for (; buf < final; buf += sizeof(__m512i))
+       {
+           val = _mm512_load_si512((const __m512i *) buf);
+           vmasked = _mm512_and_si512(val, maskv);
+           cnt = _mm512_popcnt_epi64(vmasked);
+           accum = _mm512_add_epi64(accum, cnt);
+       }
+   }
+
+   /* Final iteration needs to ignore bytes that are not within the length */
+   bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
+
+   val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+   vmasked = _mm512_and_si512(val, maskv);
+   cnt = _mm512_popcnt_epi64(vmasked);
+   accum = _mm512_add_epi64(accum, cnt);
+
+   return _mm512_reduce_add_epi64(accum);
+}
+
 #endif                         /* TRY_POPCNT_FAST */