Delay extraction of TIDBitmap per page offsets
author: Melanie Plageman <[email protected]>
Mon, 24 Feb 2025 21:07:55 +0000 (16:07 -0500)
committer: Melanie Plageman <[email protected]>
Mon, 24 Feb 2025 21:10:19 +0000 (16:10 -0500)
Pages from the bitmap created by the TIDBitmap API can be exact or
lossy. The TIDBitmap API extracts the tuple offsets from exact pages
into an array for the convenience of the caller.

This was done in tbm_private|shared_iterate() right after advancing the
iterator. However, as long as tbm_private|shared_iterate() sets a
reference to the PagetableEntry in the TBMIterateResult, the offset
extraction can be done later.

Waiting to extract the tuple offsets has a few benefits. For the shared
iterator case, it allows us to extract the offsets after dropping the
shared iterator state lock, reducing time spent holding a contended
lock.

Separating the iteration step and extracting the offsets later also
allows us to avoid extracting the offsets for prefetched blocks. Those
offsets were never used, so the overhead of extracting and storing them
was wasted.

The real motivation for this change, however, is that future commits
will make bitmap heap scan use the read stream API. This requires a
TBMIterateResult per issued block. By removing the array of tuple
offsets from the TBMIterateResult and only extracting the offsets when
they are used, we reduce the memory required for per buffer data
substantially.

Suggested-by: Thomas Munro <[email protected]>
Reviewed-by: Thomas Munro <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKGLHbKP3jwJ6_%2BhnGi37Pw3BD5j2amjV3oSk7j-KyCnY7Q%40mail.gmail.com

src/backend/access/gin/ginget.c
src/backend/access/gin/ginscan.c
src/backend/access/heap/heapam_handler.c
src/backend/nodes/tidbitmap.c
src/include/access/gin_private.h
src/include/nodes/tidbitmap.h

index 54aecc1d1f1dbbaa4b8ad863d358c88ace64bcbc..f3b19d280c3ba80a8bc530fa0e126c1965ef73da 100644 (file)
@@ -333,6 +333,7 @@ restartScanEntry:
    entry->nlist = 0;
    entry->matchBitmap = NULL;
    entry->matchResult = NULL;
+   entry->matchNtuples = -1;
    entry->reduceResult = false;
    entry->predictNumberResult = 0;
 
@@ -828,7 +829,7 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
             */
            while (entry->matchResult == NULL ||
                   (!entry->matchResult->lossy &&
-                   entry->offset >= entry->matchResult->ntuples) ||
+                   entry->offset >= entry->matchNtuples) ||
                   entry->matchResult->blockno < advancePastBlk ||
                   (ItemPointerIsLossyPage(&advancePast) &&
                    entry->matchResult->blockno == advancePastBlk))
@@ -845,9 +846,15 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
                    break;
                }
 
+               /* Exact pages need their tuple offsets extracted. */
+               if (!entry->matchResult->lossy)
+                   entry->matchNtuples = tbm_extract_page_tuple(entry->matchResult,
+                                                                entry->matchOffsets,
+                                                                TBM_MAX_TUPLES_PER_PAGE);
+
                /*
                 * Reset counter to the beginning of entry->matchResult. Note:
-                * entry->offset is still greater than matchResult->ntuples if
+                * entry->offset is still greater than entry->matchNtuples if
                 * matchResult is lossy.  So, on next call we will get next
                 * result from TIDBitmap.
                 */
@@ -874,32 +881,35 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
            }
 
            /*
-            * Not a lossy page. Skip over any offsets <= advancePast, and
-            * return that.
+            * Not a lossy page. If tuple offsets were extracted,
+            * entry->matchNtuples must be > -1
             */
+           Assert(entry->matchNtuples > -1);
+
+           /* Skip over any offsets <= advancePast, and return that. */
            if (entry->matchResult->blockno == advancePastBlk)
            {
-               Assert(entry->matchResult->ntuples > 0);
+               Assert(entry->matchNtuples > 0);
 
                /*
                 * First, do a quick check against the last offset on the
                 * page. If that's > advancePast, so are all the other
                 * offsets, so just go back to the top to get the next page.
                 */
-               if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
+               if (entry->matchOffsets[entry->matchNtuples - 1] <= advancePastOff)
                {
-                   entry->offset = entry->matchResult->ntuples;
+                   entry->offset = entry->matchNtuples;
                    continue;
                }
 
                /* Otherwise scan to find the first item > advancePast */
-               while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+               while (entry->matchOffsets[entry->offset] <= advancePastOff)
                    entry->offset++;
            }
 
            ItemPointerSet(&entry->curItem,
                           entry->matchResult->blockno,
-                          entry->matchResult->offsets[entry->offset]);
+                          entry->matchOffsets[entry->offset]);
            entry->offset++;
 
            /* Done unless we need to reduce the result */
index 7d1e6615260e95b298af4b194edde68b9dade1a0..63ded6301e25f2cec6908ab056cd6df0e3501a8c 100644 (file)
@@ -107,6 +107,7 @@ ginFillScanEntry(GinScanOpaque so, OffsetNumber attnum,
    scanEntry->matchBitmap = NULL;
    scanEntry->matchIterator = NULL;
    scanEntry->matchResult = NULL;
+   scanEntry->matchNtuples = -1;
    scanEntry->list = NULL;
    scanEntry->nlist = 0;
    scanEntry->offset = InvalidOffsetNumber;
index 269d581c2ec5013c9b525732e262202b1aa72776..e78682c3cef214926d13f0a609e88d01d11f5dbc 100644 (file)
@@ -2127,6 +2127,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
    Snapshot    snapshot;
    int         ntup;
    TBMIterateResult *tbmres;
+   OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE];
+   int         noffsets = -1;
 
    Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN);
 
@@ -2145,6 +2147,11 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
        if (tbmres == NULL)
            return false;
 
+       /* Exact pages need their tuple offsets extracted. */
+       if (!tbmres->lossy)
+           noffsets = tbm_extract_page_tuple(tbmres, offsets,
+                                             TBM_MAX_TUPLES_PER_PAGE);
+
        /*
         * Ignore any claimed entries past what we think is the end of the
         * relation. It may have been extended after the start of our scan (we
@@ -2172,8 +2179,9 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
        /* can't be lossy in the skip_fetch case */
        Assert(!tbmres->lossy);
        Assert(bscan->rs_empty_tuples_pending >= 0);
+       Assert(noffsets > -1);
 
-       bscan->rs_empty_tuples_pending += tbmres->ntuples;
+       bscan->rs_empty_tuples_pending += noffsets;
 
        return true;
    }
@@ -2216,9 +2224,12 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
         */
        int         curslot;
 
-       for (curslot = 0; curslot < tbmres->ntuples; curslot++)
+       /* We must have extracted the tuple offsets by now */
+       Assert(noffsets > -1);
+
+       for (curslot = 0; curslot < noffsets; curslot++)
        {
-           OffsetNumber offnum = tbmres->offsets[curslot];
+           OffsetNumber offnum = offsets[curslot];
            ItemPointerData tid;
            HeapTupleData heapTuple;
 
index 3e0bca651f56f3b72e51e1c0c091af77d5712b82..3d835024caa23dd86d8d792bac1c5840ad3ad096 100644 (file)
@@ -40,7 +40,6 @@
 
 #include <limits.h>
 
-#include "access/htup_details.h"
 #include "common/hashfn.h"
 #include "common/int.h"
 #include "nodes/bitmapset.h"
 #include "storage/lwlock.h"
 #include "utils/dsa.h"
 
-/*
- * The maximum number of tuples per page is not large (typically 256 with
- * 8K pages, or 1024 with 32K pages).  So there's not much point in making
- * the per-page bitmaps variable size.  We just legislate that the size
- * is this:
- */
-#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
-
 /*
  * When we have to switch over to lossy storage, we use a data structure
  * with one bit per page, where all pages having the same number DIV
@@ -67,7 +58,7 @@
  * table, using identical data structures.  (This is because the memory
  * management for hashtables doesn't easily/efficiently allow space to be
  * transferred easily from one hashtable to another.)  Therefore it's best
- * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
+ * if PAGES_PER_CHUNK is the same as TBM_MAX_TUPLES_PER_PAGE, or at least not
  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
  * avoid expensive integer remainder operations.  So, define it like this:
  */
@@ -79,7 +70,7 @@
 #define BITNUM(x)  ((x) % BITS_PER_BITMAPWORD)
 
 /* number of active words for an exact page: */
-#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
+#define WORDS_PER_PAGE ((TBM_MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
 /* number of active words for a lossy chunk: */
 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
 
@@ -181,7 +172,7 @@ struct TBMPrivateIterator
    int         spageptr;       /* next spages index */
    int         schunkptr;      /* next schunks index */
    int         schunkbit;      /* next bit to check in current schunk */
-   TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
+   TBMIterateResult output;
 };
 
 /*
@@ -222,7 +213,7 @@ struct TBMSharedIterator
    PTEntryArray *ptbase;       /* pagetable element array */
    PTIterationArray *ptpages;  /* sorted exact page index list */
    PTIterationArray *ptchunks; /* sorted lossy page index list */
-   TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
+   TBMIterateResult output;
 };
 
 /* Local function prototypes */
@@ -390,7 +381,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
                    bitnum;
 
        /* safety check to ensure we don't overrun bit array bounds */
-       if (off < 1 || off > MAX_TUPLES_PER_PAGE)
+       if (off < 1 || off > TBM_MAX_TUPLES_PER_PAGE)
            elog(ERROR, "tuple offset out of range: %u", off);
 
        /*
@@ -696,9 +687,7 @@ tbm_begin_private_iterate(TIDBitmap *tbm)
     * Create the TBMPrivateIterator struct, with enough trailing space to
     * serve the needs of the TBMIterateResult sub-struct.
     */
-   iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator) +
-                                            MAX_TUPLES_PER_PAGE *
-                                            sizeof(OffsetNumber));
+   iterator = (TBMPrivateIterator *) palloc(sizeof(TBMPrivateIterator));
    iterator->tbm = tbm;
 
    /*
@@ -906,11 +895,16 @@ tbm_prepare_shared_iterate(TIDBitmap *tbm)
 /*
  * tbm_extract_page_tuple - extract the tuple offsets from a page
  *
- * The extracted offsets are stored into TBMIterateResult.
+ * Returns the number of offsets it filled in if <= max_offsets. Otherwise,
+ * fills in as many offsets as fit and returns the total number of offsets in
+ * the page.
  */
-static inline int
-tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
+int
+tbm_extract_page_tuple(TBMIterateResult *iteritem,
+                      OffsetNumber *offsets,
+                      uint32 max_offsets)
 {
+   PagetableEntry *page = iteritem->internal_page;
    int         wordnum;
    int         ntuples = 0;
 
@@ -925,7 +919,11 @@ tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
            while (w != 0)
            {
                if (w & 1)
-                   output->offsets[ntuples++] = (OffsetNumber) off;
+               {
+                   if (ntuples < max_offsets)
+                       offsets[ntuples] = (OffsetNumber) off;
+                   ntuples++;
+               }
                off++;
                w >>= 1;
            }
@@ -1012,9 +1010,9 @@ tbm_private_iterate(TBMPrivateIterator *iterator)
        {
            /* Return a lossy page indicator from the chunk */
            output->blockno = chunk_blockno;
-           output->ntuples = -1;
            output->lossy = true;
            output->recheck = true;
+           output->internal_page = NULL;
            iterator->schunkbit++;
            return output;
        }
@@ -1023,7 +1021,6 @@ tbm_private_iterate(TBMPrivateIterator *iterator)
    if (iterator->spageptr < tbm->npages)
    {
        PagetableEntry *page;
-       int         ntuples;
 
        /* In TBM_ONE_PAGE state, we don't allocate an spages[] array */
        if (tbm->status == TBM_ONE_PAGE)
@@ -1031,10 +1028,8 @@ tbm_private_iterate(TBMPrivateIterator *iterator)
        else
            page = tbm->spages[iterator->spageptr];
 
-       /* scan bitmap to extract individual offset numbers */
-       ntuples = tbm_extract_page_tuple(page, output);
+       output->internal_page = page;
        output->blockno = page->blockno;
-       output->ntuples = ntuples;
        output->lossy = false;
        output->recheck = page->recheck;
        iterator->spageptr++;
@@ -1107,9 +1102,9 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
        {
            /* Return a lossy page indicator from the chunk */
            output->blockno = chunk_blockno;
-           output->ntuples = -1;
            output->lossy = true;
            output->recheck = true;
+           output->internal_page = NULL;
            istate->schunkbit++;
 
            LWLockRelease(&istate->lock);
@@ -1120,12 +1115,9 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
    if (istate->spageptr < istate->npages)
    {
        PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
-       int         ntuples;
 
-       /* scan bitmap to extract individual offset numbers */
-       ntuples = tbm_extract_page_tuple(page, output);
+       output->internal_page = page;
        output->blockno = page->blockno;
-       output->ntuples = ntuples;
        output->lossy = false;
        output->recheck = page->recheck;
        istate->spageptr++;
@@ -1473,8 +1465,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
     * Create the TBMSharedIterator struct, with enough trailing space to
     * serve the needs of the TBMIterateResult sub-struct.
     */
-   iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
-                                            MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+   iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
 
    istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
 
index dcd1ae3fc349c67782c44c3bbe85dc37ac78f31c..50478db9820618db54b2c5bad5f15c888bbd7369 100644 (file)
@@ -354,6 +354,8 @@ typedef struct GinScanEntryData
    TIDBitmap  *matchBitmap;
    TBMPrivateIterator *matchIterator;
    TBMIterateResult *matchResult;
+   OffsetNumber matchOffsets[TBM_MAX_TUPLES_PER_PAGE];
+   int         matchNtuples;
 
    /* used for Posting list and one page in Posting tree */
    ItemPointerData *list;
index 8cd93d90a868ecb5e2717b7f28ba9d8b8828b06f..e185635c10be446a3e7511aa09e81e6a99c93b98 100644 (file)
 #ifndef TIDBITMAP_H
 #define TIDBITMAP_H
 
+#include "access/htup_details.h"
 #include "storage/itemptr.h"
 #include "utils/dsa.h"
 
+/*
+ * The maximum number of tuples per page is not large (typically 256 with
+ * 8K pages, or 1024 with 32K pages).  So there's not much point in making
+ * the per-page bitmaps variable size.  We just legislate that the size
+ * is this:
+ */
+#define TBM_MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
 
 /*
  * Actual bitmap representation is private to tidbitmap.c.  Callers can
@@ -53,12 +61,22 @@ typedef struct TBMIterator
 /* Result structure for tbm_iterate */
 typedef struct TBMIterateResult
 {
-   BlockNumber blockno;        /* page number containing tuples */
-   int         ntuples;        /* -1 when lossy */
+   BlockNumber blockno;        /* block number containing tuples */
+
    bool        lossy;
-   bool        recheck;        /* should the tuples be rechecked? */
-   /* Note: recheck is always true if lossy */
-   OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
+
+   /*
+    * Whether or not the tuples should be rechecked. This is always true if
+    * the page is lossy but may also be true if the query requires recheck.
+    */
+   bool        recheck;
+
+   /*
+    * Pointer to the page containing the bitmap for this block. It is a void *
+    * to avoid exposing the details of the tidbitmap PagetableEntry to API
+    * users.
+    */
+   void       *internal_page;
 } TBMIterateResult;
 
 /* function prototypes in nodes/tidbitmap.c */
@@ -75,6 +93,10 @@ extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno);
 extern void tbm_union(TIDBitmap *a, const TIDBitmap *b);
 extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 
+extern int tbm_extract_page_tuple(TBMIterateResult *iteritem,
+                                  OffsetNumber *offsets,
+                                  uint32 max_offsets);
+
 extern bool tbm_is_empty(const TIDBitmap *tbm);
 
 extern TBMPrivateIterator *tbm_begin_private_iterate(TIDBitmap *tbm);