BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr, IOContext io_context);
+static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks);
+static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
IOObject io_object, IOContext io_context);
return buffer;
}
+ /*
+ * Signal that we are going to immediately wait. If we're immediately
+ * waiting, there is no benefit in actually executing the IO
+ * asynchronously; it would just add dispatch overhead.
+ */
+ flags = READ_BUFFERS_SYNCHRONOUSLY;
if (mode == RBM_ZERO_ON_ERROR)
- flags = READ_BUFFERS_ZERO_ON_ERROR;
- else
- flags = 0;
+ flags |= READ_BUFFERS_ZERO_ON_ERROR;
operation.smgr = smgr;
operation.rel = rel;
operation.persistence = persistence;
{
int actual_nblocks = *nblocks;
int maxcombine = 0;
+ bool did_start_io;
Assert(*nblocks == 1 || allow_forwarding);
Assert(*nblocks > 0);
if (i == 0)
{
*nblocks = 1;
+
+#ifdef USE_ASSERT_CHECKING
+
+ /*
+ * Initialize enough of ReadBuffersOperation to make
+ * CheckReadBuffersOperation() work. Outside of assertions
+ * that's not necessary when no IO is issued.
+ */
+ operation->buffers = buffers;
+ operation->blocknum = blockNum;
+ operation->nblocks = 1;
+ operation->nblocks_done = 1;
+ CheckReadBuffersOperation(operation, true);
+#endif
return false;
}
operation->blocknum = blockNum;
operation->flags = flags;
operation->nblocks = actual_nblocks;
+ operation->nblocks_done = 0;
+ pgaio_wref_clear(&operation->io_wref);
- if (flags & READ_BUFFERS_ISSUE_ADVICE)
+ /*
+ * When using AIO, start the IO in the background. If not, issue prefetch
+ * requests if desired by the caller.
+ *
+ * The reason we have a dedicated path for IOMETHOD_SYNC here is to
+ * de-risk the introduction of AIO somewhat. It's a large architectural
+ * change, with lots of chances for unanticipated performance effects.
+ *
+ * Use of IOMETHOD_SYNC already leads to not actually performing IO
+ * asynchronously, but without the check here we'd execute IO earlier than
+ * we used to. Eventually this IOMETHOD_SYNC-specific path should go away.
+ */
+ if (io_method != IOMETHOD_SYNC)
{
/*
- * In theory we should only do this if PinBufferForBlock() had to
- * allocate new buffers above. That way, if two calls to
- * StartReadBuffers() were made for the same blocks before
- * WaitReadBuffers(), only the first would issue the advice. That'd be
- * a better simulation of true asynchronous I/O, which would only
- * start the I/O once, but isn't done here for simplicity.
+ * Try to start IO asynchronously. It's possible that no IO needs to
+ * be started, if another backend already performed the IO.
+ *
+ * Note that if an IO is started, it might not cover the entire
+ * requested range, e.g. because an intermediary block has been read
+ * in by another backend. In that case any "trailing" buffers we
+ * already pinned above will be "forwarded" by read_stream.c to the
+ * next call to StartReadBuffers().
+ *
+ * This is signalled to the caller by decrementing *nblocks *and*
+ * reducing operation->nblocks. The latter is done here, but not in
+ * WaitReadBuffers() below, as there we can't "shorten" the overall read
+ * size anymore; we need to retry until the read is done in its entirety
+ * or has failed.
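+ *
+ * For example, if eight blocks were requested but the fourth buffer has
+ * concurrently been read in by another backend, the IO started here
+ * covers only the first three blocks; *nblocks and operation->nblocks
+ * drop to 3 and the remaining pinned buffers are forwarded.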
*/
- smgrprefetch(operation->smgr,
- operation->forknum,
- blockNum,
- actual_nblocks);
+ did_start_io = AsyncReadBuffers(operation, nblocks);
+
+ operation->nblocks = *nblocks;
}
+ else
+ {
+ operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
- /* Indicate that WaitReadBuffers() should be called. */
- return true;
+ if (flags & READ_BUFFERS_ISSUE_ADVICE)
+ {
+ /*
+ * In theory we should only do this if PinBufferForBlock() had to
+ * allocate new buffers above. That way, if two calls to
+ * StartReadBuffers() were made for the same blocks before
+ * WaitReadBuffers(), only the first would issue the advice.
+ * That'd be a better simulation of true asynchronous I/O, which
+ * would only start the I/O once, but isn't done here for
+ * simplicity.
+ */
+ smgrprefetch(operation->smgr,
+ operation->forknum,
+ blockNum,
+ actual_nblocks);
+ }
+
+ /*
+ * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
+ * will initiate the necessary IO.
+ */
+ did_start_io = true;
+ }
+
+ CheckReadBuffersOperation(operation, !did_start_io);
+
+ return did_start_io;
}
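+
+/*
+ * A caller-side sketch of the StartReadBuffer()/WaitReadBuffers() contract,
+ * mirroring ReadBuffer_common above (rel and blockNum are assumed to be in
+ * scope):
+ *
+ *     ReadBuffersOperation op;
+ *     Buffer      buffer;
+ *
+ *     op.smgr = RelationGetSmgr(rel);
+ *     op.rel = rel;
+ *     op.persistence = rel->rd_rel->relpersistence;
+ *     op.forknum = MAIN_FORKNUM;
+ *     op.strategy = NULL;
+ *     if (StartReadBuffer(&op, &buffer, blockNum, READ_BUFFERS_SYNCHRONOUSLY))
+ *         WaitReadBuffers(&op);
+ *
+ * Afterwards the buffer is pinned and valid; if StartReadBuffer() returned
+ * false, no IO was needed and WaitReadBuffers() should not be called.
+ */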
/*
return result;
}
+/*
+ * Perform sanity checks on the ReadBuffersOperation.
+ */
+static void
+CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
+{
+#ifdef USE_ASSERT_CHECKING
+ Assert(operation->nblocks_done <= operation->nblocks);
+ Assert(!is_complete || operation->nblocks == operation->nblocks_done);
+
+ for (int i = 0; i < operation->nblocks; i++)
+ {
+ Buffer buffer = operation->buffers[i];
+ BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
+ GetLocalBufferDescriptor(-buffer - 1) :
+ GetBufferDescriptor(buffer - 1);
+
+ Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
+ Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID);
+
+ if (i < operation->nblocks_done)
+ Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
+ }
+#endif
+}
+
+/* helper for ReadBuffersCanStartIO(), to avoid repetition */
static inline bool
-WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
+ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
{
if (BufferIsLocal(buffer))
return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
}
+/*
+ * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
+ */
+static inline bool
+ReadBuffersCanStartIO(Buffer buffer, bool nowait)
+{
+ /*
+ * If this backend currently has staged IO, we need to submit the pending
+ * IO before waiting for the right to issue IO, to avoid the potential for
+ * deadlocks (and, more commonly, unnecessary delays for other backends).
+ */
+ if (!nowait && pgaio_have_staged())
+ {
+ if (ReadBuffersCanStartIOOnce(buffer, true))
+ return true;
+
+ /*
+ * Unfortunately StartBufferIO() returning false doesn't allow us to
+ * distinguish between the buffer already being valid and IO already
+ * being in progress. Since IO already being in progress is quite
+ * rare, this approach seems fine.
+ */
+ pgaio_submit_staged();
+ }
+
+ return ReadBuffersCanStartIOOnce(buffer, nowait);
+}
+
+/*
+ * Helper for WaitReadBuffers() that processes the results of a readv
+ * operation, raising an error if necessary.
+ */
+static void
+ProcessReadBuffersResult(ReadBuffersOperation *operation)
+{
+ PgAioReturn *aio_ret = &operation->io_return;
+ PgAioResultStatus rs = aio_ret->result.status;
+ int newly_read_blocks = 0;
+
+ Assert(pgaio_wref_valid(&operation->io_wref));
+ Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
+
+ /*
+ * SMGR reports the number of blocks successfully read as the result of
+ * the IO operation. Thus we can simply add that to ->nblocks_done.
+ */
+
+ if (likely(rs != PGAIO_RS_ERROR))
+ newly_read_blocks = aio_ret->result.result;
+
+ if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
+ pgaio_result_report(aio_ret->result, &aio_ret->target_data,
+ rs == PGAIO_RS_ERROR ? ERROR : WARNING);
+ else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
+ {
+ /*
+ * We'll retry, so we just emit a debug message to the server log (or
+ * not even that in prod scenarios).
+ */
+ pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
+ elog(DEBUG3, "partial read, will retry");
+ }
+
+ Assert(newly_read_blocks > 0);
+ Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
+
+ operation->nblocks_done += newly_read_blocks;
+
+ Assert(operation->nblocks_done <= operation->nblocks);
+}
+
void
WaitReadBuffers(ReadBuffersOperation *operation)
{
- Buffer *buffers;
- int nblocks;
- BlockNumber blocknum;
- ForkNumber forknum;
+ PgAioReturn *aio_ret = &operation->io_return;
IOContext io_context;
IOObject io_object;
- char persistence;
-
- /* Find the range of the physical read we need to perform. */
- nblocks = operation->nblocks;
- buffers = &operation->buffers[0];
- blocknum = operation->blocknum;
- forknum = operation->forknum;
- persistence = operation->persistence;
- Assert(nblocks > 0);
- Assert(nblocks <= MAX_IO_COMBINE_LIMIT);
-
- if (persistence == RELPERSISTENCE_TEMP)
+ if (operation->persistence == RELPERSISTENCE_TEMP)
{
io_context = IOCONTEXT_NORMAL;
io_object = IOOBJECT_TEMP_RELATION;
io_object = IOOBJECT_RELATION;
}
- for (int i = 0; i < nblocks; ++i)
+ /*
+ * If we get here without an IO operation having been issued, the
+ * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
+ * caller should not have called WaitReadBuffers().
+ *
+ * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers() - as
+ * we used to before the introduction of AIO. This is done as part of the
+ * retry logic below; no extra code is required.
+ *
+ * This path is expected to eventually go away.
+ */
+ if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
+ elog(ERROR, "waiting for read operation that didn't read");
+
+ /*
+ * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
+ * done. We may need multiple retries, not just because we could get
+ * multiple partial reads, but also because some of the remaining
+ * to-be-read buffers may have been read in by other backends, limiting
+ * the IO size.
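+ *
+ * For instance, a 16-block read may complete only 10 blocks on the first
+ * attempt (a partial read); on retry the 11th buffer may turn out to have
+ * been read in by another backend, and a final IO then covers the
+ * remaining five blocks.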
+ */
+ while (true)
{
- int io_buffers_len;
- Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
- void *io_pages[MAX_IO_COMBINE_LIMIT];
- instr_time io_start;
- BlockNumber io_first_block;
+ int ignored_nblocks_progress;
+
+ CheckReadBuffersOperation(operation, false);
/*
- * Skip this block if someone else has already completed it. If an
- * I/O is already in progress in another backend, this will wait for
- * the outcome: either done, or something went wrong and we will
- * retry.
+ * If there is an IO associated with the operation, we may need to
+ * wait for it.
*/
- if (!WaitReadBuffersCanStartIO(buffers[i], false))
+ if (pgaio_wref_valid(&operation->io_wref))
{
/*
- * Report and track this as a 'hit' for this backend, even though
- * it must have started out as a miss in PinBufferForBlock(). The
- * other backend will track this as a 'read'.
+ * Track the time spent waiting for the IO to complete. Tracking
+ * a wait even when we don't actually need to wait
+ *
+ * a) is not cheap, due to the timestamping overhead
+ *
+ * b) reports some time as waiting even though we never waited
+ *
+ * so we first check whether we already know the IO is complete.
*/
- TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
- operation->smgr->smgr_rlocator.locator.spcOid,
- operation->smgr->smgr_rlocator.locator.dbOid,
- operation->smgr->smgr_rlocator.locator.relNumber,
- operation->smgr->smgr_rlocator.backend,
- true);
-
- if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
+ !pgaio_wref_check_done(&operation->io_wref))
+ {
+ instr_time io_start = pgstat_prepare_io_time(track_io_timing);
+
+ pgaio_wref_wait(&operation->io_wref);
+
+ /*
+ * The IO operation itself was already counted earlier, in
+ * AsyncReadBuffers(), this just accounts for the wait time.
+ */
+ pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
+ io_start, 0, 0);
+ }
else
- pgBufferUsage.shared_blks_hit += 1;
+ {
+ Assert(pgaio_wref_check_done(&operation->io_wref));
+ }
- if (operation->rel)
- pgstat_count_buffer_hit(operation->rel);
+ /*
+ * We now are sure the IO completed. Check the results. This
+ * includes reporting on errors if there were any.
+ */
+ ProcessReadBuffersResult(operation);
+ }
- pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
+ /*
+ * Most of the time, the one IO we already started will read in
+ * everything. But we need to deal with partial reads and buffers not
+ * needing IO anymore.
+ */
+ if (operation->nblocks_done == operation->nblocks)
+ break;
- if (VacuumCostActive)
- VacuumCostBalance += VacuumCostPageHit;
+ CHECK_FOR_INTERRUPTS();
- continue;
- }
+ /*
+ * This may only complete the IO partially, either because some
+ * buffers were already valid, or because of a partial read.
+ *
+ * NB: In contrast to after the AsyncReadBuffers() call in
+ * StartReadBuffers(), we do *not* reduce
+ * ReadBuffersOperation->nblocks here; callers expect the full
+ * operation to be completed at this point (as more operations may
+ * have been queued).
+ */
+ AsyncReadBuffers(operation, &ignored_nblocks_progress);
+ }
+
+ CheckReadBuffersOperation(operation, true);
+
+ /* NB: READ_DONE tracepoint was already executed in completion callback */
+}
+
+/*
+ * Initiate IO for the ReadBuffersOperation
+ *
+ * This function starts at most a single IO at a time. The IO may cover fewer
+ * than the remaining to-be-read blocks if one of the buffers has concurrently
+ * been read in by another backend. If the first to-be-read buffer is already valid,
+ * no IO will be issued.
+ *
+ * To support retries after partial reads, the first operation->nblocks_done
+ * buffers are skipped.
+ *
+ * On return *nblocks_progress is updated to reflect the number of buffers
+ * affected by the call. If the first buffer is valid, *nblocks_progress is
+ * set to 1 and operation->nblocks_done is incremented.
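+ *
+ * For example, with nblocks = 8 and nblocks_done = 3, a call typically starts
+ * one readv covering the five remaining blocks and sets *nblocks_progress to
+ * 5; if the buffer at index 5 had concurrently been read in by another
+ * backend, the IO would cover only two blocks and *nblocks_progress would be 2.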
+ *
+ * Returns true if IO was initiated, false if no IO was necessary.
+ */
+static bool
+AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
+{
+ Buffer *buffers = &operation->buffers[0];
+ int flags = operation->flags;
+ BlockNumber blocknum = operation->blocknum;
+ ForkNumber forknum = operation->forknum;
+ char persistence = operation->persistence;
+ int16 nblocks_done = operation->nblocks_done;
+ Buffer *io_buffers = &operation->buffers[nblocks_done];
+ int io_buffers_len = 0;
+ PgAioHandle *ioh;
+ uint32 ioh_flags = 0;
+ void *io_pages[MAX_IO_COMBINE_LIMIT];
+ IOContext io_context;
+ IOObject io_object;
+ bool did_start_io;
+
+ /*
+ * When this IO is executed synchronously, either because the caller will
+ * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
+ * the AIO subsystem needs to know.
+ */
+ if (flags & READ_BUFFERS_SYNCHRONOUSLY)
+ ioh_flags |= PGAIO_HF_SYNCHRONOUS;
+
+ if (persistence == RELPERSISTENCE_TEMP)
+ {
+ io_context = IOCONTEXT_NORMAL;
+ io_object = IOOBJECT_TEMP_RELATION;
+ ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
+ }
+ else
+ {
+ io_context = IOContextForStrategy(operation->strategy);
+ io_object = IOOBJECT_RELATION;
+ }
+
+ /*
+ * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
+ * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
+ * set globally, but on a per-session basis. The completion callback,
+ * which may be run in other processes, e.g. in IO workers, may have a
+ * different value of the zero_damaged_pages GUC.
+ *
+ * XXX: We probably should eventually use a different flag for
+ * zero_damaged_pages, so we can report different log levels / error codes
+ * for zero_damaged_pages and ZERO_ON_ERROR.
+ */
+ if (zero_damaged_pages)
+ flags |= READ_BUFFERS_ZERO_ON_ERROR;
+
+ /*
+ * For the same reason as with zero_damaged_pages we need to use this
+ * backend's ignore_checksum_failure value.
+ */
+ if (ignore_checksum_failure)
+ flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
+
+ /*
+ * To be allowed to report stats in the local completion callback we need
+ * to prepare to report stats now. This ensures we can safely report the
+ * checksum failure even in a critical section.
+ */
+ pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
+
+ /*
+ * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
+ * might block, which we don't want after setting IO_IN_PROGRESS.
+ *
+ * If we need to wait for IO before we can get a handle, submit
+ * already-staged IO first, so that other backends don't need to wait.
+ * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
+ * wait for already submitted IO, which doesn't require additional locks,
+ * but it could still cause undesirable waits.
+ *
+ * A secondary benefit is that this would allow us to measure the time in
+ * pgaio_io_acquire() without causing undue timer overhead in the common,
+ * non-blocking, case. However, currently the pgstats infrastructure
+ * doesn't really allow that, as it a) asserts that an operation can't
+ * have time without operations and b) doesn't have an API to report
+ * "accumulated" time.
+ */
+ ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
+ if (unlikely(!ioh))
+ {
+ pgaio_submit_staged();
+
+ ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
+ }
+
+ /*
+ * Check if we can start IO on the first to-be-read buffer.
+ *
+ * If an I/O is already in progress in another backend, we want to wait
+ * for the outcome: either done, or something went wrong and we will
+ * retry.
+ */
+ if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
+ {
+ /*
+ * Someone else has already completed this block; we're done.
+ *
+ * When IO is necessary, ->nblocks_done is updated in
+ * ProcessReadBuffersResult(), but that is not called if no IO is
+ * necessary. Thus update here.
+ */
+ operation->nblocks_done += 1;
+ *nblocks_progress = 1;
+
+ pgaio_io_release(ioh);
+ pgaio_wref_clear(&operation->io_wref);
+ did_start_io = false;
+
+ /*
+ * Report and track this as a 'hit' for this backend, even though it
+ * must have started out as a miss in PinBufferForBlock(). The other
+ * backend will track this as a 'read'.
+ */
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + nblocks_done,
+ operation->smgr->smgr_rlocator.locator.spcOid,
+ operation->smgr->smgr_rlocator.locator.dbOid,
+ operation->smgr->smgr_rlocator.locator.relNumber,
+ operation->smgr->smgr_rlocator.backend,
+ true);
+
+ if (persistence == RELPERSISTENCE_TEMP)
+ pgBufferUsage.local_blks_hit += 1;
+ else
+ pgBufferUsage.shared_blks_hit += 1;
+
+ if (operation->rel)
+ pgstat_count_buffer_hit(operation->rel);
+
+ pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
+
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+ }
+ else
+ {
+ instr_time io_start;
/* We found a buffer that we need to read in. */
- io_buffers[0] = buffers[i];
- io_pages[0] = BufferGetBlock(buffers[i]);
- io_first_block = blocknum + i;
+ Assert(io_buffers[0] == buffers[nblocks_done]);
+ io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
io_buffers_len = 1;
/*
* How many neighboring-on-disk blocks can we scatter-read into other
* buffers at the same time? In this case we don't wait if we see an
- * I/O already in progress. We already hold BM_IO_IN_PROGRESS for the
+ * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
* head block, so we should get on with that I/O as soon as possible.
- * We'll come back to this block again, above.
*/
- while ((i + 1) < nblocks &&
- WaitReadBuffersCanStartIO(buffers[i + 1], true))
+ for (int i = nblocks_done + 1; i < operation->nblocks; i++)
{
+ if (!ReadBuffersCanStartIO(buffers[i], true))
+ break;
/* Must be consecutive block numbers. */
- Assert(BufferGetBlockNumber(buffers[i + 1]) ==
- BufferGetBlockNumber(buffers[i]) + 1);
+ Assert(BufferGetBlockNumber(buffers[i - 1]) ==
+ BufferGetBlockNumber(buffers[i]) - 1);
+ Assert(io_buffers[io_buffers_len] == buffers[i]);
- io_buffers[io_buffers_len] = buffers[++i];
io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
}
- io_start = pgstat_prepare_io_time(track_io_timing);
- smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
- pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
- 1, io_buffers_len * BLCKSZ);
+ /* get a reference to wait for in WaitReadBuffers() */
+ pgaio_io_get_wref(ioh, &operation->io_wref);
- /* Verify each block we read, and terminate the I/O. */
- for (int j = 0; j < io_buffers_len; ++j)
- {
- BufferDesc *bufHdr;
- Block bufBlock;
- int piv_flags;
- bool verified;
- bool checksum_failure;
-
- if (persistence == RELPERSISTENCE_TEMP)
- {
- bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
- bufBlock = LocalBufHdrGetBlock(bufHdr);
- }
- else
- {
- bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
- bufBlock = BufHdrGetBlock(bufHdr);
- }
-
- /* check for garbage data */
- piv_flags = PIV_LOG_WARNING;
- if (ignore_checksum_failure)
- piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
- verified = PageIsVerified((Page) bufBlock, io_first_block + j,
- piv_flags, &checksum_failure);
- if (checksum_failure)
- {
- RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator;
+ /* provide the list of buffers to the completion callbacks */
+ pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
- pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
- pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
- }
+ pgaio_io_register_callbacks(ioh,
+ persistence == RELPERSISTENCE_TEMP ?
+ PGAIO_HCB_LOCAL_BUFFER_READV :
+ PGAIO_HCB_SHARED_BUFFER_READV,
+ flags);
- if (!verified)
- {
- if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
- {
- ereport(WARNING,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("invalid page in block %u of relation %s; zeroing out page",
- io_first_block + j,
- relpath(operation->smgr->smgr_rlocator, forknum).str)));
- memset(bufBlock, 0, BLCKSZ);
- }
- else
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("invalid page in block %u of relation %s",
- io_first_block + j,
- relpath(operation->smgr->smgr_rlocator, forknum).str)));
- }
+ pgaio_io_set_flag(ioh, ioh_flags);
- /* Set BM_VALID, terminate IO, and wake up any waiters */
- if (persistence == RELPERSISTENCE_TEMP)
- TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
- else
- TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
-
- /* Report I/Os as completing individually. */
- TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
- operation->smgr->smgr_rlocator.locator.spcOid,
- operation->smgr->smgr_rlocator.locator.dbOid,
- operation->smgr->smgr_rlocator.locator.relNumber,
- operation->smgr->smgr_rlocator.backend,
- false);
- }
+ /* ---
+ * Even though we're trying to issue IO asynchronously, track the time
+ * in smgrstartreadv():
+ * - if io_method == IOMETHOD_SYNC, we will always perform the IO
+ * immediately
+ * - the io method might not support the IO (e.g. worker IO for a temp
+ * table)
+ * ---
+ */
+ io_start = pgstat_prepare_io_time(track_io_timing);
+ smgrstartreadv(ioh, operation->smgr, forknum,
+ blocknum + nblocks_done,
+ io_pages, io_buffers_len);
+ pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
+ io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
pgBufferUsage.local_blks_read += io_buffers_len;
else
pgBufferUsage.shared_blks_read += io_buffers_len;
+ /*
+ * Track vacuum cost when issuing IO, not after waiting for it.
+ * Otherwise we could end up issuing a lot of IO in a short timespan,
+ * despite a low cost limit.
+ */
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
+
+ *nblocks_progress = io_buffers_len;
+ did_start_io = true;
}
+
+ return did_start_io;
}
/*