aio: --- BASE PATCH -- (to-be-split).
author Andres Freund <[email protected]>
Thu, 29 Oct 2020 19:41:12 +0000 (12:41 -0700)
committer Andres Freund <[email protected]>
Mon, 11 Jan 2021 23:09:15 +0000 (15:09 -0800)
40 files changed:
contrib/pg_prewarm/pg_prewarm.c
src/backend/access/heap/hio.c
src/backend/access/transam/xact.c
src/backend/access/transam/xlog.c
src/backend/bootstrap/bootstrap.c
src/backend/catalog/system_views.sql
src/backend/postmaster/checkpointer.c
src/backend/postmaster/pgstat.c
src/backend/postmaster/postmaster.c
src/backend/storage/buffer/buf_init.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/file/fd.c
src/backend/storage/ipc/Makefile
src/backend/storage/ipc/aio.c [new file with mode: 0644]
src/backend/storage/ipc/aio_util.c [new file with mode: 0644]
src/backend/storage/ipc/ipci.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/lwlocknames.txt
src/backend/storage/lmgr/proc.c
src/backend/storage/page/bufpage.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/backend/tcop/postgres.c
src/backend/utils/init/miscinit.c
src/backend/utils/misc/guc.c
src/backend/utils/resowner/resowner.c
src/include/access/xlog_internal.h
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/include/storage/aio.h [new file with mode: 0644]
src/include/storage/buf_internals.h
src/include/storage/bufmgr.h
src/include/storage/bufpage.h
src/include/storage/fd.h
src/include/storage/lwlock.h
src/include/storage/md.h
src/include/storage/smgr.h
src/include/utils/resowner_private.h
src/test/regress/expected/rules.out

diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
index a8554529361e64d108ed37c0f61c25c542e7560e..93af05cdf937840c2429f8be30b77a4da03b84c7 100644
@@ -18,7 +18,9 @@
 #include "access/relation.h"
 #include "fmgr.h"
 #include "miscadmin.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
+#include "storage/bufpage.h"
 #include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
@@ -33,11 +35,116 @@ typedef enum
 {
    PREWARM_PREFETCH,
    PREWARM_READ,
-   PREWARM_BUFFER
+   PREWARM_READ_AIO,
+   PREWARM_BUFFER,
+   PREWARM_BUFFER_AIO
 } PrewarmType;
 
 static PGAlignedBlock blockbuffer;
 
+typedef struct prefetch
+{
+   Relation rel;
+   ForkNumber forkNumber;
+   int64 curblock;
+   int64 lastblock;
+   List *bbs;
+} prefetch;
+
+static PgStreamingReadNextStatus
+prewarm_buffer_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   Buffer buf;
+   bool already_valid;
+   BlockNumber blockno;
+
+   if (p->curblock <= p->lastblock)
+       blockno = p->curblock++;
+   else
+       return PGSR_NEXT_END;
+
+   buf = ReadBufferAsync(p->rel, p->forkNumber, blockno,
+                         RBM_NORMAL, NULL, &already_valid,
+                         &aio);
+
+   *read_private = (uintptr_t) buf;
+
+   if (already_valid)
+   {
+       ereport(DEBUG3,
+               errmsg("pgsr %s: found block %d already in buf %d",
+                      NameStr(p->rel->rd_rel->relname),
+                      blockno, buf),
+               errhidestmt(true),
+               errhidecontext(true));
+       return PGSR_NEXT_NO_IO;
+   }
+   else
+   {
+       ereport(DEBUG3,
+               errmsg("pgsr %s: fetching block %d into buf %d",
+                      NameStr(p->rel->rd_rel->relname),
+                      blockno, buf),
+               errhidestmt(true),
+               errhidecontext(true));
+       return PGSR_NEXT_IO;
+   }
+}
+
+static void
+prewarm_buffer_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   Buffer buf = (Buffer) read_private;
+
+   ereport(DEBUG2,
+           errmsg("pgsr %s: releasing buf %d",
+                  NameStr(p->rel->rd_rel->relname),
+                  buf),
+           errhidestmt(true),
+           errhidecontext(true));
+
+   Assert(BufferIsValid(buf));
+   ReleaseBuffer(buf);
+}
+
+
+static PgStreamingReadNextStatus
+prewarm_smgr_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+   prefetch *p = (prefetch *) pgsr_private;
+   BlockNumber blockno;
+   PgAioBounceBuffer *bb;
+
+   if (p->curblock <= p->lastblock)
+       blockno = p->curblock++;
+   else
+       return PGSR_NEXT_END;
+
+   if (p->bbs != NIL)
+   {
+       bb = lfirst(list_tail(p->bbs));
+       p->bbs = list_delete_last(p->bbs);
+   }
+   else
+       bb = pgaio_bounce_buffer_get();
+
+   pgaio_assoc_bounce_buffer(aio, bb);
+
+   smgrstartread(aio, p->rel->rd_smgr, p->forkNumber, blockno,
+                 pgaio_bounce_buffer_buffer(bb), InvalidBuffer, 0);
+
+   *read_private = (uintptr_t) bb;
+
+   return PGSR_NEXT_IO;
+}
+
+static void
+prewarm_smgr_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+}
+
 /*
  * pg_prewarm(regclass, mode text, fork text,
  *           first_block int8, last_block int8)
@@ -86,6 +193,10 @@ pg_prewarm(PG_FUNCTION_ARGS)
        ptype = PREWARM_READ;
    else if (strcmp(ttype, "buffer") == 0)
        ptype = PREWARM_BUFFER;
+   else if (strcmp(ttype, "buffer_aio") == 0)
+       ptype = PREWARM_BUFFER_AIO;
+   else if (strcmp(ttype, "read_aio") == 0)
+       ptype = PREWARM_READ_AIO;
    else
    {
        ereport(ERROR,
@@ -197,6 +308,82 @@ pg_prewarm(PG_FUNCTION_ARGS)
            ++blocks_done;
        }
    }
+   else if (ptype == PREWARM_BUFFER_AIO)
+   {
+       PgStreamingRead *pgsr;
+       prefetch p;
+
+       p.rel = rel;
+       p.forkNumber = forkNumber;
+       p.curblock = 0;
+       p.lastblock = last_block;
+       p.bbs = NIL;
+
+       pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+                                      prewarm_buffer_next,
+                                      prewarm_buffer_release);
+
+       for (block = first_block; block <= last_block; ++block)
+       {
+           Buffer      buf;
+
+           CHECK_FOR_INTERRUPTS();
+
+           buf = (Buffer) pg_streaming_read_get_next(pgsr);
+           if (BufferIsValid(buf))
+               ReleaseBuffer(buf);
+           else
+               elog(ERROR, "prefetch ended early");
+
+           ++blocks_done;
+       }
+
+       if (BufferIsValid(pg_streaming_read_get_next(pgsr)))
+           elog(ERROR, "unexpected additional buffer");
+
+       pg_streaming_read_free(pgsr);
+   }
+   else if (ptype == PREWARM_READ_AIO)
+   {
+       PgStreamingRead *pgsr;
+       prefetch p;
+       ListCell *lc;
+
+       p.rel = rel;
+       p.forkNumber = forkNumber;
+       p.curblock = 0;
+       p.lastblock = last_block;
+       p.bbs = NIL;
+
+       pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+                                      prewarm_smgr_next,
+                                      prewarm_smgr_release);
+
+       for (block = first_block; block <= last_block; ++block)
+       {
+           PgAioBounceBuffer *bb;
+
+           CHECK_FOR_INTERRUPTS();
+
+           bb = (PgAioBounceBuffer *) pg_streaming_read_get_next(pgsr);
+           if (bb == NULL)
+               elog(ERROR, "prefetch ended early");
+
+           p.bbs = lappend(p.bbs, (void *) bb);
+
+           ++blocks_done;
+       }
+
+       if (pg_streaming_read_get_next(pgsr) != 0)
+           elog(ERROR, "unexpected additional buffer");
+
+       pg_streaming_read_free(pgsr);
+
+       foreach(lc, p.bbs)
+       {
+           pgaio_bounce_buffer_release(lfirst(lc));
+       }
+   }
 
    /* Close relation, release lock. */
    relation_close(rel, AccessShareLock);
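
For readers following the new prewarm modes, here is a condensed, self-contained sketch of the consumer pattern they rely on. It uses only the pg_streaming_read API as introduced by this patch (pg_streaming_read_alloc/get_next/free, the PGSR_NEXT_* statuses, ReadBufferAsync); the sketch_* names and the plain consume loop are illustrative and not part of the patch itself.

    /*
     * Sketch: driving a PgStreamingRead.  The "next" callback starts (or
     * skips) one read and hands back a per-read token; the caller then
     * consumes completed reads in submission order.
     */
    static PgStreamingReadNextStatus
    sketch_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
    {
        prefetch   *p = (prefetch *) pgsr_private;
        bool        already_valid;
        Buffer      buf;

        if (p->curblock > p->lastblock)
            return PGSR_NEXT_END;       /* nothing left to read */

        /* pins the buffer; associates the IO with it unless already cached */
        buf = ReadBufferAsync(p->rel, p->forkNumber, p->curblock++,
                              RBM_NORMAL, NULL, &already_valid, &aio);
        *read_private = (uintptr_t) buf;

        return already_valid ? PGSR_NEXT_NO_IO : PGSR_NEXT_IO;
    }

    /* release callback, for reads the caller never consumes */
    static void
    sketch_release(uintptr_t pgsr_private, uintptr_t read_private)
    {
        ReleaseBuffer((Buffer) read_private);
    }

    static void
    sketch_consume(prefetch *p)
    {
        /* allow up to 512 reads in flight at any time */
        PgStreamingRead *pgsr = pg_streaming_read_alloc(512, (uintptr_t) p,
                                                        sketch_next,
                                                        sketch_release);
        Buffer      buf;

        while ((buf = (Buffer) pg_streaming_read_get_next(pgsr)) != InvalidBuffer)
            ReleaseBuffer(buf);         /* block is now resident in shared buffers */

        pg_streaming_read_free(pgsr);
    }

The difference between the two new modes is only in what the callbacks do: "buffer_aio" reads into shared buffers via ReadBufferAsync as above, while "read_aio" reads into AIO bounce buffers via smgrstartread() and recycles the bounce buffers instead of releasing PostgreSQL buffers.
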
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index fac3b8e9ff28731e99f861f7048bc2718df96b26..69af7e8762876db7180278aeb972cfda47518539 100644
@@ -129,6 +129,75 @@ ReadBufferBI(Relation relation, BlockNumber targetBlock,
    return buffer;
 }
 
+static Buffer
+ExtendRelation(Relation relation, BulkInsertState bistate, bool use_fsm)
+{
+   Buffer buf;
+   Page        page;
+
+   /* FIXME: There should be a better approach to both of these exceptions */
+   if (RELATION_IS_LOCAL(relation) || !use_fsm)
+   {
+       LockRelationForExtension(relation, ExclusiveLock);
+       buf = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+       UnlockRelationForExtension(relation, ExclusiveLock);
+   }
+   else
+   {
+       int extendby;
+       int newblockno;
+
+       /*
+        * Determine number of pages to extend relation by.
+        */
+       {
+           BlockNumber start_nblocks;
+
+           RelationOpenSmgr(relation);
+           start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+           extendby = Max(Min(start_nblocks / 16 * BLCKSZ, (16 * 1024 * 1024)) / BLCKSZ, 1);
+           extendby = Max(Min(extendby, NBuffers / 128), 1);
+       }
+
+       /*
+        * FIXME: There should probably be some chunking, either from here, or
+        * inside BulkExtendBuffered.
+        */
+       buf = BulkExtendBuffered(relation, MAIN_FORKNUM, extendby,
+                                bistate ? bistate->strategy : NULL);
+
+       newblockno = BufferGetBlockNumber(buf);
+
+       for (int i = newblockno + 1; i < newblockno + extendby; i++)
+       {
+           RecordPageWithFreeSpace(relation, i, BLCKSZ - SizeOfPageHeaderData);
+       }
+
+       if (use_fsm && extendby > 1)
+       {
+           FreeSpaceMapVacuumRange(relation, newblockno, newblockno + extendby);
+       }
+   }
+
+   /*
+    * We need to initialize the empty new page.  Double-check that it really
+    * is empty (this should never happen, but if it does we don't want to
+    * risk wiping out valid data).
+    */
+   page = BufferGetPage(buf);
+
+   if (!PageIsNew(page))
+       elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
+            BufferGetBlockNumber(buf),
+            RelationGetRelationName(relation));
+
+   PageInit(page, BufferGetPageSize(buf), 0);
+   MarkBufferDirty(buf);
+
+   return buf;
+}
+
+
 /*
  * For each heap page which is all-visible, acquire a pin on the appropriate
  * visibility map page, if we haven't already got one.
@@ -187,90 +256,6 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
    }
 }
 
-/*
- * Extend a relation by multiple blocks to avoid future contention on the
- * relation extension lock.  Our goal is to pre-extend the relation by an
- * amount which ramps up as the degree of contention ramps up, but limiting
- * the result to some sane overall value.
- */
-static void
-RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
-{
-   BlockNumber blockNum,
-               firstBlock = InvalidBlockNumber;
-   int         extraBlocks;
-   int         lockWaiters;
-
-   /* Use the length of the lock wait queue to judge how much to extend. */
-   lockWaiters = RelationExtensionLockWaiterCount(relation);
-   if (lockWaiters <= 0)
-       return;
-
-   /*
-    * It might seem like multiplying the number of lock waiters by as much as
-    * 20 is too aggressive, but benchmarking revealed that smaller numbers
-    * were insufficient.  512 is just an arbitrary cap to prevent
-    * pathological results.
-    */
-   extraBlocks = Min(512, lockWaiters * 20);
-
-   do
-   {
-       Buffer      buffer;
-       Page        page;
-       Size        freespace;
-
-       /*
-        * Extend by one page.  This should generally match the main-line
-        * extension code in RelationGetBufferForTuple, except that we hold
-        * the relation extension lock throughout, and we don't immediately
-        * initialize the page (see below).
-        */
-       buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
-       page = BufferGetPage(buffer);
-
-       if (!PageIsNew(page))
-           elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-                BufferGetBlockNumber(buffer),
-                RelationGetRelationName(relation));
-
-       /*
-        * Add the page to the FSM without initializing. If we were to
-        * initialize here, the page would potentially get flushed out to disk
-        * before we add any useful content. There's no guarantee that that'd
-        * happen before a potential crash, so we need to deal with
-        * uninitialized pages anyway, thus avoid the potential for
-        * unnecessary writes.
-        */
-
-       /* we'll need this info below */
-       blockNum = BufferGetBlockNumber(buffer);
-       freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;
-
-       UnlockReleaseBuffer(buffer);
-
-       /* Remember first block number thus added. */
-       if (firstBlock == InvalidBlockNumber)
-           firstBlock = blockNum;
-
-       /*
-        * Immediately update the bottom level of the FSM.  This has a good
-        * chance of making this page visible to other concurrently inserting
-        * backends, and we want that to happen without delay.
-        */
-       RecordPageWithFreeSpace(relation, blockNum, freespace);
-   }
-   while (--extraBlocks > 0);
-
-   /*
-    * Updating the upper levels of the free space map is too expensive to do
-    * for every block, but it's worth doing once at the end to make sure that
-    * subsequent insertion activity sees all of those nifty free pages we
-    * just inserted.
-    */
-   FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1);
-}
-
 /*
  * RelationGetBufferForTuple
  *
@@ -340,7 +325,6 @@ RelationGetBufferForTuple(Relation relation, Size len,
                saveFreeSpace = 0;
    BlockNumber targetBlock,
                otherBlock;
-   bool        needLock;
 
    len = MAXALIGN(len);        /* be conservative */
 
@@ -533,6 +517,23 @@ loop:
            ReleaseBuffer(buffer);
        }
 
+       /*
+        * FIXME: definitely needs a better solution.
+        */
+       if (!use_fsm && bistate && bistate->current_buf != InvalidBuffer)
+       {
+           BlockNumber blocknum = BufferGetBlockNumber(bistate->current_buf) + 1;
+
+           RelationOpenSmgr(relation);
+
+           if (blocknum < smgrnblocks(relation->rd_smgr, MAIN_FORKNUM))
+           {
+               targetBlock = blocknum;
+
+               goto loop;
+           }
+       }
+
        /* Without FSM, always fall out of the loop and extend */
        if (!use_fsm)
            break;
@@ -547,85 +548,10 @@ loop:
                                                    len + saveFreeSpace);
    }
 
-   /*
-    * Have to extend the relation.
-    *
-    * We have to use a lock to ensure no one else is extending the rel at the
-    * same time, else we will both try to initialize the same new page.  We
-    * can skip locking for new or temp relations, however, since no one else
-    * could be accessing them.
-    */
-   needLock = !RELATION_IS_LOCAL(relation);
-
-   /*
-    * If we need the lock but are not able to acquire it immediately, we'll
-    * consider extending the relation by multiple blocks at a time to manage
-    * contention on the relation extension lock.  However, this only makes
-    * sense if we're using the FSM; otherwise, there's no point.
-    */
-   if (needLock)
-   {
-       if (!use_fsm)
-           LockRelationForExtension(relation, ExclusiveLock);
-       else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
-       {
-           /* Couldn't get the lock immediately; wait for it. */
-           LockRelationForExtension(relation, ExclusiveLock);
-
-           /*
-            * Check if some other backend has extended a block for us while
-            * we were waiting on the lock.
-            */
-           targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
-           /*
-            * If some other waiter has already extended the relation, we
-            * don't need to do so; just use the existing freespace.
-            */
-           if (targetBlock != InvalidBlockNumber)
-           {
-               UnlockRelationForExtension(relation, ExclusiveLock);
-               goto loop;
-           }
-
-           /* Time to bulk-extend. */
-           RelationAddExtraBlocks(relation, bistate);
-       }
-   }
-
-   /*
-    * In addition to whatever extension we performed above, we always add at
-    * least one block to satisfy our own request.
-    *
-    * XXX This does an lseek - rather expensive - but at the moment it is the
-    * only way to accurately determine how many blocks are in a relation.  Is
-    * it worth keeping an accurate file length in shared memory someplace,
-    * rather than relying on the kernel to do it for us?
-    */
-   buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+   buffer = ExtendRelation(relation, bistate, use_fsm);
 
-   /*
-    * We need to initialize the empty new page.  Double-check that it really
-    * is empty (this should never happen, but if it does we don't want to
-    * risk wiping out valid data).
-    */
    page = BufferGetPage(buffer);
 
-   if (!PageIsNew(page))
-       elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
-            BufferGetBlockNumber(buffer),
-            RelationGetRelationName(relation));
-
-   PageInit(page, BufferGetPageSize(buffer), 0);
-   MarkBufferDirty(buffer);
-
-   /*
-    * Release the file-extension lock; it's now OK for someone else to extend
-    * the relation some more.
-    */
-   if (needLock)
-       UnlockRelationForExtension(relation, ExclusiveLock);
-
    /*
     * Lock the other buffer. It's guaranteed to be of a lower page number
     * than the new page. To conform with the deadlock prevent rules, we ought
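
For a rough sense of how aggressively the new ExtendRelation() grows a relation, the sizing heuristic above can be restated on its own; the sketch_extend_by name and the plain C scaffolding below are illustrative, while the two Max/Min expressions are taken from the patch.

    #define BLCKSZ 8192                     /* default PostgreSQL block size */
    #define Min(x, y) ((x) < (y) ? (x) : (y))
    #define Max(x, y) ((x) > (y) ? (x) : (y))

    /*
     * Grow by roughly 1/16th of the current relation size, capped at 16 MB
     * worth of blocks and at NBuffers / 128, but always by at least one block.
     */
    static int
    sketch_extend_by(unsigned int start_nblocks, int nbuffers)
    {
        int         extendby;

        extendby = Max(Min(start_nblocks / 16 * BLCKSZ, (16 * 1024 * 1024)) / BLCKSZ, 1);
        extendby = Max(Min(extendby, nbuffers / 128), 1);

        return extendby;
    }

For example, a 32 MB relation (4096 blocks) with shared_buffers = 128 MB (NBuffers = 16384) yields 4096 / 16 = 256 blocks (2 MB), which is under the 16 MB cap but is then clamped to NBuffers / 128 = 128, so the relation is extended by 128 blocks (1 MB) in one call.
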
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index a2068e3fd45d83026b97496e1ba0c3139eec4524..326ec8f44a87862e4647846088c10ba5565e66e9 100644
@@ -34,6 +34,7 @@
 #include "catalog/namespace.h"
 #include "catalog/pg_enum.h"
 #include "catalog/storage.h"
+#include "storage/aio.h"
 #include "commands/async.h"
 #include "commands/tablecmds.h"
 #include "commands/trigger.h"
@@ -2211,6 +2212,8 @@ CommitTransaction(void)
     */
    ProcArrayEndTransaction(MyProc, latestXid);
 
+   pgaio_at_commit();
+
    /*
     * This is all post-commit cleanup.  Note that if an error is raised here,
     * it's too late to abort the transaction.  This should be just
@@ -2620,6 +2623,8 @@ AbortTransaction(void)
    AtAbort_Memory();
    AtAbort_ResourceOwner();
 
+   pgaio_at_abort();
+
    /*
     * Release any LW locks we might be holding as quickly as possible.
     * (Regular locks, however, must be held till we finish aborting.)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ed921c49e438da764eda33b4ccc5abc03b1b1e4e..1d168ed9c80d1c956d2fff2fb98747a6e7fe49ba 100644
@@ -59,6 +59,7 @@
 #include "replication/snapbuild.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
@@ -2450,6 +2451,17 @@ XLogCheckpointNeeded(XLogSegNo new_segno)
    return false;
 }
 
+
+void
+XLogWriteComplete(PgAioInProgress *aio, uint32 write_no)
+{
+}
+
+void
+XLogFlushComplete(struct PgAioInProgress *aio, uint32 flush_no)
+{
+}
+
 /*
  * Write and/or fsync the log at least as far as WriteRqst indicates.
  *
@@ -3283,6 +3295,11 @@ XLogNeedsFlush(XLogRecPtr record)
    return true;
 }
 
+static void
+XLogFileInitComplete(void *pgsw_private, void *write_private)
+{
+}
+
 /*
  * Create a new XLOG file segment, or open a pre-existing one.
  *
@@ -3313,6 +3330,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
    XLogSegNo   max_segno;
    int         fd;
    int         save_errno;
+   int         open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
 
    XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
 
@@ -3345,8 +3363,11 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
 
    unlink(tmppath);
 
+   if (io_wal_init_direct)
+       open_flags |= PG_O_DIRECT;
+
    /* do not use get_sync_bit() here --- want to fsync only at end of fill */
-   fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+   fd = BasicOpenFile(tmppath, open_flags);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
@@ -3929,7 +3950,7 @@ XLogFileClose(void)
     * use the cache to read the WAL segment.
     */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-   if (!XLogIsNeeded())
+   if (!XLogIsNeeded() && !io_wal_direct)
        (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -10464,10 +10485,17 @@ get_sync_bit(int method)
 {
    int         o_direct_flag = 0;
 
+   /* make O_DIRECT setting only depend on GUC */
+   if (io_wal_direct)
+       o_direct_flag |= PG_O_DIRECT;
+
+#if 0
    /* If fsync is disabled, never open in sync mode */
    if (!enableFsync)
        return 0;
+#endif
 
+#if 0
    /*
     * Optimize writes by bypassing kernel cache with O_DIRECT when using
     * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
@@ -10476,14 +10504,19 @@ get_sync_bit(int method)
     * read if we bypassed the kernel cache. We also skip the
     * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
     * reason.
-    *
+    */
+   if (!XLogIsNeeded())
+       o_direct_flag = PG_O_DIRECT;
+#endif
+
+   /*
     * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
     * written by walreceiver is normally read by the startup process soon
     * after it's written. Also, walreceiver performs unaligned writes, which
     * don't work with O_DIRECT, so it is required for correctness too.
     */
-   if (!XLogIsNeeded() && !AmWalReceiverProcess())
-       o_direct_flag = PG_O_DIRECT;
+   if (AmWalReceiverProcess())
+       return 0;
 
    switch (method)
    {
@@ -10496,7 +10529,7 @@ get_sync_bit(int method)
        case SYNC_METHOD_FSYNC:
        case SYNC_METHOD_FSYNC_WRITETHROUGH:
        case SYNC_METHOD_FDATASYNC:
-           return 0;
+           return o_direct_flag;
 #ifdef OPEN_SYNC_FLAG
        case SYNC_METHOD_OPEN:
            return OPEN_SYNC_FLAG | o_direct_flag;
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 6f615e66220b36ae056d646a750f12f478808007..604a4f75ac8f48a80c42bac2014f37965dde77b9 100644
@@ -432,6 +432,7 @@ AuxiliaryProcessMain(int argc, char *argv[])
             */
            SetProcessingMode(BootstrapProcessing);
            bootstrap_signals();
+           InitProcess();
            BootStrapXLOG();
            BootstrapModeMain();
            proc_exit(1);       /* should never return */
@@ -502,11 +503,6 @@ BootstrapModeMain(void)
    if (pg_link_canary_is_frontend())
        elog(ERROR, "backend is incorrectly linked to frontend functions");
 
-   /*
-    * Do backend-like initialization for bootstrap mode
-    */
-   InitProcess();
-
    InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
 
    /* Initialize stuff for bootstrap-file processing */
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 5d89e77dbe2f56982dc382bfc20f397130e4c7a8..ceae1d79c502bf7e4b576048409ef1690054e4cc 100644
@@ -1000,6 +1000,16 @@ CREATE VIEW pg_stat_wal AS
         w.stats_reset
     FROM pg_stat_get_wal() w;
 
+CREATE VIEW pg_stat_aio_backends AS
+    /* FIXME: easier to maintain without column names during development */
+    SELECT s.*
+    FROM pg_stat_get_aio_backends() s;
+
+CREATE VIEW pg_stat_aios AS
+    /* FIXME: easier to maintain without column names during development */
+    SELECT s.*
+    FROM pg_stat_get_aios() s;
+
 CREATE VIEW pg_stat_progress_analyze AS
     SELECT
         S.pid AS pid, S.datid AS datid, D.datname AS datname,
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e208b5878dd367bf99358ec8e6b34346656c513c..bb17be74867ff878575975082ea8753f07f7dc16 100644
@@ -47,6 +47,7 @@
 #include "postmaster/bgwriter.h"
 #include "postmaster/interrupt.h"
 #include "replication/syncrep.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
@@ -707,13 +708,20 @@ CheckpointWriteDelay(int flags, double progress)
         */
        pgstat_send_bgwriter();
 
+       /*
+        * Ensure all pending IO is submitted to avoid unnecessary delays
+        * for other processes.
+        */
+       pgaio_submit_pending(true);
+
        /*
         * This sleep used to be connected to bgwriter_delay, typically 200ms.
         * That resulted in more frequent wakeups if not much work to do.
         * Checkpointer and bgwriter are no longer related so take the Big
         * Sleep.
         */
-       pg_usleep(100000L);
+       if (IsCheckpointOnSchedule(progress))
+           pg_usleep(100000L);
    }
    else if (--absorb_counter <= 0)
    {
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index e1de64c60d095f7abc1ed0396f4b3fd803a0d16e..7a0947f10eb3963670caa89769e9832e66877580 100644
@@ -4308,9 +4308,15 @@ pgstat_get_wait_io(WaitEventIO w)
        case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
            event_name = "WALSyncMethodAssign";
            break;
+       case WAIT_EVENT_WAL_WAIT_FLUSH:
+           event_name = "WALWaitFlush";
+           break;
        case WAIT_EVENT_WAL_WAIT_INSERT:
            event_name = "WALWaitInsert";
            break;
+       case WAIT_EVENT_WAL_WAIT_WRITE:
+           event_name = "WALWaitWrite";
+           break;
        case WAIT_EVENT_WAL_WRITE:
            event_name = "WALWrite";
            break;
@@ -4326,6 +4332,21 @@ pgstat_get_wait_io(WaitEventIO w)
        case WAIT_EVENT_LOGICAL_SUBXACT_WRITE:
            event_name = "LogicalSubxactWrite";
            break;
+       case WAIT_EVENT_AIO_SUBMIT:
+           event_name = "AIOSubmit";
+           break;
+       case WAIT_EVENT_AIO_IO_COMPLETE_ANY:
+           event_name = "AIOCompleteAny";
+           break;
+       case WAIT_EVENT_AIO_IO_COMPLETE_ONE:
+           event_name = "AIOCompleteOne";
+           break;
+       case WAIT_EVENT_AIO_REFS:
+           event_name = "AIORefs";
+           break;
+       case WAIT_EVENT_AIO_BACKPRESSURE:
+           event_name = "AIOBackpressure";
+           break;
 
            /* no default case, so that compiler will warn */
    }
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 7de27ee4e0171863faca2f24d62488b773a7636e..5fe083518ea748ff07985e6cf6eb1af82ee16fbf 100644
 #include "postmaster/syslogger.h"
 #include "replication/logicallauncher.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
@@ -1004,6 +1005,13 @@ PostmasterMain(int argc, char *argv[])
     */
    InitializeMaxBackends();
 
+   /*
+    * As AIO might create internal FDs and will trigger shared memory
+    * allocations, this needs to happen before reset_shared() and
+    * set_max_safe_fds().
+    */
+   pgaio_postmaster_init();
+
    /*
     * Set up shared memory and semaphores.
     */
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index d611dac26b87ea3bf02bc89b586b2ccf577c0565..97603a522c061e999b10f19523cd44459ebb359d 100644
@@ -123,6 +123,8 @@ InitBufferPool(void)
 
            buf->buf_id = i;
 
+           pgaio_io_ref_clear(&buf->io_in_progress);
+
            /*
             * Initially link all the buffers together as unused. Subsequent
             * management of this list is done by freelist.c.
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index c5395c3061f4be00758fc9c22cf06e0ac7cb85bb..1b477d26cfe8e9eeb88ee0361821104fc5973f63 100644
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -149,6 +152,18 @@ int            checkpoint_flush_after = 0;
 int            bgwriter_flush_after = 0;
 int            backend_flush_after = 0;
 
+
+/*
+ * GUC variables related to the AIO subsystem.
+ *
+ * XXX: It's not yet clear where these best belong; the WAL-related ones in
+ * particular should probably move.
+ */
+bool       io_data_direct = 0;
+bool       io_data_force_async = 1;
+bool       io_wal_direct = 0;
+bool       io_wal_init_direct = 0;
+
 /* local state for StartBufferIO and related functions */
 static BufferDesc *InProgressBuf = NULL;
 static bool IsForInput;
@@ -453,17 +468,24 @@ static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
                                ForkNumber forkNum, BlockNumber blockNum,
                                ReadBufferMode mode, BufferAccessStrategy strategy,
                                bool *hit);
+static BufferDesc *ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                BlockNumber blockNum, ReadBufferMode mode,
+                BufferAccessStrategy strategy, bool *hit, bool isLocalBuf);
 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
 static void BufferSync(int flags);
 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
-static int SyncOneBuffer(int buf_id, bool skip_recently_used,
-                         WritebackContext *wb_context);
+static int BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+                                pg_streaming_write *pgsw);
 static void WaitIO(BufferDesc *buf);
 static bool StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
-                             uint32 set_flag_bits);
+static void TerminateBufferIO(BufferDesc *buf, bool local, bool syncio,
+                             bool clear_dirty, uint32 set_flag_bits);
+static void TerminateSharedBufferIO(BufferDesc *buf,
+                                   bool sync_io,
+                                   bool clear_dirty,
+                                   uint32 set_flag_bits);
 static void shared_buffer_write_error_callback(void *arg);
 static void local_buffer_write_error_callback(void *arg);
 static BufferDesc *BufferAlloc(SMgrRelation smgr,
@@ -473,6 +495,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
                               BufferAccessStrategy strategy,
                               bool *foundPtr);
 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static bool AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
 static void CheckForBufferLeaks(void);
 static int rnode_comparator(const void *p1, const void *p2);
@@ -485,65 +508,30 @@ static int    ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
  * Implementation of PrefetchBuffer() for shared buffers.
  */
 PrefetchBufferResult
-PrefetchSharedBuffer(SMgrRelation smgr_reln,
+PrefetchSharedBuffer(Relation reln,
+                    SMgrRelation smgr_reln,
                     ForkNumber forkNum,
                     BlockNumber blockNum)
 {
    PrefetchBufferResult result = {InvalidBuffer, false};
-   BufferTag   newTag;         /* identity of requested block */
-   uint32      newHash;        /* hash value for newTag */
-   LWLock     *newPartitionLock;   /* buffer partition lock for it */
-   int         buf_id;
-
-   Assert(BlockNumberIsValid(blockNum));
-
-   /* create a tag so we can lookup the buffer */
-   INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
-                  forkNum, blockNum);
-
-   /* determine its hash code and partition lock ID */
-   newHash = BufTableHashCode(&newTag);
-   newPartitionLock = BufMappingPartitionLock(newHash);
-
-   /* see if the block is in the buffer pool already */
-   LWLockAcquire(newPartitionLock, LW_SHARED);
-   buf_id = BufTableLookup(&newTag, newHash);
-   LWLockRelease(newPartitionLock);
-
-   /* If not in buffers, initiate prefetch */
-   if (buf_id < 0)
-   {
-#ifdef USE_PREFETCH
-       /*
-        * Try to initiate an asynchronous read.  This returns false in
-        * recovery if the relation file doesn't exist.
-        */
-       if (smgrprefetch(smgr_reln, forkNum, blockNum))
-           result.initiated_io = true;
-#endif                         /* USE_PREFETCH */
-   }
-   else
-   {
-       /*
-        * Report the buffer it was in at that time.  The caller may be able
-        * to avoid a buffer table lookup, but it's not pinned and it must be
-        * rechecked!
-        */
-       result.recent_buffer = buf_id + 1;
-   }
+   bool already_valid;
 
    /*
-    * If the block *is* in buffers, we do nothing.  This is not really ideal:
-    * the block might be just about to be evicted, which would be stupid
-    * since we know we are going to need it soon.  But the only easy answer
-    * is to bump the usage_count, which does not seem like a great solution:
-    * when the caller does ultimately touch the block, usage_count would get
-    * bumped again, resulting in too much favoritism for blocks that are
-    * involved in a prefetch sequence. A real fix would involve some
-    * additional per-buffer state, and it's not clear that there's enough of
-    * a problem to justify that.
+    * Report the buffer it was in at that time.  The caller may be able
+    * to avoid a buffer table lookup, but it's not pinned and it must be
+    * rechecked!
     */
 
+   result.recent_buffer = ReadBufferAsync(reln, forkNum, blockNum, RBM_NORMAL,
+                                          NULL, &already_valid, NULL);
+   result.initiated_io = !already_valid;
+
+   if (already_valid)
+       ReleaseBuffer(result.recent_buffer);
+#if 0
+   else
+       pgaio_submit_pending(true);
+#endif
    return result;
 }
 
@@ -594,7 +582,7 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
    else
    {
        /* pass it to the shared buffer version */
-       return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
+       return PrefetchSharedBuffer(reln, reln->rd_smgr, forkNum, blockNum);
    }
 }
 
@@ -707,106 +695,173 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
                             mode, strategy, &hit);
 }
 
+static void
+ReadBufferInitRead(PgAioInProgress *aio,
+                  SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
+                  Buffer buf, BufferDesc *bufHdr, int mode)
+{
+   Block       bufBlock;
+
+   //bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+   if (BufferIsLocal(buf))
+       bufBlock = LocalBufHdrGetBlock(bufHdr);
+   else
+       bufBlock = BufHdrGetBlock(bufHdr);
+
+   /*
+    * if we have gotten to this point, we have allocated a buffer for the
+    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
+    * if it's a shared buffer.
+    */
+   Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));   /* spinlock not needed */
+
+   /* FIXME: improve */
+   InProgressBuf = NULL;
+
+   pgaio_io_ref(aio, &bufHdr->io_in_progress);
+
+   smgrstartread(aio, smgr, forkNum, blockNum,
+                 bufBlock, buf, mode);
+}
+
+Buffer
+ReadBufferAsync(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+               ReadBufferMode mode, BufferAccessStrategy strategy,
+               bool *already_valid, PgAioInProgress **aiop)
+{
+   Buffer buf;
+   BufferDesc *bufHdr;
+   bool hit;
+   bool release_io;
+   PgAioInProgress *aio;
+
+   if (mode != RBM_NORMAL || blockNum == P_NEW)
+       elog(ERROR, "unsupported");
+
+   /*
+    * Don't support AIO for local buffers yet, so just fall back to operating
+    * synchronously. This is important because otherwise callers would all
+    * need to have a non-prefetching fallback implementation.
+    */
+   if (RelationUsesLocalBuffers(reln))
+   {
+       *already_valid = true;
+       return ReadBufferExtended(reln, forkNum, blockNum, mode, strategy);
+   }
+
+   /* Open it at the smgr level if not already done */
+   RelationOpenSmgr(reln);
+
+   pgstat_count_buffer_read(reln);
+
+   bufHdr = ReadBuffer_start(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum,
+                             blockNum, mode, strategy, &hit, false);
+   buf = BufferDescriptorGetBuffer(bufHdr);
+
+   if (hit)
+   {
+       pgstat_count_buffer_hit(reln);
+
+       //Assert(BufferIsPinned(buf));
+       *already_valid = true;
+       return buf;
+   }
+
+   *already_valid = false;
+
+   if (aiop == NULL)
+   {
+       release_io = true;
+       aio = pgaio_io_get();
+   }
+   else if (*aiop == NULL)
+   {
+       release_io = false;
+       *aiop = aio = pgaio_io_get();
+   }
+   else
+   {
+       release_io = false;
+       aio = *aiop;
+   }
+
+   /*
+    * FIXME: Not accurate anymore.
+    * Decrement local pin, but keep shared pin. The latter will be released
+    * upon completion of the IO. Otherwise the buffer could be recycled while
+    * the IO is ongoing.
+    *
+    * FIXME: Make this optional? It's only useful for fire-and-forget style
+    * IO.
+    */
+   if (!release_io)
+   {
+       uint32      buf_state;
+
+       buf_state = LockBufHdr(bufHdr);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(bufHdr, buf_state);
+   }
+   else
+   {
+       PrivateRefCountEntry *ref;
+
+       ref = GetPrivateRefCountEntry(buf, false);
+       Assert(ref != NULL);
+       Assert(ref->refcount > 0);
+
+       ResourceOwnerForgetBuffer(CurrentResourceOwner, buf);
+       ref->refcount--;
+       ForgetPrivateRefCountEntry(ref);
+   }
+
+   ReadBufferInitRead(aio, reln->rd_smgr, forkNum, blockNum, buf, bufHdr, mode);
+
+   if (release_io)
+       pgaio_io_release(aio);
+
+   return buf;
+}
 
-/*
- * ReadBuffer_common -- common logic for all ReadBuffer variants
- *
- * *hit is set to true if the request was satisfied from shared buffer cache.
- */
 static Buffer
-ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ReadBuffer_extend(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                  BlockNumber blockNum, ReadBufferMode mode,
-                 BufferAccessStrategy strategy, bool *hit)
+                 BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
 {
+   bool        found;
    BufferDesc *bufHdr;
    Block       bufBlock;
-   bool        found;
-   bool        isExtend;
-   bool        isLocalBuf = SmgrIsTemp(smgr);
 
    *hit = false;
 
-   /* Make sure we will have room to remember the buffer pin */
-   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
-   isExtend = (blockNum == P_NEW);
-
    TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
                                       smgr->smgr_rnode.node.spcNode,
                                       smgr->smgr_rnode.node.dbNode,
                                       smgr->smgr_rnode.node.relNode,
                                       smgr->smgr_rnode.backend,
-                                      isExtend);
+                                      true);
+
+   /* Make sure we will have room to remember the buffer pin */
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
-   /* Substitute proper block number if caller asked for P_NEW */
-   if (isExtend)
-       blockNum = smgrnblocks(smgr, forkNum);
+   /* Substitute proper block number */
+   blockNum = smgrnblocks(smgr, forkNum);
 
    if (isLocalBuf)
    {
        bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
-       if (found)
-           pgBufferUsage.local_blks_hit++;
-       else if (isExtend)
-           pgBufferUsage.local_blks_written++;
-       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                mode == RBM_ZERO_ON_ERROR)
-           pgBufferUsage.local_blks_read++;
+       pgBufferUsage.local_blks_written++;
    }
    else
    {
-       /*
-        * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
-        * not currently in memory.
-        */
        bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
                             strategy, &found);
-       if (found)
-           pgBufferUsage.shared_blks_hit++;
-       else if (isExtend)
-           pgBufferUsage.shared_blks_written++;
-       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
-                mode == RBM_ZERO_ON_ERROR)
-           pgBufferUsage.shared_blks_read++;
+       pgBufferUsage.shared_blks_written++;
    }
 
-   /* At this point we do NOT hold any locks. */
-
-   /* if it was already in the buffer pool, we're done */
    if (found)
    {
-       if (!isExtend)
-       {
-           /* Just need to update stats before we exit */
-           *hit = true;
-           VacuumPageHit++;
-
-           if (VacuumCostActive)
-               VacuumCostBalance += VacuumCostPageHit;
-
-           TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-                                             smgr->smgr_rnode.node.spcNode,
-                                             smgr->smgr_rnode.node.dbNode,
-                                             smgr->smgr_rnode.node.relNode,
-                                             smgr->smgr_rnode.backend,
-                                             isExtend,
-                                             found);
-
-           /*
-            * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
-            * locked on return.
-            */
-           if (!isLocalBuf)
-           {
-               if (mode == RBM_ZERO_AND_LOCK)
-                   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
-                                 LW_EXCLUSIVE);
-               else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
-                   LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
-           }
-
-           return BufferDescriptorGetBuffer(bufHdr);
-       }
+       Block       bufBlock;
 
        /*
         * We get here only in the corner case where we are trying to extend
@@ -859,13 +914,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                UnlockBufHdr(bufHdr, buf_state);
            } while (!StartBufferIO(bufHdr, true));
        }
+
    }
 
    /*
-    * if we have gotten to this point, we have allocated a buffer for the
-    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
-    * if it's a shared buffer.
-    *
     * Note: if smgrextend fails, we will end up with a buffer that is
     * allocated but not marked BM_VALID.  P_NEW will still select the same
     * block number (because the relation didn't get any longer on disk) and
@@ -877,68 +929,17 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
    bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
-   if (isExtend)
-   {
-       /* new buffers are zero-filled */
-       MemSet((char *) bufBlock, 0, BLCKSZ);
-       /* don't set checksum for all-zero page */
-       smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
-
-       /*
-        * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
-        * although we're essentially performing a write. At least on linux
-        * doing so defeats the 'delayed allocation' mechanism, leading to
-        * increased file fragmentation.
-        */
-   }
-   else
-   {
-       /*
-        * Read in the page, unless the caller intends to overwrite it and
-        * just wants us to allocate a buffer.
-        */
-       if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
-           MemSet((char *) bufBlock, 0, BLCKSZ);
-       else
-       {
-           instr_time  io_start,
-                       io_time;
-
-           if (track_io_timing)
-               INSTR_TIME_SET_CURRENT(io_start);
-
-           smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+   /* new buffers are zero-filled */
+   MemSet((char *) bufBlock, 0, BLCKSZ);
+   /* don't set checksum for all-zero page */
+   smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
 
-           if (track_io_timing)
-           {
-               INSTR_TIME_SET_CURRENT(io_time);
-               INSTR_TIME_SUBTRACT(io_time, io_start);
-               pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
-               INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
-           }
-
-           /* check for garbage data */
-           if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
-                                       PIV_LOG_WARNING | PIV_REPORT_STAT))
-           {
-               if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-               {
-                   ereport(WARNING,
-                           (errcode(ERRCODE_DATA_CORRUPTED),
-                            errmsg("invalid page in block %u of relation %s; zeroing out page",
-                                   blockNum,
-                                   relpath(smgr->smgr_rnode, forkNum))));
-                   MemSet((char *) bufBlock, 0, BLCKSZ);
-               }
-               else
-                   ereport(ERROR,
-                           (errcode(ERRCODE_DATA_CORRUPTED),
-                            errmsg("invalid page in block %u of relation %s",
-                                   blockNum,
-                                   relpath(smgr->smgr_rnode, forkNum))));
-           }
-       }
-   }
+   /*
+    * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
+    * although we're essentially performing a write. At least on linux
+    * doing so defeats the 'delayed allocation' mechanism, leading to
+    * increased file fragmentation.
+    */
 
    /*
     * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
@@ -956,18 +957,226 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
        LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
    }
 
+
+   TerminateBufferIO(bufHdr, isLocalBuf,
+                     /* syncio = */ true, /* clear_dirty = */ false,
+                     BM_VALID);
+
+   return BufferDescriptorGetBuffer(bufHdr);
+}
+
+static BufferDesc *
+ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                BlockNumber blockNum, ReadBufferMode mode,
+                BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
+{
+   BufferDesc *bufHdr;
+   bool        found;
+
+   Assert(blockNum != P_NEW);
+
+   *hit = false;
+
+   /* Make sure we will have room to remember the buffer pin */
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+   TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+                                      smgr->smgr_rnode.node.spcNode,
+                                      smgr->smgr_rnode.node.dbNode,
+                                      smgr->smgr_rnode.node.relNode,
+                                      smgr->smgr_rnode.backend,
+                                      false);
+
    if (isLocalBuf)
    {
-       /* Only need to adjust flags */
-       uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
+       bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+       if (found)
+           pgBufferUsage.local_blks_hit++;
+       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+                mode == RBM_ZERO_ON_ERROR)
+           pgBufferUsage.local_blks_read++;
+   }
+   else
+   {
+       /*
+        * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
+        * not currently in memory.
+        */
+       bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
+                            strategy, &found);
+       if (found)
+           pgBufferUsage.shared_blks_hit++;
+       else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+                mode == RBM_ZERO_ON_ERROR)
+           pgBufferUsage.shared_blks_read++;
+   }
 
-       buf_state |= BM_VALID;
-       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+   /* At this point we do NOT hold any locks. */
+
+   /* if it was already in the buffer pool, we're done */
+   if (found)
+   {
+       /* Just need to update stats before we exit */
+       *hit = true;
+       VacuumPageHit++;
+
+       if (VacuumCostActive)
+           VacuumCostBalance += VacuumCostPageHit;
+
+       TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+                                         smgr->smgr_rnode.node.spcNode,
+                                         smgr->smgr_rnode.node.dbNode,
+                                         smgr->smgr_rnode.node.relNode,
+                                         smgr->smgr_rnode.backend,
+                                         false,
+                                         found);
+
+       /*
+        * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
+        * locked on return.
+        */
+       if (!isLocalBuf)
+       {
+           if (mode == RBM_ZERO_AND_LOCK)
+               LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                             LW_EXCLUSIVE);
+           else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+               LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+       }
+
+       return bufHdr;
+   }
+   else if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+   {
+       /*
+        * The caller intends to overwrite the page and just wants us to
+        * allocate a buffer. Finish IO here, so sync/async don't have to
+        * duplicate the logic.
+        */
+
+       Block       bufBlock;
+
+       bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+
+       MemSet((char *) bufBlock, 0, BLCKSZ);
+
+       /*
+        * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+        * the page as valid, to make sure that no other backend sees the zeroed
+        * page before the caller has had a chance to initialize it.
+        *
+        * Since no-one else can be looking at the page contents yet, there is no
+        * difference between an exclusive lock and a cleanup-strength lock. (Note
+        * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
+        * they assert that the buffer is already valid.)
+        */
+       if (!isLocalBuf)
+       {
+           LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+       }
+
+       TerminateBufferIO(bufHdr, isLocalBuf,
+                         /* syncio = */ true, /* clear_dirty = */ false,
+                         BM_VALID);
+
+       *hit = true;
+   }
+
+   return bufHdr;
+}
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * *hit is set to true if the request was satisfied from shared buffer cache.
+ */
+static Buffer
+ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                 BlockNumber blockNum, ReadBufferMode mode,
+                 BufferAccessStrategy strategy, bool *hit)
+{
+   Block       bufBlock;
+   bool        isLocalBuf = SmgrIsTemp(smgr);
+   BufferDesc *bufHdr;
+   Buffer      buf;
+   instr_time  io_start, io_time;
+
+   if (blockNum == P_NEW)
+       return ReadBuffer_extend(smgr, relpersistence, forkNum,
+                                blockNum, mode, strategy,
+                                hit, isLocalBuf);
+
+
+   bufHdr = ReadBuffer_start(smgr, relpersistence, forkNum,
+                             blockNum, mode, strategy,
+                             hit, isLocalBuf);
+
+   if (*hit)
+       return BufferDescriptorGetBuffer(bufHdr);
+
+   /*
+    * if we have gotten to this point, we have allocated a buffer for the
+    * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
+    * if it's a shared buffer.
+    */
+   Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));   /* spinlock not needed */
+
+   bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+   buf = BufferDescriptorGetBuffer(bufHdr);
+
+   if (isLocalBuf || !io_data_force_async)
+   {
+       if (track_io_timing)
+           INSTR_TIME_SET_CURRENT(io_start);
+
+       smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+
+       if (track_io_timing)
+       {
+           INSTR_TIME_SET_CURRENT(io_time);
+           INSTR_TIME_SUBTRACT(io_time, io_start);
+           pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+           INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+       }
+
+       /* check for garbage data */
+       if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+                                   PIV_LOG_WARNING | PIV_REPORT_STAT))
+       {
+           if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+           {
+               ereport(WARNING,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s; zeroing out page",
+                               blockNum,
+                               relpath(smgr->smgr_rnode, forkNum))));
+               MemSet((char *) bufBlock, 0, BLCKSZ);
+           }
+           else
+               ereport(ERROR,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s",
+                               blockNum,
+                               relpath(smgr->smgr_rnode, forkNum))));
+       }
+
+       TerminateBufferIO(bufHdr, isLocalBuf,
+                         /* syncio = */ true, /* clear_dirty = */ false,
+                         BM_VALID);
    }
    else
    {
-       /* Set BM_VALID, terminate IO, and wake up any waiters */
-       TerminateBufferIO(bufHdr, false, BM_VALID);
+       PgAioInProgress* aio = pgaio_io_get();
+       uint32      buf_state;
+
+       buf_state = LockBufHdr(bufHdr);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(bufHdr, buf_state);
+
+       ReadBufferInitRead(aio, smgr, forkNum, blockNum, buf, bufHdr, mode);
+       pgaio_io_wait(aio);
+       pgaio_io_release(aio);
    }
 
    VacuumPageMiss++;
@@ -979,12 +1188,91 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                      smgr->smgr_rnode.node.dbNode,
                                      smgr->smgr_rnode.node.relNode,
                                      smgr->smgr_rnode.backend,
-                                     isExtend,
-                                     found);
+                                     false,
+                                     false);
 
    return BufferDescriptorGetBuffer(bufHdr);
 }
 
+void
+ReadBufferCompleteRead(Buffer buffer, const AioBufferTag *tag, char *bufdata, int mode, bool failed)
+{
+   /* FIXME: implement track_io_timing */
+
+   if (!failed)
+   {
+       Block       bufBlock = (Block) bufdata;
+       BlockNumber blockNum = tag->blockNum;
+
+       /* check for garbage data */
+       if (!PageIsVerified((Page) bufdata, blockNum))
+       {
+           RelFileNode rnode = tag->rnode.node;
+           ForkNumber  forkNum = tag->forkNum;
+
+           failed = true;
+
+           if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+           {
+               ereport(WARNING,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s; zeroing out page",
+                               blockNum,
+                               relpathperm(rnode, forkNum))));
+               MemSet((char *) bufBlock, 0, BLCKSZ);
+           }
+           else
+           {
+               ereport(ERROR,
+                       (errcode(ERRCODE_DATA_CORRUPTED),
+                        errmsg("invalid page in block %u of relation %s",
+                               blockNum,
+                               relpathperm(rnode, forkNum))));
+           }
+       }
+   }
+
+   if (BufferIsValid(buffer))
+   {
+       bool        islocal = BufferIsLocal(buffer);
+       BufferDesc *bufHdr;
+
+       if (islocal)
+           bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+       else
+           bufHdr = GetBufferDescriptor(buffer - 1);
+
+       TerminateBufferIO(bufHdr, islocal,
+                         /* syncio = */ false, /* clear_dirty = */ false,
+                         failed ? BM_IO_ERROR : BM_VALID);
+   }
+}
+
+void
+ReadBufferCompleteWrite(Buffer buffer, bool failed, bool release_lock)
+{
+   BufferDesc *bufHdr;
+   bool        islocal = BufferIsLocal(buffer);
+
+   if (islocal)
+       bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+   else
+       bufHdr = GetBufferDescriptor(buffer - 1);
+
+   /* FIXME: implement track_io_timing */
+
+   TerminateBufferIO(bufHdr, islocal,
+                     /* syncio = */ false, /* clear_dirty = */ true,
+                     failed ? BM_IO_ERROR : 0);
+
+   /*
+    * The initiator of the IO is not managing the lock (i.e. it called
+    * LWLockReleaseOwnership()); we are.
+    */
+   if (release_lock)
+       LWLockReleaseUnowned(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+}
+
 /*
  * BufferAlloc -- subroutine for ReadBuffer.  Handles lookup of a shared
  *     buffer.  If no buffer exists already, selects a replacement
@@ -1022,6 +1310,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    bool        valid;
    uint32      buf_state;
 
+   Assert(blockNum != P_NEW);
+
    /* create a tag so we can lookup the buffer */
    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
@@ -1432,30 +1722,402 @@ retry:
        goto retry;
    }
 
-   /*
-    * Clear out the buffer's tag and flags.  We must do this to ensure that
-    * linear scans of the buffer array don't think the buffer is valid.
-    */
-   oldFlags = buf_state & BUF_FLAG_MASK;
-   CLEAR_BUFFERTAG(buf->tag);
-   buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
-   UnlockBufHdr(buf, buf_state);
+   /*
+    * Clear out the buffer's tag and flags.  We must do this to ensure that
+    * linear scans of the buffer array don't think the buffer is valid.
+    */
+   oldFlags = buf_state & BUF_FLAG_MASK;
+   CLEAR_BUFFERTAG(buf->tag);
+   buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+   UnlockBufHdr(buf, buf_state);
+
+   /*
+    * Remove the buffer from the lookup hashtable, if it was in there.
+    */
+   if (oldFlags & BM_TAG_VALID)
+       BufTableDelete(&oldTag, oldHash);
+
+   /*
+    * Done with mapping lock.
+    */
+   LWLockRelease(oldPartitionLock);
+
+   /*
+    * Insert the buffer at the head of the list of free buffers.
+    */
+   StrategyFreeBuffer(buf);
+}
+
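+/*
+ * Try to invalidate the old identity of a pinned victim buffer.  Returns
+ * false if the buffer has concurrently been pinned or dirtied by somebody
+ * else and therefore cannot be reused.
+ */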
+static bool
+bulk_extend_buffer_inval(BufferDesc *buf_hdr)
+{
+   uint32      buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+   Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+   Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) > 0);
+
+   /* can't change while we're holding the pin */
+   if (buf_state & BM_TAG_VALID)
+   {
+       uint32      hash;
+       LWLock     *partition_lock;
+       BufferTag   tag;
+       uint32      old_flags;
+
+       /* have buffer pinned, so it's safe to read tag without lock */
+       tag = buf_hdr->tag;
+       hash = BufTableHashCode(&tag);
+       partition_lock = BufMappingPartitionLock(hash);
+
+       LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+       /* lock the buffer header */
+       buf_state = LockBufHdr(buf_hdr);
+       old_flags = buf_state & BUF_FLAG_MASK;
+
+       /*
+        * If somebody else pinned the buffer since, or even worse, dirtied it,
+        * give up on this buffer: It's clearly in use.
+        */
+       if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
+       {
+           Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+           UnlockBufHdr(buf_hdr, buf_state);
+           LWLockRelease(partition_lock);
+
+           return false;
+       }
+
+       /*
+        * Clear out the buffer's tag and flags.  We must do this to ensure that
+        * linear scans of the buffer array don't think the buffer is valid.
+        */
+       CLEAR_BUFFERTAG(buf_hdr->tag);
+       buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+       UnlockBufHdr(buf_hdr, buf_state);
+
+       if (old_flags & BM_TAG_VALID)
+           BufTableDelete(&tag, hash);
+
+       LWLockRelease(partition_lock);
+   }
+
+   return true;
+}
+
+
+typedef struct BulkExtendOneBuffer
+{
+   BufferDesc *buf_hdr;
+   dlist_node node;
+} BulkExtendOneBuffer;
+
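+/*
+ * State for one bulk extension.  Each BulkExtendOneBuffer slot is on exactly
+ * one of three lists: allocated_buffers (no usable victim buffer yet),
+ * pending_buffers (victim buffer with a write still in flight), or
+ * acquired_buffers (clean, pinned victim buffer ready for a new identity).
+ */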
+typedef struct BulkExtendBufferedState
+{
+   int acquired_buffers_count;
+   int pending_buffers_count;
+
+   dlist_head acquired_buffers;
+   dlist_head pending_buffers;
+
+   dlist_head allocated_buffers;
+
+   pg_streaming_write *pgsw;
+
+   BulkExtendOneBuffer ios[];
+} BulkExtendBufferedState;
+
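+/*
+ * Streaming-write completion callback for bulk extension: once the victim
+ * buffer's write has completed, try to invalidate its old identity.  If that
+ * succeeds the buffer joins the acquired list, otherwise it is unpinned and
+ * its slot returned to the allocated list.
+ */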
+static void
+bulk_extend_undirty_complete(void *pgsw_private, void *write_private)
+{
+   BulkExtendBufferedState *be_state = (BulkExtendBufferedState * ) pgsw_private;
+   BulkExtendOneBuffer *ex_buf = (BulkExtendOneBuffer *) write_private;
+   BufferTag   tag;
+
+   /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+   tag = ex_buf->buf_hdr->tag;
+   be_state->pending_buffers_count--;
+   dlist_delete_from(&be_state->pending_buffers, &ex_buf->node);
+
+   if (bulk_extend_buffer_inval(ex_buf->buf_hdr))
+   {
+       dlist_push_head(&be_state->acquired_buffers, &ex_buf->node);
+       be_state->acquired_buffers_count++;
+   }
+   else
+   {
+       dlist_push_tail(&be_state->allocated_buffers, &ex_buf->node);
+       UnpinBuffer(ex_buf->buf_hdr, true);
+   }
+
+   ScheduleBufferTagForWriteback(&BackendWritebackContext, &tag);
+}
+
+/*
+ * WIP interface to more efficient relation extension.
+ *
+ * Todo:
+ *
+ * - Write initialized buffers - otherwise we'll waste a lot of time doing
+ *   another set of memsets at PageInit(), as well as making PageIsVerified()
+ *   a lot more expensive (verifying all-zeroes).
+ * - De-duplication of work between concurrent extensions?
+ * - Chunking, to avoid pinning quite as many buffers at once
+ * - Strategy integration
+ * - cleanup
+ *
+ */
+extern Buffer
+BulkExtendBuffered(Relation relation, ForkNumber forkNum, int extendby, BufferAccessStrategy strategy)
+{
+   BulkExtendBufferedState *be_state;
+   bool        need_extension_lock = !RELATION_IS_LOCAL(relation);
+   BlockNumber start_nblocks;
+   SMgrRelation smgr;
+   BufferDesc *return_buf_hdr = NULL;
+   char relpersistence = relation->rd_rel->relpersistence;
+   dlist_iter iter;
+   BlockNumber extendto;
+   bool first;
+
+   RelationOpenSmgr(relation);
+   smgr = relation->rd_smgr;
+
+   be_state = palloc0(offsetof(BulkExtendBufferedState, ios) + sizeof(BulkExtendOneBuffer) * extendby);
+
+   dlist_init(&be_state->acquired_buffers);
+   dlist_init(&be_state->pending_buffers);
+   dlist_init(&be_state->allocated_buffers);
+
+   for (int i = 0; i < extendby; i++)
+   {
+       dlist_push_tail(&be_state->allocated_buffers, &be_state->ios[i].node);
+   }
+
+   be_state->pgsw = pg_streaming_write_alloc(128, be_state, bulk_extend_undirty_complete);
+
+   while (be_state->acquired_buffers_count < extendby)
+   {
+       uint32 cur_old_flags;
+       uint32 cur_buf_state;
+       BufferDesc *cur_buf_hdr;
+       BulkExtendOneBuffer *cur_ex_buf = NULL;
+       bool buffer_usable;
+       bool buffer_io;
+
+       /*
+        * If there are buffers being written out that might or might not
+        * become available, depending on whether they have been pinned
+        * concurrently, wait for all of those writes to finish and check
+        * again.
+        */
+       if ((be_state->acquired_buffers_count + be_state->pending_buffers_count) >= extendby)
+       {
+           pg_streaming_write_wait_all(be_state->pgsw);
+           continue;
+       }
+
+       Assert(!dlist_is_empty(&be_state->allocated_buffers));
+       cur_ex_buf = dlist_container(BulkExtendOneBuffer, node,
+                                    dlist_pop_head_node(&be_state->allocated_buffers));
+
+       ReservePrivateRefCountEntry();
+       ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+       cur_buf_hdr = StrategyGetBuffer(NULL, &cur_buf_state);
+       cur_ex_buf->buf_hdr = cur_buf_hdr;
+
+       Assert(BUF_STATE_GET_REFCOUNT(cur_buf_state) == 0);
+
+       /* Must copy buffer flags while we still hold the spinlock */
+       cur_old_flags = cur_buf_state & BUF_FLAG_MASK;
+
+       /* Pin the buffer and then release the buffer spinlock */
+       PinBuffer_Locked(cur_buf_hdr);
+
+       if (cur_old_flags & BM_DIRTY)
+       {
+           LWLock *content_lock;
+           PgAioInProgress *aio;
+
+           content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+           /*
+            * NB: this protects against deadlocks due to holding multiple
+            * buffer locks, and also avoids unnecessary blocking (see
+            * BufferAlloc() for the latter).
+            */
+           if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+           {
+               aio = pg_streaming_write_get_io(be_state->pgsw);
+
+               // XXX: could use strategy reject logic here too
+               if (AsyncFlushBuffer(aio, cur_buf_hdr, NULL))
+               {
+                   buffer_usable = true;
+                   buffer_io = true;
+
+                   pg_streaming_write_write(be_state->pgsw, aio, cur_ex_buf);
+               }
+               else
+               {
+                   buffer_io = false;
+
+                   if (!bulk_extend_buffer_inval(cur_buf_hdr))
+                       buffer_usable = false;
+                   else
+                       buffer_usable = true;
+
+                   LWLockRelease(content_lock);
+               }
+           }
+           else
+           {
+               /*
+                * Someone else has locked the buffer, so give it up and loop
+                * back to get another one.
+                */
+               buffer_usable = false;
+               buffer_io = false;
+           }
+       }
+       else
+       {
+           /*
+            * This buffer can be used, unless it's getting pinned
+            * concurrently.
+            */
+           buffer_io = false;
+
+           if (!bulk_extend_buffer_inval(cur_buf_hdr))
+               buffer_usable = false;
+           else
+               buffer_usable = true;
+       }
+
+       if (buffer_io)
+       {
+           dlist_push_tail(&be_state->pending_buffers, &cur_ex_buf->node);
+           be_state->pending_buffers_count++;
+       }
+       else if (buffer_usable)
+       {
+           dlist_push_head(&be_state->acquired_buffers, &cur_ex_buf->node);
+           be_state->acquired_buffers_count++;
+       }
+       else
+       {
+           UnpinBuffer(cur_ex_buf->buf_hdr, true);
+           dlist_push_tail(&be_state->allocated_buffers, &cur_ex_buf->node);
+       }
+   }
+
+   // FIXME: shouldn't be needed
+   pg_streaming_write_wait_all(be_state->pgsw);
+   pg_streaming_write_free(be_state->pgsw);
 
    /*
-    * Remove the buffer from the lookup hashtable, if it was in there.
+    * Now we have our hands on N buffers that are guaranteed to be clean
+    * (since they are pinned, they cannot be reused by other backends).
+    *
+    * Next, acquire the extension lock, extend the relation, and point the
+    * victim buffers acquired above at the newly extended part of the
+    * relation.
     */
-   if (oldFlags & BM_TAG_VALID)
-       BufTableDelete(&oldTag, oldHash);
+   if (need_extension_lock)
+       LockRelationForExtension(relation, ExclusiveLock);
 
-   /*
-    * Done with mapping lock.
-    */
-   LWLockRelease(oldPartitionLock);
+   start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+   extendto = start_nblocks;
 
    /*
-    * Insert the buffer at the head of the list of free buffers.
+    * Set up the identities of all the new buffers first. That way there can
+    * be no race where another backend locks the returned page before we do.
     */
-   StrategyFreeBuffer(buf);
+   first = true;
+   dlist_foreach(iter, &be_state->acquired_buffers)
+   {
+       BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+       BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+       BufferTag   new_tag;
+       uint32      new_hash;
+       int         existing_buf;
+       uint32      buf_state;
+       LWLock     *partition_lock;
+
+       Assert(extendto < start_nblocks + extendby);
+       Assert(ex_buf->buf_hdr);
+
+       INIT_BUFFERTAG(new_tag, smgr->smgr_rnode.node, forkNum, extendto);
+       new_hash = BufTableHashCode(&new_tag);
+
+       partition_lock = BufMappingPartitionLock(new_hash);
+       LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+       existing_buf = BufTableInsert(&new_tag, new_hash, new_buf_hdr->buf_id);
+       if (existing_buf >= 0)
+       {
+           /* FIXME: This is probably possible when extension fails due to ENOSPC or such */
+           elog(ERROR, "buffer beyond EOF");
+       }
+
+       /* lock to install new identity */
+       buf_state = LockBufHdr(new_buf_hdr);
+
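+       /*
+        * Set BM_IO_IN_PROGRESS so that any other backend that finds the
+        * buffer through the new mapping entry waits in WaitIO() until the
+        * page has been zero-filled and marked BM_VALID below.
+        */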
+       buf_state |= BM_TAG_VALID | BM_IO_IN_PROGRESS | BUF_USAGECOUNT_ONE * BM_MAX_USAGE_COUNT;
+
+       if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
+           buf_state |= BM_PERMANENT;
+
+       new_buf_hdr->tag = new_tag;
+       UnlockBufHdr(new_buf_hdr, buf_state);
+
+       LWLockRelease(partition_lock);
+
+       if (first)
+       {
+           return_buf_hdr = ex_buf->buf_hdr;
+           first = false;
+       }
+
+       extendto++;
+   }
+   Assert(extendto == start_nblocks + extendby);
+
+   /* finally extend the relation */
+   smgrzeroextend(relation->rd_smgr, forkNum, start_nblocks,
+                  extendby, false);
+
+   /* Ensure that the returned buffer cannot be reached by another backend first */
+   LWLockAcquire(BufferDescriptorGetContentLock(return_buf_hdr), LW_EXCLUSIVE);
+
+   /* Mark all buffers as having completed */
+   dlist_foreach(iter, &be_state->acquired_buffers)
+   {
+       BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+       BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+       Block       new_buf_block;
+       uint32      buf_state;
+
+       new_buf_block = BufHdrGetBlock(new_buf_hdr);
+
+       /* new buffers are zero-filled */
+       memset((char *) __builtin_assume_aligned(new_buf_block, 4096), 0, BLCKSZ);
+
+       buf_state = LockBufHdr(new_buf_hdr);
+       buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+       buf_state |= BM_VALID;
+       UnlockBufHdr(new_buf_hdr, buf_state);
+       ConditionVariableBroadcast(BufferDescriptorGetIOCV(new_buf_hdr));
+
+       if (new_buf_hdr != return_buf_hdr)
+           UnpinBuffer(new_buf_hdr, true);
+   }
+
+   if (need_extension_lock)
+       UnlockRelationForExtension(relation, ExclusiveLock);
+
+   return BufferDescriptorGetBuffer(return_buf_hdr);
 }
 
 /*
@@ -1822,6 +2484,79 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
    }
 }
 
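+/*
+ * Streaming-write completion callback used by checkpointer and bgwriter:
+ * drop the pin held for the duration of the IO and schedule the buffer for
+ * writeback.
+ */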
+static void
+buffer_sync_complete(void *pgsw_private, void *write_private)
+{
+   WritebackContext *wb_context = (WritebackContext *) pgsw_private;
+   BufferDesc *bufHdr = (BufferDesc *) write_private;
+   BufferTag   tag;
+
+   /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+   tag = bufHdr->tag;
+   UnpinBuffer(bufHdr, true);
+
+   if (wb_context)
+       ScheduleBufferTagForWriteback(wb_context, &tag);
+}
+
+static bool
+BufferSyncWriteOne(pg_streaming_write *pgsw, BufferDesc *bufHdr)
+{
+   uint32 buf_state;
+   bool did_write = false;
+
+   ReservePrivateRefCountEntry();
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+   buf_state = LockBufHdr(bufHdr);
+
+   if ((buf_state & BM_VALID) && (buf_state & BM_DIRTY))
+   {
+       LWLock *content_lock;
+       PgAioInProgress *aio;
+
+       PinBuffer_Locked(bufHdr);
+
+       content_lock = BufferDescriptorGetContentLock(bufHdr);
+
+       aio = pg_streaming_write_get_io(pgsw);
+
+       /*
+        * If there are pre-existing IOs in flight, we can't block on the
+        * content lock, as that could lead to a deadlock. In that case first
+        * wait for the outstanding IO, and only then block on acquiring the
+        * lock.
+        */
+       if (pg_streaming_write_inflight(pgsw) > 0 &&
+           LWLockConditionalAcquire(content_lock, LW_SHARED))
+       {
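+           /* got the content lock without blocking */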
+       }
+       else
+       {
+           pg_streaming_write_wait_all(pgsw);
+           LWLockAcquire(content_lock, LW_SHARED);
+       }
+
+       if (AsyncFlushBuffer(aio, bufHdr, NULL))
+       {
+           pg_streaming_write_write(pgsw, aio, bufHdr);
+           did_write = true;
+       }
+       else
+       {
+           LWLockRelease(content_lock);
+           UnpinBuffer(bufHdr, true);
+       }
+   }
+   else
+   {
+       UnlockBufHdr(bufHdr, buf_state);
+   }
+
+   return did_write;
+}
+
 /*
  * BufferSync -- Write out all dirty buffers in the pool.
  *
@@ -1846,6 +2581,7 @@ BufferSync(int flags)
    binaryheap *ts_heap;
    int         i;
    int         mask = BM_DIRTY;
+   pg_streaming_write *pgsw;
    WritebackContext wb_context;
 
    /* Make sure we can handle the pin inside SyncOneBuffer */
@@ -1860,6 +2596,8 @@ BufferSync(int flags)
                    CHECKPOINT_FLUSH_ALL))))
        mask |= BM_PERMANENT;
 
+   elog(DEBUG1, "checkpoint looking at buffers");
+
    /*
     * Loop over all buffers, and mark the ones that need to be written with
     * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
@@ -1913,8 +2651,12 @@ BufferSync(int flags)
 
    WritebackContextInit(&wb_context, &checkpoint_flush_after);
 
+   pgsw = pg_streaming_write_alloc(128, &wb_context, buffer_sync_complete);
+
    TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
 
+   elog(DEBUG1, "checkpoint predicts to write %u buffers", num_to_scan);
+
    /*
     * Sort buffers that need to be written to reduce the likelihood of random
     * IO. The sorting is also important for the implementation of balancing
@@ -1927,6 +2669,8 @@ BufferSync(int flags)
 
    num_spaces = 0;
 
+   elog(DEBUG1, "checkpoint done sorting");
+
    /*
     * Allocate progress status for each tablespace with buffers that need to
     * be flushed. This requires the to-be-flushed array to be sorted.
@@ -2012,6 +2756,8 @@ BufferSync(int flags)
 
    binaryheap_build(ts_heap);
 
+   elog(DEBUG1, "checkpoint done heaping");
+
    /*
     * Iterate through to-be-checkpointed buffers and write the ones (still)
     * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
@@ -2047,7 +2793,7 @@ BufferSync(int flags)
         */
        if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
        {
-           if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+           if (BufferSyncWriteOne(pgsw, bufHdr))
            {
                TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
                BgWriterStats.m_buf_written_checkpoints++;
@@ -2082,6 +2828,9 @@ BufferSync(int flags)
        CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
    }
 
+   pg_streaming_write_wait_all(pgsw);
+   pg_streaming_write_free(pgsw);
+
    /* issue all pending flushes */
    IssuePendingWritebacks(&wb_context);
 
@@ -2153,6 +2902,8 @@ BgBufferSync(WritebackContext *wb_context)
    long        new_strategy_delta;
    uint32      new_recent_alloc;
 
+   pg_streaming_write *pgsw;
+
    /*
     * Find out where the freelist clock sweep currently is, and how many
     * buffer allocations have happened since our last call.
@@ -2318,6 +3069,8 @@ BgBufferSync(WritebackContext *wb_context)
        upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
    }
 
+   pgsw = pg_streaming_write_alloc(128, wb_context, buffer_sync_complete);
+
    /*
     * Now write out dirty reusable buffers, working forward from the
     * next_to_clean point, until we have lapped the strategy scan, or cleaned
@@ -2335,8 +3088,8 @@ BgBufferSync(WritebackContext *wb_context)
    /* Execute the LRU scan */
    while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
    {
-       int         sync_state = SyncOneBuffer(next_to_clean, true,
-                                              wb_context);
+       int         sync_state =
+           BgBufferSyncWriteOne(next_to_clean, true, pgsw);
 
        if (++next_to_clean >= NBuffers)
        {
@@ -2358,6 +3111,9 @@ BgBufferSync(WritebackContext *wb_context)
            reusable_buffers++;
    }
 
+   pg_streaming_write_wait_all(pgsw);
+   pg_streaming_write_free(pgsw);
+
    BgWriterStats.m_buf_written_clean += num_written;
 
 #ifdef BGW_DEBUG
@@ -2413,14 +3169,17 @@ BgBufferSync(WritebackContext *wb_context)
  * Note: caller must have done ResourceOwnerEnlargeBuffers.
  */
 static int
-SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+                    pg_streaming_write *pgsw)
 {
    BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
    int         result = 0;
    uint32      buf_state;
-   BufferTag   tag;
+   LWLock *content_lock;
+   PgAioInProgress *aio;
 
    ReservePrivateRefCountEntry();
+   ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
    /*
     * Check whether buffer needs writing.
@@ -2452,22 +3211,37 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
        return result;
    }
 
-   /*
-    * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
-    * buffer is clean by the time we've locked it.)
-    */
    PinBuffer_Locked(bufHdr);
-   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-
-   FlushBuffer(bufHdr, NULL);
 
-   LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+   aio = pg_streaming_write_get_io(pgsw);
 
-   tag = bufHdr->tag;
+   content_lock = BufferDescriptorGetContentLock(bufHdr);
 
-   UnpinBuffer(bufHdr, true);
+   /*
+    * If there are pre-existing IOs in flight, we can't block on the
+    * content lock, as that could lead to a deadlock. In that case first
+    * wait for the outstanding IO, and only then block on acquiring the
+    * lock.
+    */
+   if (pg_streaming_write_inflight(pgsw) > 0 &&
+       LWLockConditionalAcquire(content_lock, LW_SHARED))
+   {
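+       /* got the content lock without blocking */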
+   }
+   else
+   {
+       pg_streaming_write_wait_all(pgsw);
+       LWLockAcquire(content_lock, LW_SHARED);
+   }
 
-   ScheduleBufferTagForWriteback(wb_context, &tag);
+   if (AsyncFlushBuffer(aio, bufHdr, NULL))
+   {
+       pg_streaming_write_write(pgsw, aio, bufHdr);
+       result |= BUF_WRITTEN;
+   }
+   else
+   {
+       LWLockRelease(content_lock);
+       UnpinBuffer(bufHdr, true);
+   }
 
-   return result | BUF_WRITTEN;
+   return result;
 }
@@ -2584,7 +3358,6 @@ CheckForBufferLeaks(void)
            PrintBufferLeakWarning(res->buffer);
            RefCountErrors++;
        }
-
    }
 
    Assert(RefCountErrors == 0);
@@ -2731,6 +3504,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    Block       bufBlock;
    char       *bufToWrite;
    uint32      buf_state;
+   PgAioBounceBuffer *bb;
 
    /*
     * Try to start an I/O operation.  If StartBufferIO returns false, then
@@ -2788,6 +3562,8 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    if (buf_state & BM_PERMANENT)
        XLogFlush(recptr);
 
+   pgBufferUsage.shared_blks_written++;
+
    /*
     * Now it's safe to write buffer to disk. Note that no one else should
     * have been able to write it while we were busy with log flushing because
@@ -2800,35 +3576,68 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
     * buffer, other processes might be updating hint bits in it, so we must
     * copy the page to private storage if we do checksumming.
     */
-   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum,
+                                    io_data_force_async ? &bb : NULL);
 
-   if (track_io_timing)
-       INSTR_TIME_SET_CURRENT(io_start);
+   if (!io_data_force_async)
+   {
+       if (track_io_timing)
+           INSTR_TIME_SET_CURRENT(io_start);
 
-   /*
-    * bufToWrite is either the shared buffer or a copy, as appropriate.
-    */
-   smgrwrite(reln,
-             buf->tag.forkNum,
-             buf->tag.blockNum,
-             bufToWrite,
-             false);
+       /*
+        * bufToWrite is either the shared buffer or a copy, as appropriate.
+        */
+       smgrwrite(reln,
+                 buf->tag.forkNum,
+                 buf->tag.blockNum,
+                 bufToWrite,
+                 false);
 
-   if (track_io_timing)
-   {
-       INSTR_TIME_SET_CURRENT(io_time);
-       INSTR_TIME_SUBTRACT(io_time, io_start);
-       pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
-       INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+       if (track_io_timing)
+       {
+           INSTR_TIME_SET_CURRENT(io_time);
+           INSTR_TIME_SUBTRACT(io_time, io_start);
+           pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+           INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+       }
+
+       /*
+        * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
+        * end the BM_IO_IN_PROGRESS state.
+        */
+       TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ true, 0);
    }
+   else
+   {
+       PgAioInProgress *aio = pgaio_io_get();
+       uint32      buf_state;
 
-   pgBufferUsage.shared_blks_written++;
+       buf_state = LockBufHdr(buf);
+       buf_state += BUF_REFCOUNT_ONE;
+       UnlockBufHdr(buf, buf_state);
 
-   /*
-    * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
-    * end the BM_IO_IN_PROGRESS state.
-    */
-   TerminateBufferIO(buf, true, 0);
+       /* FIXME: improve */
+       InProgressBuf = NULL;
+
+       if (bb)
+       {
+           pgaio_assoc_bounce_buffer(aio, bb);
+           pgaio_bounce_buffer_release(bb);
+       }
+
+       pgaio_io_ref(aio, &buf->io_in_progress);
+
+       smgrstartwrite(aio,
+                      reln,
+                      buf->tag.forkNum,
+                      buf->tag.blockNum,
+                      bufToWrite,
+                      BufferDescriptorGetBuffer(buf),
+                      /* skipFsync = */ false,
+                      /* release_lock = */ false);
+       pgaio_io_wait(aio);
+       pgaio_io_release(aio);
+   }
 
    TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
                                       buf->tag.blockNum,
@@ -2840,6 +3649,100 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
    error_context_stack = errcallback.previous;
 }
 
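+/*
+ * Like FlushBuffer(), but issues the write asynchronously using the
+ * caller-provided AIO handle and hands content lock ownership over to the
+ * completion path.  Returns false if somebody else already flushed the
+ * buffer, in which case no IO is started.
+ */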
+static bool
+AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln)
+{
+   XLogRecPtr  recptr;
+   Block       bufBlock;
+   char       *bufToWrite;
+   uint32      buf_state;
+   PgAioBounceBuffer *bb;
+
+   Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_SHARED));
+
+   /*
+    * Try to start an I/O operation.  If StartBufferIO returns false, then
+    * someone else flushed the buffer before we could, so we need not do
+    * anything.
+    */
+   if (!StartBufferIO(buf, false))
+       return false;
+
+   /* Find smgr relation for buffer */
+   if (reln == NULL)
+       reln = smgropen(buf->tag.rnode, InvalidBackendId);
+
+   buf_state = LockBufHdr(buf);
+
+   /*
+    * Run PageGetLSN while holding header lock, since we don't have the
+    * buffer locked exclusively in all cases.
+    */
+   recptr = BufferGetLSN(buf);
+
+   /* To check if block content changes while flushing. - vadim 01/17/97 */
+   buf_state &= ~BM_JUST_DIRTIED;
+
+   /* ownership while in AIO subsystem */
+   buf_state += BUF_REFCOUNT_ONE;
+
+   UnlockBufHdr(buf, buf_state);
+
+   if (buf_state & BM_PERMANENT)
+       XLogFlush(recptr);
+
+   bufBlock = BufHdrGetBlock(buf);
+
+   bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum, &bb);
+
+   /* FIXME: improve */
+   InProgressBuf = NULL;
+
+   pgBufferUsage.shared_blks_written++;
+
+   if (bb)
+   {
+       pgaio_assoc_bounce_buffer(aio, bb);
+       pgaio_bounce_buffer_release(bb);
+   }
+
+   pgaio_io_ref(aio, &buf->io_in_progress);
+
+   /*
+    * Ask the completion routine to release the lock for us. That's important
+    * for two reasons:
+    *
+    * 1) It allows other backends to move the block into a modifiable state
+    *    by completing the IO, avoiding some deadlock risks. Otherwise this
+    *    process would need to ensure the IO is completed before being
+    *    allowed to sleep.
+    *
+    * 2) For processes doing lots of streaming writes (e.g. checkpointer) the
+    *    lwlock ownership management turns out to be very expensive, because
+    *    lwlocks for locks acquired earlier are also likely to be released
+    *    earlier, leading to held_lwlocks needing to be shifted around.
+    *
+    * This is safe because we only release the lock ownership once the AIO
+    * subsystem successfully started tracking the IO.
+    */
+
+   smgrstartwrite(aio, reln,
+                  buf->tag.forkNum, buf->tag.blockNum,
+                  bufToWrite,
+                  BufferDescriptorGetBuffer(buf),
+                  /* skipFsync = */ false,
+                  /* release_lock = */ true);
+
+   /*
+    * XXX: The lock ownership release should probably be moved into the AIO
+    * layer, so we correctly handle errors happening during IO submission.
+    */
+   LWLockReleaseOwnership(BufferDescriptorGetContentLock(buf));
+   RESUME_INTERRUPTS();
+
+   return true;
+}
+
 /*
  * RelationGetNumberOfBlocksInFork
  *     Determines the current number of pages in the specified relation fork.
@@ -4104,6 +5007,7 @@ WaitIO(BufferDesc *buf)
    for (;;)
    {
        uint32      buf_state;
+       PgAioIoRef  aio_ref;
 
        /*
         * It may not be necessary to acquire the spinlock to check the flag
@@ -4111,10 +5015,19 @@ WaitIO(BufferDesc *buf)
         * play it safe.
         */
        buf_state = LockBufHdr(buf);
+       aio_ref = buf->io_in_progress;
        UnlockBufHdr(buf, buf_state);
 
        if (!(buf_state & BM_IO_IN_PROGRESS))
            break;
+
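+       /*
+        * If the in-progress IO is tracked by the AIO subsystem, wait for it
+        * via its reference rather than just sleeping on the condition
+        * variable.
+        */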
+       if (pgaio_io_ref_valid(&aio_ref))
+       {
+           pgaio_io_wait_ref(&aio_ref, true);
+           ConditionVariablePrepareToSleep(cv);
+           continue;
+       }
+
        ConditionVariableSleep(cv, WAIT_EVENT_BUFFILE_WAITIO);
    }
    ConditionVariableCancelSleep();
@@ -4173,8 +5086,26 @@ StartBufferIO(BufferDesc *buf, bool forInput)
    return true;
 }
 
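+/*
+ * Wrapper around buffer IO termination that also handles local buffers, for
+ * which only the flag bits need to be adjusted.
+ */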
+static void
+TerminateBufferIO(BufferDesc *bufHdr, bool local, bool syncio,
+                 bool clear_dirty, uint32 set_flag_bits)
+{
+   if (local)
+   {
+       /* Only need to adjust flags */
+       uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+       buf_state |= set_flag_bits;
+       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+   }
+   else
+   {
+       TerminateSharedBufferIO(bufHdr, syncio, clear_dirty, set_flag_bits);
+   }
+}
+
 /*
- * TerminateBufferIO: release a buffer we were doing I/O on
+ * TerminateSharedBufferIO: release a buffer we were doing I/O on
  * (Assumptions)
  * My process is executing IO for the buffer
  * BM_IO_IN_PROGRESS bit is set for the buffer
@@ -4190,11 +5121,12 @@ StartBufferIO(BufferDesc *buf, bool forInput)
  * be 0, or BM_VALID if we just finished reading in the page.
  */
 static void
-TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+TerminateSharedBufferIO(BufferDesc *buf, bool syncio, bool clear_dirty, uint32 set_flag_bits)
 {
    uint32      buf_state;
 
-   Assert(buf == InProgressBuf);
+   if (syncio)
+       Assert(buf == InProgressBuf);
 
    buf_state = LockBufHdr(buf);
 
@@ -4205,11 +5137,46 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
        buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
 
    buf_state |= set_flag_bits;
+
+   if (!syncio)
+   {
+       buf_state -= BUF_REFCOUNT_ONE;
+       pgaio_io_ref_clear(&buf->io_in_progress);
+   }
+
    UnlockBufHdr(buf, buf_state);
 
-   InProgressBuf = NULL;
+   if (syncio)
+       InProgressBuf = NULL;
 
    ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
+
+
+   /* Support LockBufferForCleanup() */
+   if (!syncio && buf_state & BM_PIN_COUNT_WAITER)
+   {
+       /*
+        * Acquire the buffer header lock, re-check that there's a waiter.
+        * Another backend could have unpinned this buffer, and already
+        * woken up the waiter.  There's no danger of the buffer being
+        * replaced after we unpinned it above, as it's pinned by the
+        * waiter.
+        */
+       buf_state = LockBufHdr(buf);
+
+       if ((buf_state & BM_PIN_COUNT_WAITER) &&
+           BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+       {
+           /* we just released the last pin other than the waiter's */
+           int         wait_backend_pid = buf->wait_backend_pid;
+
+           buf_state &= ~BM_PIN_COUNT_WAITER;
+           UnlockBufHdr(buf, buf_state);
+           ProcSendSignal(wait_backend_pid);
+       }
+       else
+           UnlockBufHdr(buf, buf_state);
+   }
 }
 
 /*
@@ -4226,6 +5193,8 @@ AbortBufferIO(void)
 {
    BufferDesc *buf = InProgressBuf;
 
+   pgaio_at_abort();
+
    if (buf)
    {
        uint32      buf_state;
@@ -4259,7 +5228,7 @@ AbortBufferIO(void)
                pfree(path);
            }
        }
-       TerminateBufferIO(buf, false, BM_IO_ERROR);
+       TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ false, BM_IO_ERROR);
    }
 }
 
@@ -4490,6 +5459,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
 {
    PendingWriteback *pending;
 
+   if (io_data_direct)
+       return;
+
    /*
     * Add buffer to the pending writeback array, unless writeback control is
     * disabled.
@@ -4584,6 +5556,9 @@ IssuePendingWritebacks(WritebackContext *context)
    }
 
    context->nr_pending = 0;
+
+   if (i > 0)
+       pgaio_submit_pending(true);
 }
 
 
index eddd0782f298e6bbe486a6508196627b7b8d19e6..28e4b8155e79c9f4fb88ba5ed3aaf62be01c1f12 100644 (file)
@@ -117,6 +117,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
    bool        found;
    uint32      buf_state;
 
+   Assert(blockNum != P_NEW);
+
    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 
    /* Initialize local buffers if first request in this session */
index 931ed679307b223c61ff7da092a0de88b8560250..cbb10ded4ba3551fac7668cf416ee171dd2d74e6 100644 (file)
@@ -93,6 +93,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "portability/mem.h"
+#include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "utils/guc.h"
@@ -468,6 +469,15 @@ pg_flush_data(int fd, off_t offset, off_t nbytes)
     * We compile all alternatives that are supported on the current platform,
     * to find portability problems more easily.
     */
+#if USE_LIBURING
+   {
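+       /*
+        * With io_uring available, hand the flush-range request to the AIO
+        * subsystem instead of using one of the synchronous fallbacks below.
+        */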
+       PgAioInProgress *aio = pgaio_io_get();
+
+       pgaio_io_start_flush_range(aio, fd, offset, nbytes);
+       pgaio_io_release(aio);
+       return;
+   }
+#endif
 #if defined(HAVE_SYNC_FILE_RANGE)
    {
        int         rc;
@@ -1040,7 +1050,11 @@ tryAgain:
    fd = open(fileName, fileFlags, fileMode);
 
    if (fd >= 0)
+   {
+       //elog(DEBUG1, "opening file %s fd %d", fileName, fd);
+
        return fd;              /* success! */
+   }
 
    if (errno == EMFILE || errno == ENFILE)
    {
@@ -1184,6 +1198,8 @@ LruDelete(File file)
 
    vfdP = &VfdCache[file];
 
+   pgaio_submit_pending(false);
+
    /*
     * Close the file.  We aren't expecting this to fail; if it does, better
     * to leak the FD than to mess up our internal state.
@@ -1863,6 +1879,8 @@ FileClose(File file)
 
    if (!FileIsNotOpen(file))
    {
+       pgaio_submit_pending(false);
+
        /* close the file */
        if (close(vfdP->fd) != 0)
        {
@@ -1991,6 +2009,10 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
    if (nbytes <= 0)
        return;
 
+   /* no point in issuing a writeback hint for files opened with O_DIRECT */
+   if (VfdCache[file].fileFlags & O_DIRECT)
+       return;
+
    returnCode = FileAccess(file);
    if (returnCode < 0)
        return;
@@ -2056,6 +2078,30 @@ retry:
    return returnCode;
 }
 
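+/*
+ * Like FileRead(), but only starts an asynchronous read of the given block
+ * through the AIO subsystem.  Returns false if the file could not be
+ * (re)opened.
+ */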
+bool
+FileStartRead(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, int mode)
+{
+   int         returnCode;
+   Vfd        *vfdP;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileStartRead: %d (%s) " INT64_FORMAT " %d %p",
+              file, VfdCache[file].fileName,
+              (int64) offset,
+              amount, buffer));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return false;
+
+   vfdP = &VfdCache[file];
+
+   pgaio_io_start_read_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, mode);
+
+   return true;
+}
+
 int
 FileWrite(File file, char *buffer, int amount, off_t offset,
          uint32 wait_event_info)
@@ -2154,6 +2200,30 @@ retry:
    return returnCode;
 }
 
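+/*
+ * Counterpart of FileStartRead() for writes: starts an asynchronous write of
+ * the given block through the AIO subsystem.  Returns false if the file
+ * could not be (re)opened.
+ */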
+bool
+FileStartWrite(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, bool release_lock)
+{
+   int         returnCode;
+   Vfd        *vfdP;
+
+   Assert(FileIsValid(file));
+
+   DO_DB(elog(LOG, "FileStartWrite: %d (%s) " INT64_FORMAT " %d %p",
+              file, VfdCache[file].fileName,
+              (int64) offset,
+              amount, buffer));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return false;
+
+   vfdP = &VfdCache[file];
+
+   pgaio_io_start_write_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, release_lock);
+
+   return true;
+}
+
 int
 FileSync(File file, uint32 wait_event_info)
 {
@@ -2246,7 +2316,14 @@ FilePathName(File file)
 int
 FileGetRawDesc(File file)
 {
+   int         returnCode;
+
    Assert(FileIsValid(file));
+
+   returnCode = FileAccess(file);
+   if (returnCode < 0)
+       return returnCode;
+
    return VfdCache[file].fd;
 }
 
@@ -2527,6 +2604,7 @@ FreeDesc(AllocateDesc *desc)
            result = closedir(desc->desc.dir);
            break;
        case AllocateDescRawFD:
+           pgaio_submit_pending(false);
            result = close(desc->desc.fd);
            break;
        default:
@@ -2595,6 +2673,8 @@ CloseTransientFile(int fd)
    /* Only get here if someone passes us a file not in allocatedDescs */
    elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
 
+   pgaio_submit_pending(false);
+
    return close(fd);
 }
 
index df90c6b093fd7b826d34108946bf75e1640db2ae..ab0933028215085cdf77149763f52d27ebd18acd 100644 (file)
@@ -9,6 +9,8 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = \
+   aio.o \
+   aio_util.o \
    barrier.o \
    dsm.o \
    dsm_impl.o \
@@ -27,4 +29,7 @@ OBJS = \
    sinvaladt.o \
    standby.o
 
+aio.o: override CPPFLAGS += $(LIBURING_CFLAGS)
+aio.bc: override CPPFLAGS += $(LIBURING_CFLAGS)
+
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/ipc/aio.c b/src/backend/storage/ipc/aio.c
new file mode 100644 (file)
index 0000000..b7374ac
--- /dev/null
@@ -0,0 +1,4069 @@
+/*
+ * Big picture changes:
+ * - backend-local recyclable IOs
+ * - merging of IOs when submitting individual IOs, not when submitting all pending IOs
+ * - reorganization of shared callback system, so there's an underlying
+ *   "write" operation that's used both by WAL, generic, ...  writes.
+ * - Consider not exposing PgAioInProgress* at all, instead expose a PgAioReference { uint32 io; uint64 generation; }
+ *   which would make it a lot less problematic to immediately reuse IOs.
+ * - Shrink size of PgAioInProgress
+ * - refcount bounce buffers / redesign
+ * - get rid of the current backpressure logic
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <liburing.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "lib/ilist.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "nodes/memnodes.h"
+#include "pgstat.h"
+#include "access/xlog_internal.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/fmgrprotos.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+
+#define PGAIO_VERBOSE
+
+
+/*
+ * FIXME: This is just so large because merging happens when submitting
+ * pending requests, rather than when staging them.
+ */
+#define PGAIO_SUBMIT_BATCH_SIZE 256
+#define PGAIO_MAX_LOCAL_REAPED 128
+#define PGAIO_MAX_COMBINE 16
+
+#define PGAIO_NUM_CONTEXTS 8
+
+/*
+ * The type of AIO.
+ *
+ * To keep PgAioInProgress small, we ask the compiler to use only the minimal
+ * space for the enum. We could alternatively just use a uint8, but then we'd
+ * need casts in more places...
+ */
+typedef enum
+#ifdef pg_attribute_packed
+pg_attribute_packed()
+#endif
+   PgAioAction
+{
+   /* intentionally the zero value, to help catch zeroed memory etc */
+   PGAIO_INVALID = 0,
+
+   PGAIO_NOP,
+   /* FIXME: unify */
+   PGAIO_FSYNC,
+   PGAIO_FSYNC_WAL,
+   PGAIO_FLUSH_RANGE,
+
+   PGAIO_READ_BUFFER,
+   /* FIXME: unify */
+   PGAIO_WRITE_BUFFER,
+   PGAIO_WRITE_WAL,
+   PGAIO_WRITE_GENERIC,
+} PgAioAction;
+
+typedef enum PgAioInProgressFlags
+{
+   /* request in the ->unused list */
+   PGAIOIP_UNUSED = 1 << 0,
+
+   /*  */
+   PGAIOIP_IDLE = 1 << 1,
+
+   /*  */
+   PGAIOIP_IN_PROGRESS = 1 << 2,
+
+   /* waiting to be submitted to the kernel (on the ->pending list) */
+   PGAIOIP_PENDING = 1 << 3,
+
+   /* request in kernel */
+   PGAIOIP_INFLIGHT = 1 << 4,
+
+   /* request reaped */
+   PGAIOIP_REAPED = 1 << 5,
+
+   /* shared completion callback was called */
+   PGAIOIP_SHARED_CALLBACK_CALLED = 1 << 6,
+
+   /* completed */
+   PGAIOIP_DONE = 1 << 7,
+
+   PGAIOIP_FOREIGN_DONE = 1 << 8,
+
+   /* IO is merged with others */
+   PGAIOIP_MERGE = 1 << 9,
+
+   PGAIOIP_RETRY = 1 << 10,
+
+   /* request failed completely */
+   PGAIOIP_HARD_FAILURE = 1 << 11,
+
+   /* request failed partly, e.g. a short write */
+   PGAIOIP_SOFT_FAILURE = 1 << 12,
+
+   PGAIOIP_SHARED_FAILED = 1 << 13,
+
+   /* local completion callback was called */
+   PGAIOIP_LOCAL_CALLBACK_CALLED = 1 << 14,
+
+} PgAioInProgressFlags;
+
+/* IO completion callback */
+typedef bool (*PgAioCompletedCB)(PgAioInProgress *io);
+
+typedef uint16 PgAioIPFlags;
+
+struct PgAioInProgress
+{
+   /* PgAioAction, indexes PgAioCompletionCallbacks */
+   PgAioAction type;
+
+   PgAioIPFlags flags;
+
+   bool user_referenced;
+   bool system_referenced;
+
+   /* which AIO ring is this entry active for */
+   uint8 ring;
+
+   /* index into allProcs, or PG_UINT32_MAX for process local IO */
+   uint32 owner_id;
+
+   /* the IOs result, depends on operation. E.g. the length of a read */
+   int32 result;
+
+   /*
+    * Single callback that can be registered on an IO to be called upon
+    * completion. Note that this is reset whenever an IO is recycled.
+    */
+   PgAioOnCompletionLocalContext *on_completion_local;
+
+   /*
+    * Membership in one of
+    * PgAioCtl->unused,
+    * PgAioPerBackend->unused,
+    * PgAioPerBackend->outstanding,
+    * PgAioPerBackend->issued,
+    */
+   dlist_node owner_node;
+
+   /*
+    * Membership in
+    * PgAioPerBackend->pending,
+    * PgAioPerBackend->reaped,
+    * local_recycle_requests
+    * PgAioPerBackend->foreign_completed,
+    * PgAioPerBackend->local_completed
+    */
+   dlist_node io_node;
+
+   ConditionVariable cv;
+
+   /* index into context->iovec, or -1 */
+   int32 used_iovec;
+
+   PgAioBounceBuffer *bb;
+
+   PgAioInProgress *merge_with;
+
+   uint64 generation;
+
+   /*
+    * NB: fds in here may *not* be relied upon for re-issuing requests (e.g.
+    * for partial reads/writes) - the fd might be from another process, or
+    * have been closed since. This is not a problem for IOs that are still
+    * waiting to be issued, but only because the pending queue is flushed
+    * when an fd is closed.
+    */
+   union {
+       struct
+       {
+           int fd;
+           bool barrier;
+           bool datasync;
+       } fsync;
+
+       struct
+       {
+           int fd;
+           bool barrier;
+           bool datasync;
+           uint32 flush_no;
+       } fsync_wal;
+
+       struct
+       {
+           int fd;
+           uint64 nbytes;
+           uint32 offset;
+       } flush_range;
+
+       struct
+       {
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           int fd;
+           char *bufdata;
+           Buffer buf;
+           AioBufferTag tag;
+           int mode;
+       } read_buffer;
+
+       struct
+       {
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           int fd;
+           char *bufdata;
+           Buffer buf;
+           bool release_lock;
+           AioBufferTag tag;
+       } write_buffer;
+
+       struct
+       {
+           int fd;
+           uint32 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           char *bufdata;
+           bool no_reorder;
+           uint32 write_no;
+       } write_wal;
+
+       struct
+       {
+           int fd;
+           uint64 offset;
+           uint32 nbytes;
+           uint32 already_done;
+           char *bufdata;
+           bool no_reorder;
+       } write_generic;
+   } d;
+};
+
+/* typedef in header */
+struct PgAioBounceBuffer
+{
+   pg_atomic_uint32 refcount;
+   dlist_node node;
+   char *buffer;
+};
+
+/*
+ * An iovec that can represent the biggest possible iovec (due to combining)
+ * we may need for a single IO submission.
+ */
+typedef struct PgAioIovec
+{
+   slist_node node;
+   struct iovec iovec[PGAIO_MAX_COMBINE];
+} PgAioIovec;
+
+
+/*
+ * XXX: Really want a proclist-like structure that works with integer
+ * offsets. Given the limited number of IOs ever existing, using full pointers
+ * is completely unnecessary.
+ */
+
+typedef struct PgAioPerBackend
+{
+   uint32 last_context;
+
+   /*
+    * Local unused IOs. There's only a limited number of these. Used to
+    * reduce overhead of the central unused list.
+    *
+    * FIXME: Actually use.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head unused;
+   uint32 unused_count;
+
+   /*
+    * IOs handed out to code within the backend.
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head outstanding;
+   uint32 outstanding_count;
+
+   /*
+    * Requests waiting to be issued to the kernel. They are submitted to the
+    * kernel in batches, for efficiency (local merging of IOs, and better
+    * kernel side queue processing).
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head pending;
+   uint32 pending_count;
+
+   /*
+    * Requests issued by this backend that have not completed yet (but may
+    * be foreign_completed) and are still referenced by backend code (see
+    * issued_abandoned for the others).
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head issued;
+   uint32 issued_count;
+
+   /*
+    * Requests issued by this backend that have not completed yet (but may
+    * be foreign_completed) and that are no longer referenced by backend
+    * code (see issued for the others).
+    *
+    * PgAioInProgress->owner_node
+    */
+   dlist_head issued_abandoned;
+   uint32 issued_abandoned_count;
+
+   /*
+    * Number of PgAioInProgress entries that have been issued to the ring
+    * buffer and whose completions have not been processed yet (the kernel
+    * may already have completed them).
+    */
+   pg_atomic_uint32 inflight_count;
+
+   /*
+    * Requests where we've received a kernel completion, but haven't yet
+    * processed them.  This is needed to handle failing callbacks.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head reaped;
+
+   /*
+    * IOs that were completed, but not yet recycled.
+    *
+    * PgAioInProgress->io_node
+    */
+   dlist_head local_completed;
+   uint32 local_completed_count;
+
+   /*
+    * IOs where the completion was received in another backend.
+    *
+    * Could be singly linked list.
+    *
+    * PgAioInProgress->io_node
+    */
+   slock_t foreign_completed_lock;
+   uint32 foreign_completed_count;
+   dlist_head foreign_completed;
+
+   /*
+    * Stats.
+    */
+   uint64 executed_total_count; /* un-merged */
+   uint64 issued_total_count; /* merged */
+   uint64 submissions_total_count; /* number of submission syscalls */
+   uint64 foreign_completed_total_count;
+   uint64 retry_total_count;
+
+} PgAioPerBackend;
+
+typedef struct PgAioContext
+{
+   LWLock submission_lock;
+   LWLock completion_lock;
+
+   struct io_uring io_uring_ring;
+
+   /*
+    * For many io_uring versions the iovecs need to be in shared memory. The
+    * lists of available iovecs are split between the submission and
+    * completion locks - that avoids additional lock acquisitions in the
+    * common cases.
+    */
+   PgAioIovec *iovecs;
+
+   /* locked by submission lock */
+   slist_head unused_iovecs;
+   uint32 unused_iovecs_count;
+
+   /* locked by completion lock */
+   slist_head reaped_iovecs;
+   uint32 reaped_iovecs_count;
+
+   /* XXX: probably worth padding to a cacheline boundary here */
+} PgAioContext;
+
+typedef struct PgAioCtl
+{
+   /* PgAioInProgress that are not used */
+   dlist_head unused_ios;
+
+   /*
+    * Number of PgAioInProgress entries that are in use. This includes
+    * pending requests, as well as requests actually issued to the queue.
+    *
+    * Protected by SharedAIOCtlLock.
+    */
+   uint32 used_count;
+
+   /*
+    * Protected by SharedAIOCtlLock.
+    */
+   dlist_head reaped_uncompleted;
+
+   PgAioBounceBuffer *bounce_buffers;
+   dlist_head unused_bounce_buffers;
+   uint32 unused_bounce_buffers_count;
+
+   int backend_state_count;
+   PgAioPerBackend *backend_state;
+
+   uint32 num_contexts;
+   PgAioContext *contexts;
+
+   PgAioInProgress in_progress_io[FLEXIBLE_ARRAY_MEMBER];
+} PgAioCtl;
+
+/* general pgaio helper functions */
+static void pgaio_complete_ios(bool in_error);
+static void pgaio_apply_backend_limit(void);
+static void pgaio_prepare_io(PgAioInProgress *io, PgAioAction action);
+static void pgaio_finish_io(PgAioInProgress *io);
+static void pgaio_bounce_buffer_release_internal(PgAioBounceBuffer *bb, bool holding_lock, bool release_resowner);
+static void pgaio_io_ref_internal(PgAioInProgress *io, PgAioIoRef *ref);
+
+/* io_uring related functions */
+static int pgaio_uring_submit(int max_submit, bool drain);
+static int pgaio_uring_drain(PgAioContext *context);
+static void pgaio_uring_wait_one(PgAioContext *context, PgAioInProgress *io, uint64 generation, uint32 wait_event_info);
+
+static void pgaio_uring_sq_from_io(PgAioContext *context, PgAioInProgress *io, struct io_uring_sqe *sqe);
+static void pgaio_uring_io_from_cqe(PgAioContext *context, struct io_uring_cqe *cqe);
+static void pgaio_uring_iovec_transfer(PgAioContext *context);
+
+static int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+                               unsigned flags, sigset_t *sig);
+
+/* io completions */
+static bool pgaio_complete_nop(PgAioInProgress *io);
+static bool pgaio_complete_fsync(PgAioInProgress *io);
+static bool pgaio_complete_fsync_wal(PgAioInProgress *io);
+static bool pgaio_complete_flush_range(PgAioInProgress *io);
+static bool pgaio_complete_read_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_wal(PgAioInProgress *io);
+static bool pgaio_complete_write_generic(PgAioInProgress *io);
+
+
+static MemoryContext aio_retry_context;
+
+/*
+ * To support EXEC_BACKEND environments, where we cannot rely on callback
+ * addresses being equivalent across processes, completion actions are just
+ * indices into a process-local array of callbacks, indexed by the type of
+ * action.  This also makes the shared memory entries a bit smaller, but
+ * that's not a huge win.
+ */
+static const PgAioCompletedCB completion_callbacks[] =
+{
+   [PGAIO_NOP] = pgaio_complete_nop,
+   [PGAIO_FSYNC] = pgaio_complete_fsync,
+   [PGAIO_FSYNC_WAL] = pgaio_complete_fsync_wal,
+   [PGAIO_FLUSH_RANGE] = pgaio_complete_flush_range,
+   [PGAIO_READ_BUFFER] = pgaio_complete_read_buffer,
+   [PGAIO_WRITE_BUFFER] = pgaio_complete_write_buffer,
+   [PGAIO_WRITE_WAL] = pgaio_complete_write_wal,
+   [PGAIO_WRITE_GENERIC] = pgaio_complete_write_generic,
+};
+
+
+/* (future) GUC controlling global MAX number of in-progress IO entries */
+/* FIXME: find a good naming pattern */
+extern int max_aio_in_progress;
+/* FIXME: this is per context right now */
+extern int max_aio_in_flight;
+extern int max_aio_bounce_buffers;
+
+/* max per backend concurrency */
+extern int io_max_concurrency;
+
+int max_aio_in_progress = 32768; /* XXX: Multiple of MaxBackends instead? */
+int max_aio_in_flight = 4096;
+int max_aio_bounce_buffers = 1024;
+int io_max_concurrency = 128;
+
+/* global list of in-progress IO */
+static PgAioCtl *aio_ctl;
+
+/* current backend's per-backend-state */
+static PgAioPerBackend *my_aio;
+static int my_aio_id;
+
+/* FIXME: move into PgAioPerBackend / subsume into ->reaped */
+static dlist_head local_recycle_requests;
+
+
+/* io_uring local state */
+struct io_uring local_ring;
+
+static Size
+AioCtlShmemSize(void)
+{
+   Size        sz;
+
+   /* aio_ctl itself */
+   sz = offsetof(PgAioCtl, in_progress_io);
+
+   /* ios */
+   sz = add_size(sz, mul_size(max_aio_in_progress, sizeof(PgAioInProgress)));
+
+   return sz;
+}
+
+static Size
+AioCtlBackendShmemSize(void)
+{
+   uint32      TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+   return mul_size(TotalProcs, sizeof(PgAioPerBackend));
+}
+
+static Size
+AioBounceShmemSize(void)
+{
+   Size        sz;
+
+   /* PgAioBounceBuffer itself */
+   sz = mul_size(sizeof(PgAioBounceBuffer), max_aio_bounce_buffers);
+
+   /* and the associated buffer */
+   sz = add_size(sz,
+                 mul_size(BLCKSZ, add_size(max_aio_bounce_buffers, 1)));
+
+   return sz;
+}
+
+static Size
+AioContextShmemSize(void)
+{
+   return mul_size(PGAIO_NUM_CONTEXTS, sizeof(PgAioContext));
+}
+
+static Size
+AioContextIovecsShmemSize(void)
+{
+   return mul_size(PGAIO_NUM_CONTEXTS,
+                   mul_size(sizeof(PgAioIovec), max_aio_in_flight));
+}
+
+Size
+AioShmemSize(void)
+{
+   Size        sz = 0;
+
+   sz = add_size(sz, AioCtlShmemSize());
+   sz = add_size(sz, AioBounceShmemSize());
+   sz = add_size(sz, AioCtlBackendShmemSize());
+   sz = add_size(sz, AioContextShmemSize());
+   sz = add_size(sz, AioContextIovecsShmemSize());
+
+   return sz;
+}
+
+void
+AioShmemInit(void)
+{
+   bool        found;
+   uint32      TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+   aio_ctl = (PgAioCtl *)
+       ShmemInitStruct("PgAio", AioCtlShmemSize(), &found);
+
+   if (!found)
+   {
+       memset(aio_ctl, 0, AioCtlShmemSize());
+
+       dlist_init(&aio_ctl->unused_ios);
+       dlist_init(&aio_ctl->reaped_uncompleted);
+
+       for (int i = 0; i < max_aio_in_progress; i++)
+       {
+           PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+
+           ConditionVariableInit(&io->cv);
+           dlist_push_tail(&aio_ctl->unused_ios, &io->owner_node);
+           io->flags = PGAIOIP_UNUSED;
+           io->system_referenced = true;
+           io->generation = 1;
+       }
+
+       aio_ctl->backend_state_count = TotalProcs;
+       aio_ctl->backend_state = (PgAioPerBackend *)
+           ShmemInitStruct("PgAioBackend", AioCtlBackendShmemSize(), &found);
+       memset(aio_ctl->backend_state, 0, AioCtlBackendShmemSize());
+
+       for (int procno = 0; procno < TotalProcs; procno++)
+       {
+           PgAioPerBackend *bs = &aio_ctl->backend_state[procno];
+
+           dlist_init(&bs->unused);
+           dlist_init(&bs->outstanding);
+           dlist_init(&bs->pending);
+           dlist_init(&bs->issued);
+           dlist_init(&bs->issued_abandoned);
+           pg_atomic_init_u32(&bs->inflight_count, 0);
+           dlist_init(&bs->reaped);
+
+           dlist_init(&bs->foreign_completed);
+           SpinLockInit(&bs->foreign_completed_lock);
+       }
+
+       {
+           char *p;
+           char *blocks;
+
+           dlist_init(&aio_ctl->unused_bounce_buffers);
+           aio_ctl->bounce_buffers =
+               ShmemInitStruct("PgAioBounceBuffers",
+                               sizeof(PgAioBounceBuffer) * max_aio_bounce_buffers,
+                               &found);
+           Assert(!found);
+
+           p = ShmemInitStruct("PgAioBounceBufferBlocks",
+                               BLCKSZ * (max_aio_bounce_buffers + 1),
+                               &found);
+           Assert(!found);
+           blocks = (char *) TYPEALIGN(BLCKSZ, (uintptr_t) p);
+
+           for (int i = 0; i < max_aio_bounce_buffers; i++)
+           {
+               PgAioBounceBuffer *bb = &aio_ctl->bounce_buffers[i];
+
+               bb->buffer = blocks + i * BLCKSZ;
+               memset(bb->buffer, 0, BLCKSZ);
+               pg_atomic_init_u32(&bb->refcount, 0);
+               dlist_push_tail(&aio_ctl->unused_bounce_buffers, &bb->node);
+               aio_ctl->unused_bounce_buffers_count++;
+           }
+       }
+
+       {
+           PgAioIovec *iovecs;
+
+           aio_ctl->num_contexts = PGAIO_NUM_CONTEXTS;
+           aio_ctl->contexts = ShmemInitStruct("PgAioContexts", AioContextShmemSize(), &found);
+           Assert(!found);
+
+           iovecs = (PgAioIovec *)
+           ShmemInitStruct("PgAioContextsIovecs", AioContextIovecsShmemSize(), &found);
+           Assert(!found);
+           memset(iovecs, 0, AioContextIovecsShmemSize());
+
+           for (int contextno = 0; contextno < aio_ctl->num_contexts; contextno++)
+           {
+               PgAioContext *context = &aio_ctl->contexts[contextno];
+               int ret;
+
+               LWLockInitialize(&context->submission_lock, LWTRANCHE_AIO_CONTEXT_SUBMISSION);
+               LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_CONTEXT_COMPLETION);
+
+               slist_init(&context->unused_iovecs);
+               slist_init(&context->reaped_iovecs);
+
+               context->iovecs = iovecs;
+               iovecs += max_aio_in_flight;
+
+               for (uint32 i = 0; i < max_aio_in_flight; i++)
+               {
+                   slist_push_head(&context->unused_iovecs, &context->iovecs[i].node);
+                   context->unused_iovecs_count++;
+               }
+
+               /*
+                * XXX: Probably worth sharing the WQ between the different
+                * rings, when supported by the kernel. Could also cause
+                * additional contention, I guess?
+                */
+               if (!AcquireExternalFD())
+                   elog(ERROR, "io_uring_queue_init: %m");
+               ret = io_uring_queue_init(max_aio_in_flight, &context->io_uring_ring, 0);
+               if (ret < 0)
+                   elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+           }
+       }
+
+   }
+}
+
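+/*
+ * Initialize process-local AIO state in the postmaster.
+ */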
+void
+pgaio_postmaster_init(void)
+{
+   /* FIXME: should also be allowed to use AIO */
+   dlist_init(&local_recycle_requests);
+
+   /* XXX: could create a local queue here. */
+
+   /*
+    * Need to be allowed to re-open files during retries. Those can happen,
+    * e.g. when fsyncing WAL, within a critical section. Reopening files
+    * currently requires memory. So create a context with a small reservation
+    * that's allowed to be used within a critical section.
+    */
+   aio_retry_context = AllocSetContextCreate(TopMemoryContext,
+                                             "aio retry context",
+                                             1024,
+                                             1024,
+                                             1024);
+   MemoryContextAllowInCriticalSection(aio_retry_context, true);
+}
+
+void
+pgaio_postmaster_child_init_local(void)
+{
+   /*
+    * Set up a small process-local io_uring queue for this backend.
+    */
+   {
+       int ret;
+
+       ret = io_uring_queue_init(32, &local_ring, 0);
+       if (ret < 0)
+       {
+           elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+       }
+   }
+}
+
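+/*
+ * before_shmem_exit() callback: wait for this backend's issued IOs before
+ * detaching from shared memory.
+ */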
+static void
+pgaio_postmaster_before_child_exit(int code, Datum arg)
+{
+   elog(DEBUG2, "aio before shmem exit: start");
+
+   /*
+    * Need to wait for in-progress IOs initiated by this backend to
+    * finish. Some operating systems, like Linux with io_uring, cancel IOs
+    * that are still in progress when the process exits. Others don't provide
+    * access to the results of such IOs.
+    */
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       pgaio_io_release(io);
+   }
+
+   Assert(my_aio->issued_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued));
+
+   while (!dlist_is_empty(&my_aio->issued_abandoned))
+   {
+       PgAioInProgress *io = NULL;
+       PgAioIoRef ref;
+
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+       if (!dlist_is_empty(&my_aio->issued_abandoned))
+       {
+           io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued_abandoned);
+           pgaio_io_ref_internal(io, &ref);
+       }
+       LWLockRelease(SharedAIOCtlLock);
+
+       if (!io)
+       {
+           elog(LOG, "skipped exit wait for abandoned IO %zu", io - aio_ctl->in_progress_io);
+           break;
+       }
+
+       elog(LOG, "exit wait for abandoned IO %zu", io - aio_ctl->in_progress_io);
+       pgaio_io_print(io, NULL);
+       pgaio_io_wait_ref(&ref, false);
+   }
+
+   elog(DEBUG2, "aio before shmem exit: end");
+}
+
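+/*
+ * on_shmem_exit() callback: check that this backend left no AIO state
+ * behind.
+ */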
+static void
+pgaio_postmaster_child_exit(int code, Datum arg)
+{
+   /* FIXME: handle unused */
+   Assert(my_aio->outstanding_count == 0);
+   Assert(dlist_is_empty(&my_aio->outstanding));
+
+   Assert(my_aio->pending_count == 0);
+   Assert(dlist_is_empty(&my_aio->pending));
+
+   Assert(my_aio->issued_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued));
+
+   Assert(my_aio->issued_abandoned_count == 0);
+   Assert(dlist_is_empty(&my_aio->issued_abandoned));
+
+   Assert(pg_atomic_read_u32(&my_aio->inflight_count) == 0);
+
+   Assert(dlist_is_empty(&my_aio->reaped));
+
+   Assert(my_aio->local_completed_count == 0);
+   Assert(dlist_is_empty(&my_aio->local_completed));
+
+   Assert(my_aio->foreign_completed_count == 0);
+   Assert(dlist_is_empty(&my_aio->foreign_completed));
+}
+
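+/*
+ * Initialize per-backend AIO state: attach to the shared backend slot and
+ * register the exit callbacks.
+ */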
+void
+pgaio_postmaster_child_init(void)
+{
+   /* no locking needed here, only affects this process */
+   for (int i = 0; i < aio_ctl->num_contexts; i++)
+       io_uring_ring_dontfork(&aio_ctl->contexts[i].io_uring_ring);
+
+   my_aio_id = MyProc->pgprocno;
+   my_aio = &aio_ctl->backend_state[my_aio_id];
+
+   dlist_init(&local_recycle_requests);
+
+   before_shmem_exit(pgaio_postmaster_before_child_exit, 0);
+   on_shmem_exit(pgaio_postmaster_child_exit, 0);
+
+   Assert(my_aio->unused_count == 0);
+   Assert(my_aio->outstanding_count == 0);
+   Assert(my_aio->issued_count == 0);
+   Assert(my_aio->issued_abandoned_count == 0);
+   Assert(my_aio->pending_count == 0);
+   Assert(my_aio->local_completed_count == 0);
+   Assert(my_aio->foreign_completed_count == 0);
+
+   /* try to spread out a bit from the start */
+   my_aio->last_context = MyProcPid % PGAIO_NUM_CONTEXTS;
+
+   my_aio->executed_total_count = 0;
+   my_aio->issued_total_count = 0;
+   my_aio->submissions_total_count = 0;
+   my_aio->foreign_completed_total_count = 0;
+   my_aio->retry_total_count = 0;
+}
+
+void
+pgaio_at_abort(void)
+{
+   pgaio_complete_ios(/* in_error = */ true);
+
+   pgaio_submit_pending(false);
+
+   while (!dlist_is_empty(&my_aio->outstanding))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+       pgaio_io_release(io);
+   }
+
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       pgaio_io_release(io);
+   }
+}
+
+void
+pgaio_at_commit(void)
+{
+   Assert(dlist_is_empty(&local_recycle_requests));
+
+   if (my_aio->pending_count != 0)
+   {
+       elog(WARNING, "unsubmitted IOs %d", my_aio->pending_count);
+       pgaio_submit_pending(false);
+   }
+
+   while (!dlist_is_empty(&my_aio->outstanding))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+       elog(WARNING, "leaked outstanding io %zu", io - aio_ctl->in_progress_io);
+
+       pgaio_io_release(io);
+   }
+
+   while (!dlist_is_empty(&my_aio->issued))
+   {
+       PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+       elog(WARNING, "leaked issued io %zu", io - aio_ctl->in_progress_io);
+
+       pgaio_io_release(io);
+   }
+}
+
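+/*
+ * Split a merged IO back into its constituents, distributing the combined
+ * result across them, and insert the extracted IOs into the reaped list
+ * behind the head. Returns the number of extracted IOs.
+ */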
+static int
+pgaio_uncombine_one(PgAioInProgress *io)
+{
+   int orig_result = io->result;
+   int running_result = orig_result;
+   PgAioInProgress *cur = io;
+   PgAioInProgress *last = NULL;
+   int extracted = 0;
+
+   while (cur)
+   {
+       PgAioInProgress *next = cur->merge_with;
+
+       Assert(!(cur->flags & PGAIOIP_SHARED_CALLBACK_CALLED));
+       Assert(cur->merge_with || cur != io);
+
+       switch (cur->type)
+       {
+           case PGAIO_READ_BUFFER:
+               Assert(cur->d.read_buffer.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.read_buffer.nbytes)
+               {
+                   cur->result = cur->d.read_buffer.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.read_buffer.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+
+               break;
+
+           case PGAIO_WRITE_BUFFER:
+               Assert(cur->d.write_buffer.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_buffer.nbytes)
+               {
+                   cur->result = cur->d.write_buffer.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_buffer.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           case PGAIO_WRITE_WAL:
+               Assert(cur->d.write_wal.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_wal.nbytes)
+               {
+                   cur->result = cur->d.write_wal.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_wal.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           case PGAIO_WRITE_GENERIC:
+               Assert(cur->d.write_generic.already_done == 0);
+
+               if (orig_result < 0)
+               {
+                   cur->result = io->result;
+               }
+               else if (running_result >= cur->d.write_generic.nbytes)
+               {
+                   cur->result = cur->d.write_generic.nbytes;
+                   running_result -= cur->result;
+               }
+               else if (running_result < cur->d.write_generic.nbytes)
+               {
+                   cur->result = running_result;
+                   running_result = 0;
+               }
+               break;
+
+           default:
+               elog(PANIC, "merge for %d not supported yet", cur->type);
+       }
+
+       cur->merge_with = NULL;
+
+       if (last)
+       {
+           cur->flags =
+               (cur->flags & ~(PGAIOIP_INFLIGHT |
+                               PGAIOIP_MERGE)) |
+               PGAIOIP_REAPED;
+
+           Assert(dlist_is_member(&my_aio->reaped, &last->io_node));
+           dlist_insert_after(&last->io_node, &cur->io_node);
+           extracted++;
+       }
+       else
+       {
+           cur->flags &= ~PGAIOIP_MERGE;
+       }
+
+       last = cur;
+       cur = next;
+   }
+
+   return extracted;
+}
+
+static void
+pgaio_uncombine(void)
+{
+   dlist_mutable_iter iter;
+
+   /* "unmerge" merged IOs, so they can be treated uniformly */
+   dlist_foreach_modify(iter, &my_aio->reaped)
+   {
+       PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, iter.cur);
+       uint32 extracted = 1;
+
+       if (io->flags & PGAIOIP_MERGE)
+           extracted += pgaio_uncombine_one(io);
+
+       pg_atomic_fetch_sub_u32(&aio_ctl->backend_state[io->owner_id].inflight_count, 1);
+   }
+}
+
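+/*
+ * Run the shared completion callbacks for all reaped IOs and then, under
+ * SharedAIOCtlLock, either hand the completed IOs back to their owners or
+ * return abandoned ones to the unused list.
+ */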
+static void  __attribute__((noinline))
+pgaio_complete_ios(bool in_error)
+{
+   int pending_count_before = my_aio->pending_count;
+
+   Assert(!LWLockHeldByMe(SharedAIOCtlLock));
+
+   /* call all callbacks, without holding lock */
+   while (!dlist_is_empty(&my_aio->reaped))
+   {
+       dlist_node *node = dlist_head_node(&my_aio->reaped);
+       PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+       Assert(dlist_is_member(&my_aio->reaped, &io->io_node));
+
+       Assert(node != NULL);
+
+       if (!(io->flags & PGAIOIP_SHARED_CALLBACK_CALLED))
+       {
+           PgAioCompletedCB cb;
+           bool finished;
+
+           /*
+            * Set flag before calling callback, otherwise we could easily end
+            * up looping forever.
+            */
+           *(volatile PgAioIPFlags*) &io->flags |= PGAIOIP_SHARED_CALLBACK_CALLED;
+
+           cb = completion_callbacks[io->type];
+           finished = cb(io);
+
+           dlist_delete_from(&my_aio->reaped, node);
+
+           if (finished)
+           {
+               dlist_push_tail(&local_recycle_requests, &io->io_node);
+           }
+           else
+           {
+               Assert((*(volatile PgAioIPFlags*) &io->flags) & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE));
+
+               LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+               *(volatile PgAioIPFlags*) &io->flags =
+                   (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                   PGAIOIP_DONE |
+                   PGAIOIP_SHARED_FAILED;
+               dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+               LWLockRelease(SharedAIOCtlLock);
+
+               /* signal state change */
+               if (IsUnderPostmaster)
+                   ConditionVariableBroadcast(&io->cv);
+           }
+       }
+       else
+       {
+           Assert(in_error);
+
+           dlist_delete_from(&my_aio->reaped, node);
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           *(volatile PgAioIPFlags*) &io->flags =
+               (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+               PGAIOIP_DONE |
+               PGAIOIP_HARD_FAILURE |
+               PGAIOIP_SHARED_FAILED;
+           dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+           LWLockRelease(SharedAIOCtlLock);
+       }
+   }
+
+   /* if callbacks re-queued any IOs (e.g. partially done ones), submit them */
+   if (pending_count_before != my_aio->pending_count)
+       pgaio_submit_pending(false);
+
+   /*
+    * Next, under lock, process all the recycle requests queued up above.
+    * This entails releasing the "system" reference on each IO and either
+    * handing it back to its owner or returning it to the unused list.
+    */
+   START_CRIT_SECTION();
+
+   while (!dlist_is_empty(&local_recycle_requests))
+   {
+       dlist_mutable_iter iter;
+       PgAioInProgress* signal_ios[32];
+       int to_signal = 0;
+
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+       dlist_foreach_modify(iter, &local_recycle_requests)
+       {
+           PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+           dlist_delete_from(&local_recycle_requests, iter.cur);
+           signal_ios[to_signal++] = cur;
+
+           Assert(cur->system_referenced);
+           Assert(cur->flags & PGAIOIP_REAPED);
+           Assert(!(cur->flags & PGAIOIP_DONE));
+           Assert(!(cur->flags & PGAIOIP_INFLIGHT));
+           Assert(!(cur->flags & PGAIOIP_MERGE));
+           Assert(!(cur->flags & (PGAIOIP_SHARED_FAILED)));
+           Assert(!(cur->flags & (PGAIOIP_SOFT_FAILURE)));
+           Assert(cur->merge_with == NULL);
+
+           if (cur->user_referenced)
+           {
+               cur->system_referenced = false;
+
+               if (cur->owner_id != my_aio_id)
+               {
+                   PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+                   SpinLockAcquire(&other->foreign_completed_lock);
+
+                   dlist_push_tail(&other->foreign_completed, &cur->io_node);
+                   other->foreign_completed_count++;
+                   other->foreign_completed_total_count++;
+
+                   pg_write_barrier();
+
+                   *(volatile PgAioIPFlags*) &cur->flags =
+                       (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                       PGAIOIP_DONE |
+                       PGAIOIP_FOREIGN_DONE;
+
+                   SpinLockRelease(&other->foreign_completed_lock);
+               }
+               else
+               {
+                   *(volatile PgAioIPFlags*) &cur->flags =
+                       (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+                       PGAIOIP_DONE;
+
+                   dlist_push_tail(&my_aio->local_completed, &cur->io_node);
+                   my_aio->local_completed_count++;
+               }
+           }
+           else
+           {
+               PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+#ifdef PGAIO_VERBOSE
+               ereport(DEBUG4,
+                       errmsg("removing aio %zu from issued_abandoned complete_ios",
+                              cur - aio_ctl->in_progress_io),
+                       errhidecontext(1),
+                       errhidestmt(1));
+#endif
+
+               dlist_delete_from(&other->issued_abandoned, &cur->owner_node);
+               Assert(other->issued_abandoned_count > 0);
+               other->issued_abandoned_count--;
+
+               cur->generation++;
+               pg_write_barrier();
+
+               cur->flags = PGAIOIP_UNUSED;
+
+               if (cur->bb)
+               {
+                   pgaio_bounce_buffer_release_internal(cur->bb,
+                                                        /* holding_lock = */ true,
+                                                        /* release_resowner = */ false);
+                   cur->bb = NULL;
+               }
+
+               cur->type = 0;
+               cur->owner_id = INVALID_PGPROCNO;
+               cur->result = 0;
+               cur->system_referenced = true;
+               cur->on_completion_local = NULL;
+
+               dlist_push_head(&aio_ctl->unused_ios, &cur->owner_node);
+               aio_ctl->used_count--;
+           }
+
+           if (to_signal >= lengthof(signal_ios))
+               break;
+       }
+       LWLockRelease(SharedAIOCtlLock);
+
+       if (IsUnderPostmaster)
+       {
+           for (int i = 0; i < to_signal; i++)
+           {
+               ConditionVariableBroadcast(&signal_ios[i]->cv);
+           }
+       }
+   }
+
+   END_CRIT_SECTION();
+}
+
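+/*
+ * Move a completed IO from the issued list back to outstanding and invoke
+ * its local completion callback, if one is registered.
+ */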
+static void
+pgaio_io_call_local_callback(PgAioInProgress *io, bool in_error)
+{
+   Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+   Assert(io->user_referenced);
+
+   dlist_delete_from(&my_aio->issued, &io->owner_node);
+   my_aio->issued_count--;
+   dlist_push_tail(&my_aio->outstanding, &io->owner_node);
+   my_aio->outstanding_count++;
+
+   io->flags |= PGAIOIP_LOCAL_CALLBACK_CALLED;
+
+   if (!io->on_completion_local)
+       return;
+
+   if (!in_error)
+       io->on_completion_local->callback(io->on_completion_local, io);
+}
+
+/*
+ * Call all pending local callbacks.
+ */
+static void
+pgaio_call_local_callbacks(bool in_error)
+{
+   if (my_aio->local_completed_count != 0 &&
+       CritSectionCount == 0)
+   {
+       /* FIXME: this isn't safe against errors */
+       static int local_callback_depth = 0;
+
+       if (local_callback_depth == 0)
+       {
+           local_callback_depth++;
+
+           while (!dlist_is_empty(&my_aio->local_completed))
+           {
+               dlist_node *node = dlist_pop_head_node(&my_aio->local_completed);
+               PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+               Assert(my_aio->local_completed_count > 0);
+               my_aio->local_completed_count--;
+
+               pgaio_io_call_local_callback(io, in_error);
+           }
+
+           local_callback_depth--;
+       }
+   }
+}
+
+/*
+ * Receive completions from the context's ring, transfer completions that
+ * other backends reaped on our behalf onto the local queue, and optionally
+ * run the local completion callbacks.
+ */
+static int  __attribute__((noinline))
+pgaio_drain(PgAioContext *context, bool in_error, bool call_local)
+{
+   int ndrained = 0;
+
+   ndrained = pgaio_uring_drain(context);
+
+   /*
+    * Transfer all the foreign completions into the local queue.
+    */
+   if (my_aio->foreign_completed_count != 0)
+   {
+       SpinLockAcquire(&my_aio->foreign_completed_lock);
+
+       while (!dlist_is_empty(&my_aio->foreign_completed))
+       {
+           dlist_node *node = dlist_pop_head_node(&my_aio->foreign_completed);
+           PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+           Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+
+           dlist_push_tail(&my_aio->local_completed, &io->io_node);
+           io->flags &= ~PGAIOIP_FOREIGN_DONE;
+           my_aio->foreign_completed_count--;
+           my_aio->local_completed_count++;
+       }
+       SpinLockRelease(&my_aio->foreign_completed_lock);
+   }
+
+   /*
+    * Call all pending local callbacks.
+    */
+   if (call_local && CritSectionCount == 0)
+       pgaio_call_local_callbacks(in_error);
+
+   return ndrained;
+}
+
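+/*
+ * Can these two pending IOs be merged into one larger request? Requires the
+ * same type and file, adjacent offsets, and neither being a retry or
+ * partially done.
+ */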
+static bool
+pgaio_can_be_combined(PgAioInProgress *last, PgAioInProgress *cur)
+{
+   if (last->type != cur->type)
+       return false;
+
+   if (last->flags & PGAIOIP_RETRY ||
+       cur->flags & PGAIOIP_RETRY)
+       return false;
+
+   switch (last->type)
+   {
+       case PGAIO_INVALID:
+           elog(ERROR, "unexpected");
+           break;
+
+       case PGAIO_READ_BUFFER:
+           if (last->d.read_buffer.fd != cur->d.read_buffer.fd)
+               return false;
+           if ((last->d.read_buffer.offset + last->d.read_buffer.nbytes) != cur->d.read_buffer.offset)
+               return false;
+           if (last->d.read_buffer.mode != cur->d.read_buffer.mode)
+               return false;
+           if (last->d.read_buffer.already_done != 0 || cur->d.read_buffer.already_done != 0)
+               return false;
+
+           return true;
+
+       case PGAIO_NOP:
+       case PGAIO_FLUSH_RANGE:
+       case PGAIO_FSYNC:
+       case PGAIO_FSYNC_WAL:
+           return false;
+
+       case PGAIO_WRITE_BUFFER:
+           if (last->d.write_buffer.fd != cur->d.write_buffer.fd)
+               return false;
+           if ((last->d.write_buffer.offset + last->d.write_buffer.nbytes) != cur->d.write_buffer.offset)
+               return false;
+           if (last->d.write_buffer.already_done != 0 || cur->d.write_buffer.already_done != 0)
+               return false;
+           return true;
+
+       case PGAIO_WRITE_WAL:
+           if (last->d.write_wal.fd != cur->d.write_wal.fd)
+               return false;
+           if ((last->d.write_wal.offset + last->d.write_wal.nbytes) != cur->d.write_wal.offset)
+               return false;
+           if (last->d.write_wal.already_done != 0 || cur->d.write_wal.already_done != 0)
+               return false;
+           if (last->d.write_wal.no_reorder || cur->d.write_wal.no_reorder)
+               return false;
+           return true;
+
+       case PGAIO_WRITE_GENERIC:
+           if (last->d.write_generic.fd != cur->d.write_generic.fd)
+               return false;
+           if ((last->d.write_generic.offset + last->d.write_generic.nbytes) != cur->d.write_generic.offset)
+               return false;
+           if (last->d.write_generic.already_done != 0 || cur->d.write_generic.already_done != 0)
+               return false;
+           if (last->d.write_generic.no_reorder || cur->d.write_generic.no_reorder)
+               return false;
+           return true;
+   }
+
+   pg_unreachable();
+}
+
+static void
+pgaio_io_merge(PgAioInProgress *into, PgAioInProgress *tomerge)
+{
+   ereport(DEBUG3,
+           errmsg("merging %zu to %zu",
+                  tomerge - aio_ctl->in_progress_io,
+                  into - aio_ctl->in_progress_io),
+           errhidestmt(true),
+           errhidecontext(true));
+
+   into->merge_with = tomerge;
+   into->flags |= PGAIOIP_MERGE;
+}
+
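+/*
+ * Walk the pending list and chain together adjacent combinable IOs, limiting
+ * each merge chain to PGAIO_MAX_COMBINE entries.
+ */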
+static void
+pgaio_combine_pending(void)
+{
+   dlist_iter iter;
+   PgAioInProgress *last = NULL;
+   int combined = 1;
+
+   Assert(my_aio->pending_count > 1);
+
+   dlist_foreach(iter, &my_aio->pending)
+   {
+       PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+       /* can happen when failing partway through io submission */
+       if (cur->merge_with)
+       {
+           elog(DEBUG1, "already merged request (%zu), giving up on merging",
+                cur - aio_ctl->in_progress_io);
+           return;
+       }
+
+       Assert(cur->merge_with == NULL);
+       Assert(!(cur->flags & PGAIOIP_MERGE));
+
+       if (last == NULL)
+       {
+           last = cur;
+           continue;
+       }
+
+       if (pgaio_can_be_combined(last, cur))
+       {
+           combined++;
+
+           pgaio_io_merge(last, cur);
+       }
+       else
+       {
+           combined = 1;
+       }
+
+       if (combined >= PGAIO_MAX_COMBINE)
+       {
+           ereport(DEBUG3,
+                   errmsg("max combine at %d", combined),
+                   errhidestmt(true),
+                   errhidecontext(true));
+           last = NULL;
+           combined = 1;
+       }
+       else
+           last = cur;
+   }
+}
+
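+/*
+ * Submit all of this backend's pending IOs, combining neighbouring requests
+ * where possible, while staying below the per-backend in-flight limit.
+ */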
+void  __attribute__((noinline))
+pgaio_submit_pending(bool drain)
+{
+   int total_submitted = 0;
+   uint32 orig_total;
+
+   if (!aio_ctl || !my_aio)
+       return;
+
+   if (my_aio->pending_count == 0)
+   {
+       Assert(dlist_is_empty(&my_aio->pending));
+       return;
+   }
+
+   HOLD_INTERRUPTS();
+
+   orig_total = my_aio->pending_count;
+
+#define COMBINE_ENABLED
+
+#ifdef COMBINE_ENABLED
+#if 0
+   ereport(LOG, errmsg("before combine"),
+           errhidestmt(true),
+           errhidecontext(true));
+   pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+   if (my_aio->pending_count > 1 && PGAIO_MAX_COMBINE > 1)
+       pgaio_combine_pending();
+
+#if 0
+   ereport(LOG, errmsg("after combine"),
+           errhidestmt(true),
+           errhidecontext(true));
+   pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+#endif /* COMBINE_ENABLED */
+
+   /*
+    * Loop until all pending IOs are submitted. Throttle max in-flight before
+    * calling into the IO implementation specific routine, so this code can
+    * be shared.
+    */
+   while (!dlist_is_empty(&my_aio->pending))
+   {
+       int max_submit;
+       int did_submit;
+
+       Assert(my_aio->pending_count > 0);
+       pgaio_apply_backend_limit();
+
+       Assert(my_aio->pending_count > 0);
+       if (my_aio->pending_count == 0)
+           break;
+
+       max_submit = Min(my_aio->pending_count, PGAIO_SUBMIT_BATCH_SIZE);
+       max_submit = Min(max_submit, io_max_concurrency - pg_atomic_read_u32(&my_aio->inflight_count));
+       Assert(max_submit > 0);
+
+       START_CRIT_SECTION();
+       did_submit = pgaio_uring_submit(max_submit, drain);
+       total_submitted += did_submit;
+       Assert(did_submit > 0 && did_submit <= max_submit);
+       END_CRIT_SECTION();
+   }
+
+   my_aio->executed_total_count += orig_total;
+   my_aio->issued_total_count += total_submitted;
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("submitted %d (orig %d)", total_submitted, orig_total),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   RESUME_INTERRUPTS();
+
+   if (drain)
+       pgaio_call_local_callbacks(/* in_error = */ false);
+}
+
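+/*
+ * Mark an IO (and any IOs merged into it) as in-flight and move it from the
+ * pending list onto the appropriate issued list.
+ */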
+static void
+pgaio_io_prepare_submit(PgAioInProgress *io, uint32 ring)
+{
+   PgAioInProgress *cur;
+
+   cur = io;
+
+   while (cur)
+   {
+       Assert(cur->flags & PGAIOIP_PENDING);
+
+       cur->ring = ring;
+
+       pg_write_barrier();
+
+       *(volatile PgAioIPFlags*) &cur->flags =
+           (cur->flags & ~PGAIOIP_PENDING) | PGAIOIP_INFLIGHT;
+
+       dlist_delete_from(&my_aio->pending, &cur->io_node);
+       my_aio->pending_count--;
+
+       if (cur->flags & PGAIOIP_RETRY)
+       {
+           /* XXX: more error checks */
+       }
+       else if (cur->user_referenced)
+       {
+           Assert(my_aio_id == cur->owner_id);
+           Assert(my_aio->outstanding_count > 0);
+           dlist_delete_from(&my_aio->outstanding, &cur->owner_node);
+           my_aio->outstanding_count--;
+
+           dlist_push_tail(&my_aio->issued, &cur->owner_node);
+           my_aio->issued_count++;
+       }
+       else
+       {
+#ifdef PGAIO_VERBOSE
+           ereport(DEBUG4,
+                   errmsg("putting aio %zu onto issued_abandoned during submit",
+                          cur - aio_ctl->in_progress_io),
+                   errhidecontext(1),
+                   errhidestmt(1));
+#endif
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           dlist_push_tail(&my_aio->issued_abandoned, &cur->owner_node);
+           my_aio->issued_abandoned_count++;
+           LWLockRelease(SharedAIOCtlLock);
+       }
+
+       ereport(DEBUG5,
+               errmsg("readied %zu/%llu for submit",
+                      cur - aio_ctl->in_progress_io,
+                      (unsigned long long ) cur->generation),
+               errhidecontext(1),
+               errhidestmt(1));
+
+       cur = cur->merge_with;
+   }
+}
+
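+/*
+ * If this backend has io_max_concurrency or more IOs in flight, wait for
+ * some of them to complete before submitting more.
+ */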
+static void
+pgaio_apply_backend_limit(void)
+{
+   uint32 current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+
+   while (current_inflight >= io_max_concurrency)
+   {
+       PgAioInProgress *io;
+
+       /*
+        * XXX: Should we be a bit fairer and check the "oldest" in-flight IO
+        * between issued and issued_abandoned?
+        */
+
+       if (my_aio->issued_count > 0)
+       {
+           dlist_iter iter;
+
+           Assert(!dlist_is_empty(&my_aio->issued));
+
+           dlist_foreach(iter, &my_aio->issued)
+           {
+               io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+               if (io->flags & PGAIOIP_INFLIGHT)
+               {
+                   PgAioIoRef ref;
+
+                   ereport(DEBUG2,
+                           errmsg("applying per-backend limit to issued IO %zu/%llu (current %d in %d, target %d)",
+                                  io - aio_ctl->in_progress_io,
+                                  (unsigned long long) io->generation,
+                                  my_aio->issued_count + my_aio->issued_abandoned_count,
+                                  current_inflight,
+                                  io_max_concurrency),
+                           errhidestmt(true),
+                           errhidecontext(true));
+
+                   pgaio_io_ref(io, &ref);
+                   pgaio_io_wait_ref(&ref, /* call_local = */ false);
+                   current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+                   break;
+               }
+           }
+       }
+
+       if (current_inflight < io_max_concurrency)
+           break;
+
+       if (my_aio->issued_abandoned_count > 0)
+       {
+           dlist_iter iter;
+           PgAioIoRef ref;
+
+           io = NULL;
+
+           LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+           dlist_foreach(iter, &my_aio->issued_abandoned)
+           {
+               io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+               if (io->flags & PGAIOIP_INFLIGHT)
+               {
+                   pgaio_io_ref_internal(io, &ref);
+                   break;
+               }
+               else
+                   io = NULL;
+           }
+           LWLockRelease(SharedAIOCtlLock);
+
+           if (io == NULL)
+               continue;
+
+           ereport(DEBUG2,
+                   errmsg("applying per-backend limit to issued_abandoned IO %zu/%llu (current %d in %d, target %d)",
+                          io - aio_ctl->in_progress_io,
+                          (unsigned long long) io->generation,
+                          my_aio->issued_count + my_aio->issued_abandoned_count,
+                          current_inflight,
+                          io_max_concurrency),
+                   errhidestmt(true),
+                   errhidecontext(true));
+
+           pgaio_io_wait_ref(&ref, false);
+       }
+
+       current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+   }
+}
+
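+/*
+ * Wait until the IO referenced by *ref has completed. Returns early if the
+ * IO has already been recycled for a newer generation; soft failures are
+ * retried.
+ */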
+void
+pgaio_io_wait_ref(PgAioIoRef *ref, bool call_local)
+{
+   uint64 ref_generation;
+   PgAioInProgress *io;
+   uint32 done_flags = PGAIOIP_DONE;
+   PgAioIPFlags flags;
+   bool am_owner;
+
+   Assert(ref->aio_index < max_aio_in_progress);
+
+   io = &aio_ctl->in_progress_io[ref->aio_index];
+   ref_generation = ((uint64) ref->generation_upper) << 32 |
+       ref->generation_lower;
+
+   Assert(ref_generation != 0);
+
+   am_owner = io->owner_id == my_aio_id;
+   flags = io->flags;
+   pg_read_barrier();
+
+   if (io->generation != ref_generation)
+       return;
+
+   if (am_owner && (flags & PGAIOIP_PENDING))
+       pgaio_submit_pending(false);
+
+   Assert(!(flags & (PGAIOIP_UNUSED)));
+
+   while (true)
+   {
+       PgAioContext *context;
+
+       flags = io->flags;
+       context = &aio_ctl->contexts[io->ring];
+       pg_read_barrier();
+
+       if (io->generation != ref_generation)
+           return;
+
+       if (flags & done_flags)
+           goto wait_ref_out;
+
+       Assert(!(flags & (PGAIOIP_UNUSED)));
+
+       if (flags & PGAIOIP_INFLIGHT)
+       {
+           pgaio_drain(context, false, call_local);
+
+           flags = io->flags;
+           context = &aio_ctl->contexts[io->ring];
+           pg_read_barrier();
+
+           if (io->generation != ref_generation)
+               return;
+
+           if (flags & done_flags)
+               goto wait_ref_out;
+       }
+
+       if (my_aio->pending_count > 0 && call_local)
+       {
+           /* FIXME: we should call this in a larger number of cases */
+
+           /*
+            * If we otherwise would have to sleep, submit all pending
+            * requests to avoid others having to wait for us to submit
+            * them. Don't do so when not needing to sleep, as
+            * submitting IOs in smaller increments can be less efficient.
+            */
+           pgaio_submit_pending(false);
+       }
+       else if (flags & PGAIOIP_INFLIGHT)
+       {
+           /* note that this is allowed to spuriously return */
+           pgaio_uring_wait_one(context, io, ref_generation, WAIT_EVENT_AIO_IO_COMPLETE_ANY);
+       }
+       else
+       {
+           /* shouldn't be reachable without concurrency */
+           Assert(IsUnderPostmaster);
+
+           /* ensure we're going to get woken up */
+           if (IsUnderPostmaster)
+               ConditionVariablePrepareToSleep(&io->cv);
+
+           flags = io->flags;
+           pg_read_barrier();
+           if (io->generation == ref_generation && !(flags & done_flags))
+               ConditionVariableSleep(&io->cv, WAIT_EVENT_AIO_IO_COMPLETE_ONE);
+
+           if (IsUnderPostmaster)
+               ConditionVariableCancelSleepEx(true);
+       }
+   }
+
+wait_ref_out:
+
+   flags = io->flags;
+   pg_read_barrier();
+   if (io->generation != ref_generation)
+       return;
+
+   Assert(flags & PGAIOIP_DONE);
+
+   if (unlikely(flags & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE)))
+   {
+       /* can retry soft failures, but not hard ones */
+       /* FIXME: limit number of soft retries */
+       if (flags & PGAIOIP_SOFT_FAILURE)
+       {
+           pgaio_io_retry(io);
+           pgaio_io_wait_ref(ref, call_local);
+       }
+       else
+       {
+           pgaio_io_print(io, NULL);
+           elog(WARNING, "request %zd failed permanently",
+                io - aio_ctl->in_progress_io);
+       }
+
+       return;
+   }
+
+   if (am_owner && call_local && !(flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+   {
+       if (flags & PGAIOIP_FOREIGN_DONE)
+       {
+           SpinLockAcquire(&my_aio->foreign_completed_lock);
+           dlist_delete_from(&my_aio->foreign_completed, &io->io_node);
+           io->flags &= ~PGAIOIP_FOREIGN_DONE;
+           my_aio->foreign_completed_count--;
+           SpinLockRelease(&my_aio->foreign_completed_lock);
+       }
+       else
+       {
+           Assert(my_aio->local_completed_count > 0);
+           dlist_delete_from(&my_aio->local_completed, &io->io_node);
+           my_aio->local_completed_count--;
+       }
+
+       pgaio_io_call_local_callback(io, false);
+   }
+}
+
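+/*
+ * Check whether the IO referenced by *ref has already completed (or been
+ * recycled), draining any ready completions along the way.
+ */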
+bool
+pgaio_io_check_ref(PgAioIoRef *ref)
+{
+   uint64 ref_generation;
+   PgAioInProgress *io;
+   uint32 done_flags = PGAIOIP_DONE;
+   PgAioIPFlags flags;
+   PgAioContext *context;
+
+   Assert(ref->aio_index < max_aio_in_progress);
+
+   io = &aio_ctl->in_progress_io[ref->aio_index];
+   ref_generation = ((uint64) ref->generation_upper) << 32 |
+       ref->generation_lower;
+
+   Assert(ref_generation != 0);
+
+   flags = io->flags;
+   pg_read_barrier();
+
+   if (io->generation != ref_generation)
+       return true;
+
+   if (flags & PGAIOIP_PENDING)
+       return false;
+
+   if (flags & done_flags)
+       return true;
+
+   context = &aio_ctl->contexts[io->ring];
+   Assert(!(flags & (PGAIOIP_UNUSED)));
+
+   if (io->generation != ref_generation)
+       return true;
+
+   if (flags & PGAIOIP_INFLIGHT)
+       pgaio_drain(context, false, false);
+
+   flags = io->flags;
+   pg_read_barrier();
+
+   if (io->generation != ref_generation)
+       return true;
+
+   Assert(!(flags & PGAIOIP_PENDING));
+
+   if (flags & done_flags)
+       return true;
+   return false;
+}
+
+
+/*
+ * Wait for an IO owned and referenced by this backend to complete.
+ */
+void
+pgaio_io_wait(PgAioInProgress *io)
+{
+   PgAioIoRef ref;
+
+   Assert(io->user_referenced && io->owner_id == my_aio_id);
+
+   pgaio_io_ref(io, &ref);
+   pgaio_io_wait_ref(&ref, /* call_local = */ true);
+}
+
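+/*
+ * Acquire an unused IO from the shared pool, draining completions if the
+ * pool is temporarily empty, and put it on this backend's outstanding list.
+ */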
+PgAioInProgress *
+pgaio_io_get(void)
+{
+   dlist_node *elem;
+   PgAioInProgress *io;
+
+   Assert(!LWLockHeldByMe(SharedAIOCtlLock));
+
+   /* FIXME: relax? */
+   Assert(my_aio->pending_count < PGAIO_SUBMIT_BATCH_SIZE);
+
+   /* FIXME: wait for an IO to complete if full */
+
+   LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+   while (unlikely(dlist_is_empty(&aio_ctl->unused_ios)))
+   {
+       LWLockRelease(SharedAIOCtlLock);
+       elog(WARNING, "needed to drain while getting IO (used %d inflight %d)",
+            aio_ctl->used_count, pg_atomic_read_u32(&my_aio->inflight_count));
+
+       /*
+        * FIXME: should we wait for IO instead?
+        *
+        * Also, need to protect against too many ios handed out but not used.
+        */
+       for (int i = 0; i < aio_ctl->num_contexts; i++)
+           pgaio_drain(&aio_ctl->contexts[i], false, true);
+
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+   }
+
+   elem = dlist_pop_head_node(&aio_ctl->unused_ios);
+   aio_ctl->used_count++;
+
+   LWLockRelease(SharedAIOCtlLock);
+
+   io = dlist_container(PgAioInProgress, owner_node, elem);
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3, errmsg("acquired io %zu/%llu",
+                          io - aio_ctl->in_progress_io,
+                          (long long unsigned) io->generation),
+           errhidecontext(1),
+           errhidestmt(1));
+#endif
+
+   Assert(io->type == PGAIO_INVALID);
+   Assert(io->flags == PGAIOIP_UNUSED);
+   Assert(io->system_referenced);
+   Assert(io->on_completion_local == NULL);
+
+   io->user_referenced = true;
+   io->system_referenced = false;
+
+   *(volatile PgAioIPFlags*) &io->flags = PGAIOIP_IDLE;
+
+   io->owner_id = my_aio_id;
+
+   dlist_push_tail(&my_aio->outstanding, &io->owner_node);
+   my_aio->outstanding_count++;
+
+   return io;
+}
+
+bool
+pgaio_io_success(PgAioInProgress *io)
+{
+   Assert(io->user_referenced);
+   Assert(io->flags & PGAIOIP_DONE);
+
+   if (io->flags & (PGAIOIP_HARD_FAILURE | PGAIOIP_SOFT_FAILURE))
+       return false;
+
+   /* FIXME: is this possible? */
+   if (!(io->flags & PGAIOIP_SHARED_CALLBACK_CALLED))
+       return false;
+
+   return true;
+}
+
+bool
+pgaio_io_done(PgAioInProgress *io)
+{
+   Assert(io->user_referenced);
+   Assert(!(io->flags & PGAIOIP_UNUSED));
+
+   if (io->flags & PGAIOIP_SOFT_FAILURE)
+       return false;
+
+   if (io->flags & (PGAIOIP_IDLE | PGAIOIP_HARD_FAILURE))
+       return true;
+
+   if (io->flags & PGAIOIP_DONE)
+   {
+       if (io->owner_id == my_aio_id &&
+           !(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+           return false;
+       return true;
+   }
+
+   return false;
+}
+
+static void
+pgaio_io_ref_internal(PgAioInProgress *io, PgAioIoRef *ref)
+{
+   Assert(io->flags & (PGAIOIP_IDLE | PGAIOIP_IN_PROGRESS | PGAIOIP_DONE));
+
+   ref->aio_index = io - aio_ctl->in_progress_io;
+   ref->generation_upper = (uint32) (io->generation >> 32);
+   ref->generation_lower = (uint32) io->generation;
+   Assert(io->generation != 0);
+}
+
+void
+pgaio_io_ref(PgAioInProgress *io, PgAioIoRef *ref)
+{
+   Assert(io->user_referenced);
+   pgaio_io_ref_internal(io, ref);
+}
+
+/*
+ * Register a completion callback that is executed locally in the backend that
+ * initiated the IO, even if the completion of the IO has been reaped by
+ * another process (which executed the shared callback, unlocking buffers
+ * etc).  This is mainly useful for AIO-using code to promptly react to
+ * individual IOs finishing, without having to individually check each of the
+ * IOs.
+ */
+void
+pgaio_io_on_completion_local(PgAioInProgress *io, PgAioOnCompletionLocalContext *ocb)
+{
+   Assert(io->flags & (PGAIOIP_IDLE | PGAIOIP_PENDING));
+   Assert(io->on_completion_local == NULL);
+
+   io->on_completion_local = ocb;
+}
+
+static int
+reopen_buffered(const AioBufferTag *tag)
+{
+   uint32 off;
+   SMgrRelation reln = smgropen(tag->rnode.node, tag->rnode.backend);
+
+   return smgrfd(reln, tag->forkNum, tag->blockNum, &off);
+}
+
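+/*
+ * Retry an IO that failed with a soft (retryable) failure: reset its state,
+ * re-open the underlying file and queue the request as pending again.
+ */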
+void
+pgaio_io_retry(PgAioInProgress *io)
+{
+   bool retryable = false;
+   bool need_retry;
+
+   switch (io->type)
+   {
+       case PGAIO_READ_BUFFER:
+           retryable = true;
+           break;
+
+       case PGAIO_WRITE_BUFFER:
+           retryable = true;
+           break;
+
+       default:
+           break;
+   }
+
+   if (!retryable)
+   {
+       elog(WARNING, "non-retryable aio being retried");
+       return;
+   }
+
+   LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+   /* could concurrently have been unset / retried */
+   if (io->flags & PGAIOIP_SHARED_FAILED)
+   {
+       Assert(!(io->flags & PGAIOIP_FOREIGN_DONE));
+
+       dlist_delete(&io->io_node);
+
+       io->flags =
+           (io->flags & ~(PGAIOIP_SHARED_FAILED |
+                          PGAIOIP_DONE |
+                          PGAIOIP_FOREIGN_DONE |
+                          PGAIOIP_SHARED_CALLBACK_CALLED |
+                          PGAIOIP_LOCAL_CALLBACK_CALLED |
+                          PGAIOIP_HARD_FAILURE |
+                          PGAIOIP_SOFT_FAILURE)) |
+           PGAIOIP_IN_PROGRESS |
+           PGAIOIP_PENDING |
+           PGAIOIP_RETRY;
+
+       need_retry = true;
+   }
+   else
+   {
+       need_retry = false;
+   }
+   LWLockRelease(SharedAIOCtlLock);
+
+   if (!need_retry)
+   {
+       ereport(LOG, errmsg("was about to retry %zd, but somebody else did already",
+                           io - aio_ctl->in_progress_io),
+               errhidestmt(true),
+               errhidecontext(true));
+       pgaio_io_print(io, NULL);
+       return;
+   }
+
+   switch (io->type)
+   {
+       case PGAIO_READ_BUFFER:
+           io->d.read_buffer.fd = reopen_buffered(&io->d.read_buffer.tag);
+           break;
+
+       case PGAIO_WRITE_BUFFER:
+           io->d.write_buffer.fd = reopen_buffered(&io->d.write_buffer.tag);
+           break;
+
+       default:
+           break;
+   }
+
+   dlist_push_tail(&my_aio->pending, &io->io_node);
+   my_aio->pending_count++;
+   my_aio->retry_total_count++;
+
+   pgaio_submit_pending(true);
+}
+
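+/*
+ * Reset a completed (or idle) IO that is still user-referenced so its owner
+ * can reuse it for another request.
+ */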
+void
+pgaio_io_recycle(PgAioInProgress *io)
+{
+   uint32 init_flags = *(volatile PgAioIPFlags*) &io->flags;
+
+   Assert(init_flags & (PGAIOIP_IDLE | PGAIOIP_DONE));
+   Assert(io->user_referenced);
+   Assert(io->owner_id == my_aio_id);
+   Assert(!io->system_referenced);
+   Assert(io->merge_with == NULL);
+
+   if (io->bb)
+   {
+       pgaio_bounce_buffer_release_internal(io->bb, false, false);
+       io->bb = NULL;
+   }
+
+   if (io->flags & PGAIOIP_DONE)
+   {
+       /* request needs to actually be done, including local callbacks */
+       Assert(!(io->flags & PGAIOIP_FOREIGN_DONE));
+       Assert(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED);
+
+       io->generation++;
+       pg_write_barrier();
+
+       io->flags &= ~PGAIOIP_DONE;
+       io->flags |= PGAIOIP_IDLE;
+   }
+
+   io->flags &= ~(PGAIOIP_MERGE |
+                  PGAIOIP_SHARED_CALLBACK_CALLED |
+                  PGAIOIP_LOCAL_CALLBACK_CALLED |
+                  PGAIOIP_RETRY |
+                  PGAIOIP_HARD_FAILURE |
+                  PGAIOIP_SOFT_FAILURE);
+   Assert(io->flags == PGAIOIP_IDLE);
+   io->result = 0;
+   io->on_completion_local = NULL;
+}
+
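+/*
+ * Common preparation for staging a new IO: mark it in-progress/pending and
+ * put it onto this backend's pending list, submitting the current batch
+ * first if it is full.
+ */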
+static void  __attribute__((noinline))
+pgaio_prepare_io(PgAioInProgress *io, PgAioAction action)
+{
+   if (my_aio->pending_count + 1 >= PGAIO_SUBMIT_BATCH_SIZE)
+       pgaio_submit_pending(true);
+
+   /* true for now, but not necessarily in the future */
+   Assert(io->flags == PGAIOIP_IDLE);
+   Assert(io->user_referenced);
+   Assert(io->merge_with == NULL);
+
+   Assert(my_aio->pending_count < PGAIO_SUBMIT_BATCH_SIZE);
+
+   io->flags = (io->flags & ~PGAIOIP_IDLE) | PGAIOIP_IN_PROGRESS | PGAIOIP_PENDING;
+
+   /* for this module */
+   io->system_referenced = true;
+   io->type = action;
+   if (IsUnderPostmaster)
+       io->owner_id = MyProc->pgprocno;
+
+   /* FIXME: should this be done in end_get_io? */
+   dlist_push_tail(&my_aio->pending, &io->io_node);
+   my_aio->pending_count++;
+}
+
+static void  __attribute__((noinline))
+pgaio_finish_io(PgAioInProgress *io)
+{
+}
+
+
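+/*
+ * Release the user reference to an IO. If no system reference remains, the
+ * IO is returned to the shared unused list.
+ */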
+void
+pgaio_io_release(PgAioInProgress *io)
+{
+   Assert(io->user_referenced);
+   Assert(!IsUnderPostmaster || io->owner_id == MyProc->pgprocno);
+
+   LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+   io->user_referenced = false;
+
+   if (io->flags & (PGAIOIP_IDLE |
+                    PGAIOIP_PENDING |
+                    PGAIOIP_LOCAL_CALLBACK_CALLED))
+   {
+       Assert(!(io->flags & PGAIOIP_INFLIGHT));
+
+       Assert(my_aio->outstanding_count > 0);
+       dlist_delete_from(&my_aio->outstanding, &io->owner_node);
+       my_aio->outstanding_count--;
+
+#ifdef PGAIO_VERBOSE
+       ereport(DEBUG3, errmsg("releasing plain user reference to %zu",
+                              io - aio_ctl->in_progress_io),
+               errhidecontext(1),
+               errhidestmt(1));
+#endif
+   }
+   else
+   {
+       dlist_delete_from(&my_aio->issued, &io->owner_node);
+       my_aio->issued_count--;
+
+       if (io->system_referenced)
+       {
+#ifdef PGAIO_VERBOSE
+           ereport(DEBUG4, errmsg("putting aio %zu onto issued_abandoned during release",
+                                  io - aio_ctl->in_progress_io),
+                   errhidecontext(1),
+                   errhidestmt(1));
+#endif
+
+           dlist_push_tail(&my_aio->issued_abandoned, &io->owner_node);
+           my_aio->issued_abandoned_count++;
+       }
+       else
+       {
+           Assert(io->flags & (PGAIOIP_DONE | PGAIOIP_SHARED_CALLBACK_CALLED));
+
+#ifdef PGAIO_VERBOSE
+           ereport(DEBUG4, errmsg("not putting aio %zu onto issued_abandoned during release",
+                                  io - aio_ctl->in_progress_io),
+                   errhidecontext(1),
+                   errhidestmt(1));
+#endif
+       }
+   }
+
+   if (!io->system_referenced)
+   {
+       Assert(!(io->flags & PGAIOIP_INFLIGHT));
+       Assert(!(io->flags & PGAIOIP_MERGE));
+       Assert(io->flags & PGAIOIP_DONE ||
+              io->flags & PGAIOIP_IDLE);
+
+       if (io->flags & PGAIOIP_DONE)
+       {
+           if (io->flags & PGAIOIP_FOREIGN_DONE)
+           {
+               SpinLockAcquire(&my_aio->foreign_completed_lock);
+               Assert(io->flags & PGAIOIP_FOREIGN_DONE);
+               dlist_delete_from(&my_aio->foreign_completed, &io->io_node);
+               my_aio->foreign_completed_count--;
+               SpinLockRelease(&my_aio->foreign_completed_lock);
+           }
+           else if (!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+           {
+               dlist_delete_from(&my_aio->local_completed, &io->io_node);
+               my_aio->local_completed_count--;
+               io->on_completion_local = NULL;
+           }
+
+       }
+
+       io->generation++;
+       pg_write_barrier();
+
+       io->flags = PGAIOIP_UNUSED;
+       io->type = 0;
+       io->owner_id = INVALID_PGPROCNO;
+       io->result = 0;
+       io->system_referenced = true;
+       io->on_completion_local = NULL;
+
+       Assert(io->merge_with == NULL);
+
+       /* could do this earlier or conditionally */
+       if (io->bb)
+       {
+           pgaio_bounce_buffer_release_internal(io->bb,
+                                                /* holding_lock = */ true,
+                                                /* release_resowner = */ false);
+           io->bb = NULL;
+       }
+
+       dlist_push_head(&aio_ctl->unused_ios, &io->owner_node);
+       aio_ctl->used_count--;
+   }
+
+   LWLockRelease(SharedAIOCtlLock);
+}
+
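+/*
+ * Log a summary of in-flight IOs: the total across all backends and the
+ * per-context io_uring queue state.
+ */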
+void
+pgaio_print_queues(void)
+{
+   StringInfoData s;
+   uint32 inflight_backend = 0;
+   uint32 *inflight_context;
+
+   initStringInfo(&s);
+
+   for (int procno = 0; procno < aio_ctl->backend_state_count; procno++)
+   {
+       PgAioPerBackend *bs = &aio_ctl->backend_state[procno];
+
+       inflight_backend += pg_atomic_read_u32(&bs->inflight_count);
+   }
+
+   inflight_context = palloc0(sizeof(uint32) * aio_ctl->backend_state_count);
+   for (int i = 0; i < max_aio_in_progress; i++)
+   {
+       PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+
+       if (!(io->flags & PGAIOIP_INFLIGHT))
+           continue;
+       inflight_context[io->ring]++;
+   }
+
+   appendStringInfo(&s, "inflight backend: %d", inflight_backend);
+
+   for (int contextno = 0; contextno < aio_ctl->num_contexts; contextno++)
+   {
+       PgAioContext *context = &aio_ctl->contexts[contextno];
+
+       appendStringInfo(&s, "\n\tqueue[%d]: space: %d, ready: %d, we think inflight: %d",
+                        contextno,
+                        io_uring_sq_space_left(&context->io_uring_ring),
+                        io_uring_cq_ready(&context->io_uring_ring),
+                        inflight_context[contextno]);
+   }
+
+   ereport(LOG, errmsg_internal("%s", s.data),
+           errhidestmt(true),
+           errhidecontext(true));
+}
+
+static const char *
+pgaio_io_action_string(PgAioAction a)
+{
+   switch(a)
+   {
+       case PGAIO_INVALID:
+           return "invalid";
+       case PGAIO_NOP:
+           return "nop";
+       case PGAIO_FLUSH_RANGE:
+           return "flush_range";
+       case PGAIO_FSYNC:
+           return "fsync";
+       case PGAIO_FSYNC_WAL:
+           return "fsync_wal";
+       case PGAIO_READ_BUFFER:
+           return "read_buffer";
+       case PGAIO_WRITE_BUFFER:
+           return "write_buffer";
+       case PGAIO_WRITE_WAL:
+           return "write_wal";
+       case PGAIO_WRITE_GENERIC:
+           return "write_generic";
+   }
+
+   pg_unreachable();
+}
+
+
+static void
+pgaio_io_flag_string(PgAioIPFlags flags, StringInfo s)
+{
+   bool first = true;
+
+#define STRINGIFY_FLAG(f) if (flags & f) {  appendStringInfoString(s, first ? CppAsString(f) : " | " CppAsString(f)); first = false;}
+
+   STRINGIFY_FLAG(PGAIOIP_UNUSED);
+   STRINGIFY_FLAG(PGAIOIP_IDLE);
+   STRINGIFY_FLAG(PGAIOIP_IN_PROGRESS);
+   STRINGIFY_FLAG(PGAIOIP_PENDING);
+   STRINGIFY_FLAG(PGAIOIP_INFLIGHT);
+   STRINGIFY_FLAG(PGAIOIP_REAPED);
+   STRINGIFY_FLAG(PGAIOIP_SHARED_CALLBACK_CALLED);
+   STRINGIFY_FLAG(PGAIOIP_LOCAL_CALLBACK_CALLED);
+
+   STRINGIFY_FLAG(PGAIOIP_DONE);
+   STRINGIFY_FLAG(PGAIOIP_FOREIGN_DONE);
+
+   STRINGIFY_FLAG(PGAIOIP_MERGE);
+   STRINGIFY_FLAG(PGAIOIP_RETRY);
+   STRINGIFY_FLAG(PGAIOIP_HARD_FAILURE);
+   STRINGIFY_FLAG(PGAIOIP_SOFT_FAILURE);
+   STRINGIFY_FLAG(PGAIOIP_SHARED_FAILED);
+
+#undef STRINGIFY_FLAG
+}
+
+static void
+pgaio_io_action_desc(PgAioInProgress *io, StringInfo s)
+{
+   switch (io->type)
+   {
+       case PGAIO_FSYNC:
+           appendStringInfo(s, "fd: %d, datasync: %d, barrier: %d",
+                            io->d.fsync.fd,
+                            io->d.fsync.datasync,
+                            io->d.fsync.barrier);
+           break;
+       case PGAIO_FSYNC_WAL:
+           appendStringInfo(s, "flush_no: %d, fd: %d, datasync: %d, barrier: %d",
+                            io->d.fsync_wal.flush_no,
+                            io->d.fsync_wal.fd,
+                            io->d.fsync_wal.datasync,
+                            io->d.fsync_wal.barrier);
+           break;
+       case PGAIO_FLUSH_RANGE:
+           appendStringInfo(s, "fd: %d, offset: %llu, nbytes: %llu",
+                            io->d.flush_range.fd,
+                            (unsigned long long) io->d.flush_range.offset,
+                            (unsigned long long) io->d.flush_range.nbytes);
+           break;
+       case PGAIO_READ_BUFFER:
+           appendStringInfo(s, "fd: %d, mode: %d, offset: %u, nbytes: %u, already_done: %u, buf/data: %d/%p",
+                            io->d.read_buffer.fd,
+                            io->d.read_buffer.mode,
+                            io->d.read_buffer.offset,
+                            io->d.read_buffer.nbytes,
+                            io->d.read_buffer.already_done,
+                            io->d.read_buffer.buf,
+                            io->d.read_buffer.bufdata);
+           break;
+       case PGAIO_WRITE_BUFFER:
+           appendStringInfo(s, "fd: %d, offset: %u, nbytes: %u, already_done: %u, release_lock: %d, buf/data: %u/%p",
+                            io->d.write_buffer.fd,
+                            io->d.write_buffer.offset,
+                            io->d.write_buffer.nbytes,
+                            io->d.write_buffer.already_done,
+                            io->d.write_buffer.release_lock,
+                            io->d.write_buffer.buf,
+                            io->d.write_buffer.bufdata);
+           break;
+       case PGAIO_WRITE_WAL:
+           appendStringInfo(s, "write_no: %d, fd: %d, offset: %u, nbytes: %u, already_done: %u, bufdata: %p, no-reorder: %d",
+                            io->d.write_wal.write_no,
+                            io->d.write_wal.fd,
+                            io->d.write_wal.offset,
+                            io->d.write_wal.nbytes,
+                            io->d.write_wal.already_done,
+                            io->d.write_wal.bufdata,
+                            io->d.write_wal.no_reorder);
+           break;
+       case PGAIO_WRITE_GENERIC:
+           appendStringInfo(s, "fd: %d, offset: %llu, nbytes: %u, already_done: %u, bufdata: %p, no-reorder: %d",
+                            io->d.write_generic.fd,
+                            (unsigned long long) io->d.write_generic.offset,
+                            io->d.write_generic.nbytes,
+                            io->d.write_generic.already_done,
+                            io->d.write_generic.bufdata,
+                            io->d.write_generic.no_reorder);
+           break;
+       default:
+           break;
+   }
+}
+
+static void
+pgaio_io_print_one(PgAioInProgress *io, StringInfo s)
+{
+   appendStringInfo(s, "aio %zu/"UINT64_FORMAT": action: %s, ring: %d, init: %d, flags: ",
+                    io - aio_ctl->in_progress_io,
+                    io->generation,
+                    pgaio_io_action_string(io->type),
+                    io->ring,
+                    io->owner_id);
+   pgaio_io_flag_string(io->flags, s);
+   appendStringInfo(s, ", result: %d, user/system_referenced: %d/%d (",
+                    io->result,
+                    io->user_referenced,
+                    io->system_referenced);
+   pgaio_io_action_desc(io, s);
+   appendStringInfoString(s, ")");
+}
+
+void
+pgaio_io_ref_print(PgAioIoRef *ref, StringInfo s)
+{
+   PgAioInProgress *io;
+
+   io = &aio_ctl->in_progress_io[ref->aio_index];
+
+   pgaio_io_print(io, s);
+}
+
+void
+pgaio_io_print(PgAioInProgress *io, StringInfo s)
+{
+   bool alloc = false;
+   MemoryContext old_context;
+
+   if (s == NULL)
+   {
+       old_context = MemoryContextSwitchTo(ErrorContext);
+       s = makeStringInfo();
+       alloc = true;
+   }
+
+   pgaio_io_print_one(io, s);
+
+   {
+       PgAioInProgress *cur = io;
+       int nummerge = 0;
+
+       if (cur->merge_with)
+           appendStringInfoString(s, "\n  merge with:");
+
+       while (cur->merge_with)
+       {
+           nummerge++;
+           appendStringInfo(s, "\n    %d: ", nummerge);
+           pgaio_io_print_one(cur->merge_with, s);
+
+           cur = cur->merge_with;
+       }
+   }
+
+   if (alloc)
+   {
+       ereport(LOG,
+               errmsg("%s", s->data),
+               errhidestmt(true),
+               errhidecontext(true));
+       pfree(s->data);
+       pfree(s);
+       MemoryContextReset(ErrorContext);
+       MemoryContextSwitchTo(old_context);
+   }
+}
+
+void
+pgaio_print_list(dlist_head *head, StringInfo s, size_t offset)
+{
+   bool alloc = false;
+   dlist_iter iter;
+   bool first = true;
+   MemoryContext old_context;
+
+   if (s == NULL)
+   {
+       old_context = MemoryContextSwitchTo(ErrorContext);
+       s = makeStringInfo();
+       alloc = true;
+   }
+
+   dlist_foreach(iter, head)
+   {
+       PgAioInProgress *io = ((PgAioInProgress *) ((char *) (iter.cur) - offset));
+
+       if (!first)
+           appendStringInfo(s, "\n");
+       first = false;
+
+       pgaio_io_print(io, s);
+   }
+
+   if (alloc)
+   {
+       ereport(LOG,
+               errmsg("%s", s->data),
+               errhidestmt(true),
+               errhidecontext(true));
+       pfree(s->data);
+       pfree(s);
+       MemoryContextSwitchTo(old_context);
+       MemoryContextReset(ErrorContext);
+   }
+}
+
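+/*
+ * Acquire a bounce buffer from the shared pool. If none are free, drain
+ * completions on all contexts and retry until one becomes available. The
+ * buffer is tracked by the current resource owner and must be returned
+ * with pgaio_bounce_buffer_release().
+ */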
+PgAioBounceBuffer *
+pgaio_bounce_buffer_get(void)
+{
+   PgAioBounceBuffer *bb = NULL;
+
+   ResourceOwnerEnlargeAioBB(CurrentResourceOwner);
+
+   while (true)
+   {
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+       if (!dlist_is_empty(&aio_ctl->unused_bounce_buffers))
+       {
+           dlist_node *node = dlist_pop_head_node(&aio_ctl->unused_bounce_buffers);
+
+           aio_ctl->unused_bounce_buffers_count--;
+           bb = dlist_container(PgAioBounceBuffer, node, node);
+
+           Assert(pg_atomic_read_u32(&bb->refcount) == 0);
+       }
+       LWLockRelease(SharedAIOCtlLock);
+
+       if (!bb)
+       {
+           elog(WARNING, "needed to drain while getting BB (inflight %d)",
+                pg_atomic_read_u32(&my_aio->inflight_count));
+
+           for (int i = 0; i < aio_ctl->num_contexts; i++)
+               pgaio_drain(&aio_ctl->contexts[i], false, true);
+       }
+       else
+           break;
+   }
+
+   pg_atomic_write_u32(&bb->refcount, 1);
+
+   ResourceOwnerRememberAioBB(CurrentResourceOwner, bb);
+
+   return bb;
+}
+
+static void
+pgaio_bounce_buffer_release_internal(PgAioBounceBuffer *bb, bool holding_lock, bool release_resowner)
+{
+   Assert(holding_lock == LWLockHeldByMe(SharedAIOCtlLock));
+   Assert(bb != NULL);
+
+   if (release_resowner)
+       ResourceOwnerForgetAioBB(CurrentResourceOwner, bb);
+
+   if (pg_atomic_sub_fetch_u32(&bb->refcount, 1) != 0)
+       return;
+
+   if (!holding_lock)
+       LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+   dlist_push_tail(&aio_ctl->unused_bounce_buffers, &bb->node);
+   aio_ctl->unused_bounce_buffers_count++;
+   if (!holding_lock)
+       LWLockRelease(SharedAIOCtlLock);
+}
+
+void
+pgaio_bounce_buffer_release(PgAioBounceBuffer *bb)
+{
+   pgaio_bounce_buffer_release_internal(bb,
+                                        /* holding_lock = */ false,
+                                        /* release_resowner */ true);
+}
+
+char *
+pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb)
+{
+   return bb->buffer;
+}
+
+void
+pgaio_assoc_bounce_buffer(PgAioInProgress *io, PgAioBounceBuffer *bb)
+{
+   Assert(bb != NULL);
+   Assert(io->bb == NULL);
+   Assert(io->flags == PGAIOIP_IDLE);
+   Assert(io->user_referenced);
+   Assert(pg_atomic_read_u32(&bb->refcount) > 0);
+
+   io->bb = bb;
+   pg_atomic_fetch_add_u32(&bb->refcount, 1);
+}
+
+/*
+ * FIXME: Should this be in generic or io_uring-specific code?
+ */
+static PgAioContext *
+pgaio_acquire_context(void)
+{
+   PgAioContext *context;
+   int init_last_context = my_aio->last_context;
+
+   /*
+    * First try to acquire a context without blocking on the lock. We start
+    * with the last context we successfully used, which should lead to
+    * backends spreading to different contexts over time.
+    */
+   for (int i = 0; i < aio_ctl->num_contexts; i++)
+   {
+       context = &aio_ctl->contexts[my_aio->last_context];
+
+       if (LWLockConditionalAcquire(&context->submission_lock, LW_EXCLUSIVE))
+       {
+           return context;
+       }
+
+       if (++my_aio->last_context == aio_ctl->num_contexts)
+           my_aio->last_context = 0;
+   }
+
+   /*
+    * Couldn't acquire any context without blocking. Block on the next one.
+    */
+   if (++my_aio->last_context == aio_ctl->num_contexts)
+       my_aio->last_context = 0;
+   context = &aio_ctl->contexts[my_aio->last_context];
+
+   elog(DEBUG2, "blocking acquiring io context %d, started on %d",
+        my_aio->last_context, init_last_context);
+
+   LWLockAcquire(&context->submission_lock, LW_EXCLUSIVE);
+
+   return context;
+}
+
+/* --------------------------------------------------------------------------------
+ * io_uring related code
+ * --------------------------------------------------------------------------------
+ */
+
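+/*
+ * Submit up to max_submit of this backend's pending IOs into one io_uring
+ * context. Fills SQEs for the IOs, calls io_uring_submit() (retrying on
+ * EINTR), wakes anybody waiting on the IOs' condition variables and, if
+ * drain is true, also processes completions that are already available.
+ * Returns the number of IOs submitted.
+ */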
+static int
+pgaio_uring_submit(int max_submit, bool drain)
+{
+   PgAioInProgress *ios[PGAIO_SUBMIT_BATCH_SIZE];
+   struct io_uring_sqe *sqe[PGAIO_SUBMIT_BATCH_SIZE];
+   PgAioContext *context;
+   int nios = 0;
+
+   context = pgaio_acquire_context();
+
+   Assert(max_submit != 0 && max_submit <= my_aio->pending_count);
+
+   while (!dlist_is_empty(&my_aio->pending))
+   {
+       dlist_node *node;
+       PgAioInProgress *io;
+
+       if (nios == max_submit)
+           break;
+
+       /*
+        * XXX: Should there be a per-ring limit? If so, we'd probably best
+        * apply it here.
+        */
+
+       sqe[nios] = io_uring_get_sqe(&context->io_uring_ring);
+
+       if (!sqe[nios])
+       {
+           Assert(nios != 0);
+           elog(WARNING, "io_uring_get_sqe() returned NULL?");
+           break;
+       }
+
+       node = dlist_head_node(&my_aio->pending);
+       io = dlist_container(PgAioInProgress, io_node, node);
+       ios[nios] = io;
+
+       pgaio_io_prepare_submit(io, context - aio_ctl->contexts);
+
+       pgaio_uring_sq_from_io(context, ios[nios], sqe[nios]);
+
+       nios++;
+   }
+
+   Assert(nios > 0);
+
+   {
+       int ret;
+
+       pg_atomic_add_fetch_u32(&my_aio->inflight_count, nios);
+       my_aio->submissions_total_count++;
+
+again:
+       pgstat_report_wait_start(WAIT_EVENT_AIO_SUBMIT);
+       ret = io_uring_submit(&context->io_uring_ring);
+       pgstat_report_wait_end();
+
+       if (ret == -EINTR)
+           goto again;
+
+       if (ret < 0)
+           elog(PANIC, "failed: %d/%s",
+                ret, strerror(-ret));
+   }
+
+   LWLockRelease(&context->submission_lock);
+
+   /*
+    * Others might have been waiting for this IO. Because it wasn't
+    * marked as in-flight until now, they might be waiting for the
+    * CV. Wake'em up.
+    */
+   for (int i = 0; i < nios; i++)
+   {
+       PgAioInProgress *cur = ios[i];
+
+       while (cur)
+       {
+           ConditionVariableBroadcast(&cur->cv);
+           cur = cur->merge_with;
+       }
+   }
+
+   /* callbacks will be called later by pgaio_submit() */
+   if (drain)
+       pgaio_drain(context, false, false);
+
+   return nios;
+}
+
+static int
+pgaio_uring_drain_locked(PgAioContext *context)
+{
+   uint32 processed = 0;
+   int ready;
+
+   Assert(LWLockHeldByMe(&context->completion_lock));
+
+   /*
+    * Don't drain more events than available right now. Otherwise it's
+    * plausible that one backend could get stuck, for a while, receiving CQEs
+    * without actually processing them.
+    */
+   ready = io_uring_cq_ready(&context->io_uring_ring);
+
+   while (ready > 0)
+   {
+       struct io_uring_cqe *reaped_cqes[PGAIO_MAX_LOCAL_REAPED];
+       uint32 processed_one;
+
+       START_CRIT_SECTION();
+
+       ereport(DEBUG3,
+               errmsg("context [%zu]: drain %d ready",
+                      context - aio_ctl->contexts,
+                      ready),
+               errhidestmt(true),
+               errhidecontext(true));
+
+       processed_one =
+           io_uring_peek_batch_cqe(&context->io_uring_ring,
+                                   reaped_cqes,
+                                   Min(PGAIO_MAX_LOCAL_REAPED, ready));
+       Assert(processed_one <= ready);
+
+       ready -= processed_one;
+       processed += processed_one;
+
+       for (int i = 0; i < processed_one; i++)
+       {
+           struct io_uring_cqe *cqe = reaped_cqes[i];
+
+           pgaio_uring_io_from_cqe(context, cqe);
+
+           io_uring_cqe_seen(&context->io_uring_ring, cqe);
+       }
+
+       END_CRIT_SECTION();
+   }
+
+   if (context->reaped_iovecs_count > context->unused_iovecs_count &&
+       LWLockConditionalAcquire(&context->submission_lock, LW_EXCLUSIVE))
+   {
+       ereport(DEBUG4,
+               errmsg("plenty reaped iovecs (%d), transferring",
+                      context->reaped_iovecs_count),
+               errhidestmt(true),
+               errhidecontext(true));
+
+       pgaio_uring_iovec_transfer(context);
+       LWLockRelease(&context->submission_lock);
+   }
+
+   return processed;
+}
+
+static int
+pgaio_uring_drain(PgAioContext *context)
+{
+   uint32 processed = 0;
+
+   Assert(!LWLockHeldByMe(&context->completion_lock));
+
+   while (io_uring_cq_ready(&context->io_uring_ring))
+   {
+       if (LWLockAcquireOrWait(&context->completion_lock, LW_EXCLUSIVE))
+       {
+           processed = pgaio_uring_drain_locked(context);
+
+           /*
+            * Call shared callbacks under lock - that avoids others having to
+            * wait on the completion_lock for pgaio_uring_wait_one, then
+            * again in pgaio_drain(), and then again on the condition
+            * variable for the AIO.
+            */
+           if (processed > 0)
+               pgaio_uncombine();
+
+           pgaio_complete_ios(false);
+
+           LWLockRelease(&context->completion_lock);
+           break;
+       }
+   }
+
+   return processed;
+}
+
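+/*
+ * Wait for one specific IO, identified by io and ref_generation, to leave
+ * the in-flight state. If the IO is still in flight once the completion
+ * lock is held, wait via io_uring_enter(IORING_ENTER_GETEVENTS); any
+ * completions that are ready afterwards are drained and their shared
+ * callbacks called while the lock is still held.
+ */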
+static void
+pgaio_uring_wait_one(PgAioContext *context, PgAioInProgress *io, uint64 ref_generation, uint32 wait_event_info)
+{
+   PgAioIPFlags flags;
+
+   if (LWLockAcquireOrWait(&context->completion_lock, LW_EXCLUSIVE))
+   {
+       ereport(DEBUG3,
+               errmsg("[%d] got the lock, before waiting for %zu/%llu, %d ready",
+                      (int)(context - aio_ctl->contexts),
+                      io - aio_ctl->in_progress_io,
+                      (long long unsigned) io->generation,
+                      io_uring_cq_ready(&context->io_uring_ring)),
+               errhidestmt(true),
+               errhidecontext(true));
+
+       flags = *(volatile PgAioIPFlags*) &io->flags;
+
+       pg_read_barrier();
+
+       if (!io_uring_cq_ready(&context->io_uring_ring) &&
+           io->generation == ref_generation &&
+           (flags & PGAIOIP_INFLIGHT))
+       {
+           int ret;
+
+           /*
+            * XXX: Temporary sanity checks (not asserts); some of these
+            * conditions are hard to hit in assert builds and lead to rare,
+            * hard-to-debug hangs.
+            */
+           if (io->generation != ref_generation ||
+               &aio_ctl->contexts[io->ring] != context)
+           {
+               ereport(PANIC,
+                       errmsg("generation changed while locked, after wait: %llu, real: %llu",
+                              (long long unsigned) ref_generation,
+                              (long long unsigned) io->generation),
+                       errhidestmt(true),
+                       errhidecontext(true));
+           }
+
+           pgstat_report_wait_start(wait_event_info);
+           ret = __sys_io_uring_enter(context->io_uring_ring.ring_fd,
+                                      0, 1,
+                                      IORING_ENTER_GETEVENTS, NULL);
+           pgstat_report_wait_end();
+
+           if (ret < 0 && errno == EINTR)
+           {
+               elog(DEBUG3, "got interrupted");
+           }
+           else if (ret != 0)
+               elog(WARNING, "unexpected: %d/%s: %m", ret, strerror(-ret));
+
+           /* see XXX above */
+           if (io->generation != ref_generation ||
+               &aio_ctl->contexts[io->ring] != context)
+           {
+               ereport(PANIC,
+                       errmsg("generation changed while locked, after wait: %llu, real: %llu",
+                              (long long unsigned) ref_generation,
+                              (long long unsigned) io->generation),
+                       errhidestmt(true),
+                       errhidecontext(true));
+           }
+
+           ereport(DEBUG3,
+                   errmsg("[%d] after waiting for %zu, %d ready",
+                          (int)(context - aio_ctl->contexts),
+                          io - aio_ctl->in_progress_io,
+                          io_uring_cq_ready(&context->io_uring_ring)),
+                   errhidestmt(true),
+                   errhidecontext(true));
+       }
+       else
+       {
+           ereport(DEBUG3,
+                   errmsg("[%d] got ready under lock, without wait %zu, %d ready",
+                          (int)(context - aio_ctl->contexts),
+                          io - aio_ctl->in_progress_io,
+                          io_uring_cq_ready(&context->io_uring_ring)),
+                   errhidestmt(true),
+                   errhidecontext(true));
+       }
+
+       /*
+        * Drain and call shared callbacks under lock while we already have it
+        * - that avoids others having to first wait on the completion_lock for
+        * pgaio_uring_wait_one, then again in pgaio_drain(), and then again
+        * on the condition variable for the AIO.
+        */
+       if (io_uring_cq_ready(&context->io_uring_ring))
+       {
+           pgaio_uring_drain_locked(context);
+           pgaio_uncombine();
+           pgaio_complete_ios(false);
+       }
+
+       LWLockRelease(&context->completion_lock);
+   }
+   else
+   {
+       ereport(DEBUG3,
+               errmsg("[%d] somebody else got the lock while waiting for %zu/%lu, %d ready",
+                      (int)(context - aio_ctl->contexts),
+                      io - aio_ctl->in_progress_io, io->generation,
+                      io_uring_cq_ready(&context->io_uring_ring)),
+               errhidestmt(true),
+               errhidecontext(true));
+   }
+}
+
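+/*
+ * Transfer one CQE's result to its PgAioInProgress: mark the IO as reaped
+ * instead of in-flight, store the result, queue the IO for completion
+ * processing, and return a used iovec (if any) to the context's reaped
+ * list.
+ */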
+static void
+pgaio_uring_io_from_cqe(PgAioContext *context, struct io_uring_cqe *cqe)
+{
+   PgAioInProgress *io;
+
+   io = io_uring_cqe_get_data(cqe);
+   Assert(io != NULL);
+   Assert(io->flags & PGAIOIP_INFLIGHT);
+   Assert(io->system_referenced);
+
+   *(volatile PgAioIPFlags*) &io->flags = (io->flags & ~PGAIOIP_INFLIGHT) | PGAIOIP_REAPED;
+   io->result = cqe->res;
+
+   dlist_push_tail(&my_aio->reaped, &io->io_node);
+
+   if (io->used_iovec != -1)
+   {
+       PgAioIovec *iovec = &context->iovecs[io->used_iovec];
+
+       slist_push_head(&context->reaped_iovecs, &iovec->node);
+       context->reaped_iovecs_count++;
+   }
+
+   /*
+    * FIXME: needs to be removed at some point, this is effectively a
+    * critical section.
+    */
+   if (cqe->res < 0)
+   {
+       elog(WARNING, "cqe: u: %p s: %d/%s f: %u",
+            io_uring_cqe_get_data(cqe),
+            cqe->res,
+            cqe->res < 0 ? strerror(-cqe->res) : "",
+            cqe->flags);
+   }
+}
+
+/*
+ * FIXME: These need to be deduplicated.
+ */
+static void
+prep_read_buffer_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+   PgAioInProgress *cur;
+   uint32 offset = io->d.read_buffer.offset;
+   int niov = 0;
+
+   cur = io;
+   while (cur)
+   {
+       offset += cur->d.read_buffer.already_done;
+       iovs[niov].iov_base = cur->d.read_buffer.bufdata + cur->d.read_buffer.already_done;
+       iovs[niov].iov_len = cur->d.read_buffer.nbytes - cur->d.read_buffer.already_done;
+
+       niov++;
+       cur = cur->merge_with;
+   }
+
+   io_uring_prep_readv(sqe,
+                       io->d.read_buffer.fd,
+                       iovs,
+                       niov,
+                       offset);
+}
+
+static void
+prep_write_buffer_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+   PgAioInProgress *cur;
+   uint32 offset = io->d.write_buffer.offset;
+   int niov = 0;
+
+   cur = io;
+   while (cur)
+   {
+       offset += cur->d.write_buffer.already_done;
+       iovs[niov].iov_base = cur->d.write_buffer.bufdata + cur->d.write_buffer.already_done;
+       iovs[niov].iov_len = cur->d.write_buffer.nbytes - cur->d.write_buffer.already_done;
+
+       niov++;
+       cur = cur->merge_with;
+   }
+
+   io_uring_prep_writev(sqe,
+                        io->d.write_buffer.fd,
+                        iovs,
+                        niov,
+                        offset);
+}
+
+static void
+prep_write_wal_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+   PgAioInProgress *cur;
+   uint32 offset = io->d.write_wal.offset;
+   int niov = 0;
+
+   cur = io;
+   while (cur)
+   {
+       offset += cur->d.write_wal.already_done;
+       iovs[niov].iov_base = cur->d.write_wal.bufdata + cur->d.write_wal.already_done;
+       iovs[niov].iov_len = cur->d.write_wal.nbytes - cur->d.write_wal.already_done;
+
+       niov++;
+       cur = cur->merge_with;
+   }
+
+   io_uring_prep_writev(sqe,
+                       io->d.write_wal.fd,
+                       iovs,
+                       niov,
+                       offset);
+}
+
+static void
+prep_write_generic_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+   PgAioInProgress *cur;
+   off_t offset = io->d.write_generic.offset;
+   int niov = 0;
+
+   cur = io;
+   while (cur)
+   {
+       offset += cur->d.write_generic.already_done;
+       iovs[niov].iov_base = cur->d.write_generic.bufdata + cur->d.write_generic.already_done;
+       iovs[niov].iov_len = cur->d.write_generic.nbytes - cur->d.write_generic.already_done;
+
+       niov++;
+       cur = cur->merge_with;
+   }
+
+   io_uring_prep_writev(sqe,
+                       io->d.write_generic.fd,
+                       iovs,
+                       niov,
+                       offset);
+}
+
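+/*
+ * Move all reaped iovecs back to the context's unused list. Both the
+ * submission and the completion lock need to be held.
+ */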
+static void
+pgaio_uring_iovec_transfer(PgAioContext *context)
+{
+   Assert(LWLockHeldByMe(&context->submission_lock));
+   Assert(LWLockHeldByMe(&context->completion_lock));
+
+   while (!slist_is_empty(&context->reaped_iovecs))
+   {
+       slist_push_head(&context->unused_iovecs, slist_pop_head_node(&context->reaped_iovecs));
+   }
+
+   context->unused_iovecs_count += context->reaped_iovecs_count;
+   context->reaped_iovecs_count = 0;
+}
+
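+/*
+ * Return an unused iovec for io, transferring reaped iovecs back first if
+ * none are unused, and record its index in io->used_iovec.
+ */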
+static struct PgAioIovec *
+pgaio_uring_iovec_get(PgAioContext *context, PgAioInProgress *io)
+{
+   slist_node *node;
+   PgAioIovec *iov;
+
+   if (context->unused_iovecs_count == 0)
+   {
+       ereport(DEBUG2,
+               errmsg("out of unused iovecs, transferring %d reaped ones",
+                      context->reaped_iovecs_count),
+               errhidestmt(true),
+               errhidecontext(true));
+       LWLockAcquire(&context->completion_lock, LW_EXCLUSIVE);
+       Assert(context->reaped_iovecs_count > 0);
+       pgaio_uring_iovec_transfer(context);
+       LWLockRelease(&context->completion_lock);
+       Assert(context->unused_iovecs_count > 0);
+   }
+
+   context->unused_iovecs_count--;
+   node = slist_pop_head_node(&context->unused_iovecs);
+   iov = slist_container(PgAioIovec, node, node);
+
+   io->used_iovec = iov - context->iovecs;
+
+   return iov;
+}
+
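+/*
+ * Fill an io_uring SQE according to the IO's type and parameters, acquiring
+ * an iovec for the vectored read/write cases, and attach the IO as the
+ * SQE's user data.
+ */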
+static void
+pgaio_uring_sq_from_io(PgAioContext *context, PgAioInProgress *io, struct io_uring_sqe *sqe)
+{
+   PgAioIovec *iovec;
+
+   io->used_iovec = -1;
+
+   switch (io->type)
+   {
+       case PGAIO_FSYNC:
+           io_uring_prep_fsync(sqe,
+                               io->d.fsync.fd,
+                               io->d.fsync.datasync ? IORING_FSYNC_DATASYNC : 0);
+           if (io->d.fsync.barrier)
+               sqe->flags |= IOSQE_IO_DRAIN;
+           break;
+
+       case PGAIO_FSYNC_WAL:
+           io_uring_prep_fsync(sqe,
+                               io->d.fsync_wal.fd,
+                               io->d.fsync_wal.datasync ? IORING_FSYNC_DATASYNC : 0);
+           if (io->d.fsync_wal.barrier)
+               sqe->flags |= IOSQE_IO_DRAIN;
+           break;
+
+       case PGAIO_READ_BUFFER:
+           iovec = pgaio_uring_iovec_get(context, io);
+           prep_read_buffer_iov(io, sqe, iovec->iovec);
+           //sqe->flags |= IOSQE_ASYNC;
+           break;
+
+       case PGAIO_WRITE_BUFFER:
+           iovec = pgaio_uring_iovec_get(context, io);
+           prep_write_buffer_iov(io, sqe, iovec->iovec);
+           break;
+
+       case PGAIO_FLUSH_RANGE:
+           io_uring_prep_rw(IORING_OP_SYNC_FILE_RANGE,
+                            sqe,
+                            io->d.flush_range.fd,
+                            NULL,
+                            io->d.flush_range.nbytes,
+                            io->d.flush_range.offset);
+           sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
+           break;
+
+       case PGAIO_WRITE_WAL:
+           iovec = pgaio_uring_iovec_get(context, io);
+           prep_write_wal_iov(io, sqe, iovec->iovec);
+           if (io->d.write_wal.no_reorder)
+               sqe->flags = IOSQE_IO_DRAIN;
+           break;
+
+       case PGAIO_WRITE_GENERIC:
+           iovec = pgaio_uring_iovec_get(context, io);
+           prep_write_generic_iov(io, sqe, iovec->iovec);
+           if (io->d.write_generic.no_reorder)
+               sqe->flags = IOSQE_IO_DRAIN;
+           break;
+
+       case PGAIO_NOP:
+           elog(ERROR, "not yet");
+           break;
+
+       case PGAIO_INVALID:
+           elog(ERROR, "invalid");
+   }
+
+   io_uring_sqe_set_data(sqe, io);
+}
+
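+/*
+ * Raw wrapper around the io_uring_enter(2) syscall, defining the syscall
+ * number ourselves if the system headers don't provide it.
+ */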
+static int
+__sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+            unsigned flags, sigset_t *sig)
+{
+
+# ifndef __NR_io_uring_enter
+#  define __NR_io_uring_enter      426
+# endif
+
+   return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
+           flags, sig, _NSIG / 8);
+}
+
+
+
+/* --------------------------------------------------------------------------------
+ * Code dealing with specific IO types
+ * --------------------------------------------------------------------------------
+ */
+
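+/*
+ * Start a flush (sync_file_range style) of a byte range of a file. Like the
+ * other pgaio_io_start_* routines below, this fills in the IO's parameters
+ * and hands it to pgaio_finish_io().
+ */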
+void
+pgaio_io_start_flush_range(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes)
+{
+   pgaio_prepare_io(io, PGAIO_FLUSH_RANGE);
+
+   io->d.flush_range.fd = fd;
+   io->d.flush_range.offset = offset;
+   io->d.flush_range.nbytes = nbytes;
+
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "start_flush_range %zu: %d, %llu, %llu",
+        io - aio_ctl->in_progress_io,
+        fd, (unsigned long long) offset, (unsigned long long) nbytes);
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_read_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes, char *bufdata, int buffno, int mode)
+{
+   Assert(ShmemAddrIsValid(bufdata));
+
+   pgaio_prepare_io(io, PGAIO_READ_BUFFER);
+
+   io->d.read_buffer.buf = buffno;
+   io->d.read_buffer.mode = mode;
+   io->d.read_buffer.fd = fd;
+   io->d.read_buffer.offset = offset;
+   io->d.read_buffer.nbytes = nbytes;
+   io->d.read_buffer.bufdata = bufdata;
+   io->d.read_buffer.already_done = 0;
+   memcpy(&io->d.read_buffer.tag, tag, sizeof(io->d.read_buffer.tag));
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("start_read_buffer %zu: "
+                  "fd %d, off: %llu, bytes: %llu, buff: %d, data %p",
+                  io - aio_ctl->in_progress_io,
+                  fd,
+                  (unsigned long long) offset,
+                  (unsigned long long) nbytes,
+                  buffno,
+                  bufdata),
+        errhidestmt(true),
+        errhidecontext(true));
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes, char *bufdata, int buffno, bool release_lock)
+{
+   Assert(ShmemAddrIsValid(bufdata));
+
+   pgaio_prepare_io(io, PGAIO_WRITE_BUFFER);
+
+   io->d.write_buffer.buf = buffno;
+   io->d.write_buffer.fd = fd;
+   io->d.write_buffer.offset = offset;
+   io->d.write_buffer.nbytes = nbytes;
+   io->d.write_buffer.bufdata = bufdata;
+   io->d.write_buffer.already_done = 0;
+   io->d.write_buffer.release_lock = release_lock;
+   memcpy(&io->d.write_buffer.tag, tag, sizeof(io->d.write_buffer.tag));
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("start_write_buffer %zu: "
+                  "fd %d, off: %llu, nbytes: %llu, rel: %d, buff: %d, data %p",
+                  io - aio_ctl->in_progress_io,
+                  fd,
+                  (unsigned long long) offset,
+                  (unsigned long long) nbytes,
+                  release_lock,
+                  buffno,
+                  bufdata),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_wal(PgAioInProgress *io, int fd, uint32 offset, uint32 nbytes, char *bufdata, bool no_reorder, uint32 write_no)
+{
+   Assert(ShmemAddrIsValid(bufdata));
+
+   pgaio_prepare_io(io, PGAIO_WRITE_WAL);
+
+   io->d.write_wal.fd = fd;
+   io->d.write_wal.no_reorder = no_reorder;
+   io->d.write_wal.offset = offset;
+   io->d.write_wal.nbytes = nbytes;
+   io->d.write_wal.bufdata = bufdata;
+   io->d.write_wal.already_done = 0;
+   io->d.write_wal.write_no = write_no;
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("start_write_wal %zu:"
+                  "fd %d, off: %llu, bytes: %llu, write_no: %d, no_reorder: %d, data %p",
+                  io - aio_ctl->in_progress_io,
+                  fd,
+                  (unsigned long long) offset,
+                  (unsigned long long) nbytes,
+                  write_no,
+                  no_reorder,
+                  bufdata),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_generic(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes, char *bufdata, bool no_reorder)
+{
+   Assert(ShmemAddrIsValid(bufdata));
+
+   pgaio_prepare_io(io, PGAIO_WRITE_GENERIC);
+
+   io->d.write_generic.fd = fd;
+   io->d.write_generic.no_reorder = no_reorder;
+   io->d.write_generic.offset = offset;
+   io->d.write_generic.nbytes = nbytes;
+   io->d.write_generic.bufdata = bufdata;
+   io->d.write_generic.already_done = 0;
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("start_write_generic %zu:"
+                  "fd %d, off: %llu, bytes: %llu, no_reorder: %d, data %p",
+                  io - aio_ctl->in_progress_io,
+                  fd,
+                  (unsigned long long) offset,
+                  (unsigned long long) nbytes,
+                  no_reorder,
+                  bufdata),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_nop(PgAioInProgress *io)
+{
+   pgaio_prepare_io(io, PGAIO_NOP);
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fsync(PgAioInProgress *io, int fd, bool barrier)
+{
+   pgaio_prepare_io(io, PGAIO_FSYNC);
+   io->d.fsync.fd = fd;
+   io->d.fsync.barrier = barrier;
+   io->d.fsync.datasync = false;
+
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "start_fsync %zu:"
+        "fd %d, is_barrier: %d, is_datasync: %d",
+        io - aio_ctl->in_progress_io,
+        fd,
+        barrier,
+        false);
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fdatasync(PgAioInProgress *io, int fd, bool barrier)
+{
+   pgaio_prepare_io(io, PGAIO_FSYNC);
+   io->d.fsync.fd = fd;
+   io->d.fsync.barrier = barrier;
+   io->d.fsync.datasync = true;
+
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "start_fsync %zu:"
+        "fd %d, is_barrier: %d, is_datasync: %d",
+        io - aio_ctl->in_progress_io,
+        fd,
+        barrier,
+        true);
+#endif
+
+   pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fsync_wal(PgAioInProgress *io, int fd, bool barrier, bool datasync_only, uint32 flush_no)
+{
+   pgaio_prepare_io(io, PGAIO_FSYNC_WAL);
+   io->d.fsync_wal.fd = fd;
+   io->d.fsync_wal.barrier = barrier;
+   io->d.fsync_wal.datasync = datasync_only;
+   io->d.fsync_wal.flush_no = flush_no;
+
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "start_fsync_wal %zu:"
+        "fd %d, is_barrier: %d, is_datasync: %d, flush_no: %d",
+        io - aio_ctl->in_progress_io,
+        fd,
+        barrier,
+        datasync_only,
+        flush_no);
+#endif
+
+   pgaio_finish_io(io);
+}
+
+static bool
+pgaio_complete_nop(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "completed nop");
+#endif
+
+   return true;
+}
+
+static bool
+pgaio_complete_fsync(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "completed fsync: %zu",
+        io - aio_ctl->in_progress_io);
+#endif
+   if (io->result != 0)
+       elog(PANIC, "fsync needs better error handling");
+
+   return true;
+}
+
+static bool
+pgaio_complete_fsync_wal(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "completed fsync_wal: %zu",
+        io - aio_ctl->in_progress_io);
+#endif
+   if (io->result != 0)
+       elog(PANIC, "fsync_wal needs better error handling");
+
+   XLogFlushComplete(io, io->d.fsync_wal.flush_no);
+   return true;
+}
+
+static bool
+pgaio_complete_flush_range(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   elog(DEBUG3, "completed flush_range: %zu, %s",
+        io - aio_ctl->in_progress_io,
+        io->result < 0 ? strerror(-io->result) : "ok");
+#endif
+
+   return true;
+}
+
+static bool
+pgaio_complete_read_buffer(PgAioInProgress *io)
+{
+   Buffer      buffer = io->d.read_buffer.buf;
+
+   bool        call_completion;
+   bool        failed;
+   bool        done;
+
+   /* great for torturing error handling */
+#if 0
+   if (io->result > 4096)
+       io->result = 4096;
+#endif
+
+#ifdef PGAIO_VERBOSE
+   ereport(io->flags & PGAIOIP_RETRY ? DEBUG1 : DEBUG3,
+           errmsg("completed read_buffer: %zu, %d/%s, buf %d",
+                  io - aio_ctl->in_progress_io,
+                  io->result,
+                  io->result < 0 ? strerror(-io->result) : "ok",
+                  io->d.read_buffer.buf),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   if (io->result != (io->d.read_buffer.nbytes - io->d.read_buffer.already_done))
+   {
+       MemoryContext old_context = MemoryContextSwitchTo(aio_retry_context);
+
+       failed = true;
+
+       //pgaio_io_print(io, NULL);
+
+       if (io->result < 0)
+       {
+           if (io->result == -EAGAIN || io->result == -EINTR)
+           {
+               elog(PANIC, "need to implement retries for failed requests");
+           }
+           else
+           {
+               ereport(WARNING,
+                       errcode_for_file_access(),
+                       errmsg("could not read block %u in file \"%s\": %s",
+                              io->d.read_buffer.tag.blockNum,
+                              relpath(io->d.read_buffer.tag.rnode,
+                                      io->d.read_buffer.tag.forkNum),
+                              strerror(-io->result)));
+           }
+
+           call_completion = true;
+           done = true;
+       }
+       else
+       {
+           io->flags |= PGAIOIP_SOFT_FAILURE;
+           call_completion = false;
+           done = false;
+           io->d.read_buffer.already_done += io->result;
+
+           /*
+            * This is actually pretty common and harmless, happens when part
+            * of the block is in the kernel page cache but the rest
+            * isn't. So don't issue a WARNING/ERROR, just retry.
+            *
+            * While it can happen with single BLCKSZ reads (since they're
+            * bigger than typical page sizes), it's made much more likely by
+            * us combining reads.
+            *
+            * XXX: Should we handle repeated failures for the same blocks
+            * differently?
+            */
+           ereport(DEBUG1,
+                   errcode(ERRCODE_DATA_CORRUPTED),
+                   errmsg("aio %zd: could not read block %u in file \"%s\": read only %d of %d bytes (init: %d, cur: %d)",
+                          io - aio_ctl->in_progress_io,
+                          io->d.read_buffer.tag.blockNum,
+                          relpath(io->d.read_buffer.tag.rnode,
+                                  io->d.read_buffer.tag.forkNum),
+                          io->result, BLCKSZ,
+                          io->owner_id, MyProc ? MyProc->pgprocno : INVALID_PGPROCNO));
+       }
+
+       MemoryContextSwitchTo(old_context);
+       MemoryContextReset(aio_retry_context);
+   }
+   else
+   {
+       io->d.read_buffer.already_done += io->result;
+       Assert(io->d.read_buffer.already_done == BLCKSZ);
+
+       call_completion = true;
+       failed = false;
+       done = true;
+   }
+
+   if (call_completion)
+       ReadBufferCompleteRead(buffer,
+                              &io->d.read_buffer.tag,
+                              io->d.read_buffer.bufdata,
+                              io->d.read_buffer.mode,
+                              failed);
+
+   return done;
+}
+
+static bool
+pgaio_complete_write_buffer(PgAioInProgress *io)
+{
+   Buffer      buffer = io->d.write_buffer.buf;
+
+   bool        call_completion;
+   bool        failed;
+   bool        done;
+
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("completed write_buffer: %zu, %d/%s, buf %d",
+                  io - aio_ctl->in_progress_io,
+                  io->result,
+                  io->result < 0 ? strerror(-io->result) : "ok",
+                  io->d.write_buffer.buf),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   if (io->result != (io->d.write_buffer.nbytes - io->d.write_buffer.already_done))
+   {
+       MemoryContext old_context = MemoryContextSwitchTo(aio_retry_context);
+
+       failed = true;
+
+       if (io->result < 0)
+       {
+           int elevel;
+
+           failed = true;
+
+           if (io->result == -EAGAIN || io->result == -EINTR)
+           {
+               io->flags |= PGAIOIP_SOFT_FAILURE;
+
+               call_completion = false;
+               done = false;
+
+               elevel = DEBUG1;
+           }
+           else
+           {
+               io->flags |= PGAIOIP_HARD_FAILURE;
+               elevel = WARNING;
+
+               call_completion = true;
+               done = true;
+
+               pgaio_io_print(io, NULL);
+           }
+
+           ereport(elevel,
+                   errcode_for_file_access(),
+                   errmsg("aio %zd: could not write block %u in file \"%s\": %s",
+                          io - aio_ctl->in_progress_io,
+                          io->d.write_buffer.tag.blockNum,
+                          relpath(io->d.write_buffer.tag.rnode,
+                                  io->d.write_buffer.tag.forkNum),
+                          strerror(-io->result)),
+                   errhint("Check free disk space."));
+       }
+       else
+       {
+           io->flags |= PGAIOIP_SOFT_FAILURE;
+           io->d.write_buffer.already_done += io->result;
+
+           call_completion = false;
+           done = false;
+
+           ereport(WARNING,
+                   (errcode(ERRCODE_DATA_CORRUPTED),
+                    errmsg("aio %zd: could not write block %u in file \"%s\": wrote only %d of %d bytes (init: %d, cur: %d)",
+                           io - aio_ctl->in_progress_io,
+                           io->d.write_buffer.tag.blockNum,
+                           relpath(io->d.write_buffer.tag.rnode,
+                                   io->d.write_buffer.tag.forkNum),
+                           io->result, (io->d.write_buffer.nbytes - io->d.write_buffer.already_done),
+                           io->owner_id, MyProc ? MyProc->pgprocno : INVALID_PGPROCNO)));
+       }
+
+       MemoryContextSwitchTo(old_context);
+       MemoryContextReset(aio_retry_context);
+   }
+   else
+   {
+       io->d.write_buffer.already_done += io->result;
+       Assert(io->d.write_buffer.already_done == BLCKSZ);
+
+       call_completion = true;
+       failed = false;
+       done = true;
+   }
+
+   if (call_completion && BufferIsValid(buffer))
+       ReadBufferCompleteWrite(buffer, failed, io->d.write_buffer.release_lock);
+
+   return done;
+}
+
+static bool
+pgaio_complete_write_wal(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("completed write_wal: %zu, %d/%s",
+                  io - aio_ctl->in_progress_io,
+                  io->result,
+                  io->result < 0 ? strerror(-io->result) : "ok"),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   if (io->result < 0)
+   {
+       if (io->result == -EAGAIN || io->result == -EINTR)
+       {
+           elog(WARNING, "need to implement retries");
+       }
+
+       ereport(PANIC,
+               (errcode_for_file_access(),
+                errmsg("could not write to log file: %s",
+                       strerror(-io->result))));
+   }
+   else if (io->result != (io->d.write_wal.nbytes - io->d.write_wal.already_done))
+   {
+       /* FIXME: implement retries for short writes */
+       ereport(PANIC,
+               (errcode_for_file_access(),
+                errmsg("could not write to log file: wrote only %d of %d bytes",
+                       io->result, (io->d.write_wal.nbytes - io->d.write_wal.already_done))));
+   }
+
+   XLogWriteComplete(io, io->d.write_wal.write_no);
+
+   return true;
+}
+
+
+static bool
+pgaio_complete_write_generic(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+   ereport(DEBUG3,
+           errmsg("completed write_generic: %zu, %d/%s",
+                  io - aio_ctl->in_progress_io,
+                  io->result,
+                  io->result < 0 ? strerror(-io->result) : "ok"),
+           errhidestmt(true),
+           errhidecontext(true));
+#endif
+
+   if (io->result < 0)
+   {
+       if (io->result == -EAGAIN || io->result == -EINTR)
+       {
+           elog(WARNING, "need to implement retries");
+       }
+
+       ereport(PANIC,
+               (errcode_for_file_access(),
+                errmsg("could not write to log file: %s",
+                       strerror(-io->result))));
+   }
+   else if (io->result != (io->d.write_generic.nbytes - io->d.write_generic.already_done))
+   {
+       /* FIXME: implement retries for short writes */
+       /* FIXME: not WAL */
+       ereport(PANIC,
+               (errcode_for_file_access(),
+                errmsg("could not write to log file: wrote only %d of %d bytes",
+                       io->result, (io->d.write_generic.nbytes - io->d.write_generic.already_done))));
+   }
+
+   return true;
+}
+
+
+/* --------------------------------------------------------------------------------
+ * SQL interface functions
+ * --------------------------------------------------------------------------------
+ */
+
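+/*
+ * Return one row per connected backend with its AIO activity counters,
+ * both totals and current queue depths.
+ */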
+Datum
+pg_stat_get_aio_backends(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_AIO_BACKEND_COLS   13
+
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   TupleDesc   tupdesc;
+   Tuplestorestate *tupstore;
+   MemoryContext per_query_ctx;
+   MemoryContext oldcontext;
+
+   /* check to see if caller supports us returning a tuplestore */
+   if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("set-valued function called in context that cannot accept a set")));
+   if (!(rsinfo->allowedModes & SFRM_Materialize))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("materialize mode required, but it is not allowed in this context")));
+
+   /* Build a tuple descriptor for our result type */
+   if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+       elog(ERROR, "return type must be a row type");
+
+   per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+   oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+   tupstore = tuplestore_begin_heap(true, false, work_mem);
+   rsinfo->returnMode = SFRM_Materialize;
+   rsinfo->setResult = tupstore;
+   rsinfo->setDesc = tupdesc;
+
+   MemoryContextSwitchTo(oldcontext);
+
+   for (int i = 0; i < aio_ctl->backend_state_count; i++)
+   {
+       PgAioPerBackend *bs = &aio_ctl->backend_state[i];
+       Datum       values[PG_STAT_GET_AIO_BACKEND_COLS];
+       bool        nulls[PG_STAT_GET_AIO_BACKEND_COLS];
+       int         pid = ProcGlobal->allProcs[i].pid;
+
+       if (pid == 0)
+           continue;
+
+       memset(nulls, 0, sizeof(nulls));
+
+       values[ 0] = Int32GetDatum(pid);
+       values[ 1] = Int64GetDatum(bs->executed_total_count);
+       values[ 2] = Int64GetDatum(bs->issued_total_count);
+       values[ 3] = Int64GetDatum(bs->submissions_total_count);
+       values[ 4] = Int64GetDatum(bs->foreign_completed_total_count);
+       values[ 5] = Int64GetDatum(bs->retry_total_count);
+       values[ 6] = Int64GetDatum(pg_atomic_read_u32(&bs->inflight_count));
+       values[ 7] = Int32GetDatum(bs->unused_count);
+       values[ 8] = Int32GetDatum(bs->outstanding_count);
+       values[ 9] = Int32GetDatum(bs->pending_count);
+       values[10] = Int32GetDatum(bs->local_completed_count);
+       values[11] = Int32GetDatum(bs->foreign_completed_count);
+       values[12] = Int32GetDatum(bs->last_context);
+
+       tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+   }
+
+   /* clean up and return the tuplestore */
+   tuplestore_donestoring(tupstore);
+
+   return (Datum) 0;
+}
+
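+/*
+ * Return one row per in-use AIO, including its action, flags, ring, owning
+ * backend, generation, result, and a description of its parameters.
+ */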
+Datum
+pg_stat_get_aios(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_AIOS_COLS  8
+
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+   TupleDesc   tupdesc;
+   Tuplestorestate *tupstore;
+   MemoryContext per_query_ctx;
+   MemoryContext oldcontext;
+   StringInfoData tmps;
+
+   /* check to see if caller supports us returning a tuplestore */
+   if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("set-valued function called in context that cannot accept a set")));
+   if (!(rsinfo->allowedModes & SFRM_Materialize))
+       ereport(ERROR,
+               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                errmsg("materialize mode required, but it is not allowed in this context")));
+
+   /* Build a tuple descriptor for our result type */
+   if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+       elog(ERROR, "return type must be a row type");
+
+   per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+   oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+   tupstore = tuplestore_begin_heap(true, false, work_mem);
+   rsinfo->returnMode = SFRM_Materialize;
+   rsinfo->setResult = tupstore;
+   rsinfo->setDesc = tupdesc;
+
+   MemoryContextSwitchTo(oldcontext);
+
+   initStringInfo(&tmps);
+
+   for (int i = 0; i < max_aio_in_progress; i++)
+   {
+       PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+       Datum       values[PG_STAT_GET_AIOS_COLS];
+       bool        nulls[PG_STAT_GET_AIOS_COLS];
+       int         owner_pid;
+       uint32      owner_id;
+       PgAioInProgressFlags flags = io->flags;
+
+       if (flags & PGAIOIP_UNUSED)
+           continue;
+
+       memset(nulls, 0, sizeof(nulls));
+
+       values[ 0] = Int32GetDatum(i);
+       values[ 1] = PointerGetDatum(cstring_to_text(pgaio_io_action_string(io->type)));
+
+       pgaio_io_flag_string(flags, &tmps);
+       values[ 2] = PointerGetDatum(cstring_to_text(tmps.data));
+       resetStringInfo(&tmps);
+
+       values[ 3] = Int32GetDatum(io->ring);
+
+       owner_id = io->owner_id;    /* XXX: READ_ONCE needed? */
+       if (owner_id != INVALID_PGPROCNO)
+       {
+           owner_pid = ProcGlobal->allProcs[owner_id].pid;
+           values[ 4] = Int32GetDatum(owner_pid);
+       }
+       else
+       {
+           nulls[ 4] = true;
+       }
+
+       values[ 5] = Int64GetDatum(io->generation);
+
+       values[ 6] = Int32GetDatum(io->result);
+
+       pgaio_io_action_desc(io, &tmps);
+       values[ 7] = PointerGetDatum(cstring_to_text(tmps.data));
+       resetStringInfo(&tmps);
+
+       tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+   }
+
+   /* clean up and return the tuplestore */
+   tuplestore_donestoring(tupstore);
+
+   return (Datum) 0;
+}
diff --git a/src/backend/storage/ipc/aio_util.c b/src/backend/storage/ipc/aio_util.c
new file mode 100644 (file)
index 0000000..e4bf819
--- /dev/null
@@ -0,0 +1,547 @@
+#include "postgres.h"
+
+#include "storage/aio.h"
+#include "miscadmin.h"
+
+/* typedef is in header */
+typedef struct PgStreamingWriteItem
+{
+   /* membership in PgStreamingWrite->issued, PgStreamingWrite->available */
+   dlist_node node;
+
+   PgAioInProgress *aio;
+   bool in_progress;
+   void *private_data;
+
+   struct pg_streaming_write *pgsw;
+
+   PgAioOnCompletionLocalContext on_completion;
+} PgStreamingWriteItem;
+
+/* typedef is in header */
+struct pg_streaming_write
+{
+   uint32 iodepth;
+
+   uint32 inflight_count;
+
+   void *private_data;
+
+   pg_streaming_write_completed on_completion;
+
+   /* submitted writes */
+   dlist_head issued;
+
+   /* available writes (unused or completed) */
+   dlist_head available;
+
+   PgStreamingWriteItem all_items[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static void pg_streaming_write_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+
+pg_streaming_write*
+pg_streaming_write_alloc(uint32 iodepth, void *private_data, pg_streaming_write_completed on_completion)
+{
+   pg_streaming_write *pgsw;
+
+   iodepth = Max(Min(iodepth, NBuffers / 128), 1);
+
+   pgsw = palloc0(offsetof(pg_streaming_write, all_items)
+                + sizeof(PgStreamingWriteItem) * iodepth);
+
+   pgsw->iodepth = iodepth;
+   pgsw->private_data = private_data;
+   pgsw->on_completion = on_completion;
+
+   dlist_init(&pgsw->available);
+   dlist_init(&pgsw->issued);
+
+   for (int i = 0; i < pgsw->iodepth; i++)
+   {
+       PgStreamingWriteItem *this_write = &pgsw->all_items[i];
+
+       this_write->on_completion.callback = pg_streaming_write_complete;
+       this_write->pgsw = pgsw;
+       dlist_push_tail(&pgsw->available, &this_write->node);
+   }
+
+   return pgsw;
+}
+
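+/*
+ * Return the AIO handle to use for the caller's next write. If no slot is
+ * free, wait for the oldest issued write to complete first; the loop is
+ * needed because its completion callback may itself issue further writes.
+ */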
+PgAioInProgress *
+pg_streaming_write_get_io(pg_streaming_write *pgsw)
+{
+   PgStreamingWriteItem *this_write;
+
+   /* loop in case the callback issues further writes */
+   while (dlist_is_empty(&pgsw->available))
+   {
+       Assert(!dlist_is_empty(&pgsw->issued));
+
+       this_write = dlist_head_element(PgStreamingWriteItem, node, &pgsw->issued);
+       Assert(this_write->in_progress);
+
+       pgaio_io_wait(this_write->aio);
+       Assert(!this_write->in_progress);
+   }
+
+   this_write = dlist_head_element(PgStreamingWriteItem, node, &pgsw->available);
+
+   Assert(!this_write->in_progress);
+
+   if (!this_write->aio)
+   {
+       this_write->aio = pgaio_io_get();
+   }
+
+   return this_write->aio;
+}
+
+uint32
+pg_streaming_write_inflight(pg_streaming_write *pgsw)
+{
+   return pgsw->inflight_count;
+}
+
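+/*
+ * Queue a write that has been prepared on the handle returned by the
+ * preceding pg_streaming_write_get_io() call. private_data is passed back
+ * to the on_completion callback once the write finishes.
+ */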
+void
+pg_streaming_write_write(pg_streaming_write *pgsw, PgAioInProgress *io, void *private_data)
+{
+   PgStreamingWriteItem *this_write;
+
+   Assert(!dlist_is_empty(&pgsw->available));
+
+   this_write = dlist_head_element(PgStreamingWriteItem, node, &pgsw->available);
+
+   Assert(!this_write->in_progress);
+   Assert(this_write->aio && this_write->aio == io);
+
+   dlist_delete_from(&pgsw->available, &this_write->node);
+
+   pgaio_io_on_completion_local(this_write->aio, &this_write->on_completion);
+
+   this_write->private_data = private_data;
+   this_write->in_progress = true;
+
+   dlist_push_tail(&pgsw->issued, &this_write->node);
+   pgsw->inflight_count++;
+
+   /*
+    * XXX: It'd make sense to trigger io submission more often.
+    */
+
+   ereport(DEBUG3, errmsg("submit %zu, %d deep", this_write - pgsw->all_items, pgsw->inflight_count),
+           errhidestmt(true),
+           errhidecontext(true));
+}
+
+static void
+pg_streaming_write_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
+{
+   PgStreamingWriteItem *this_write = pgaio_ocb_container(PgStreamingWriteItem, on_completion, ocb);
+   pg_streaming_write *pgsw = this_write->pgsw;
+   void *private_data;
+
+   Assert(this_write->in_progress);
+   Assert(pgaio_io_done(io));
+   Assert(pgaio_io_success(io));
+
+   dlist_delete_from(&pgsw->issued, &this_write->node);
+   pgsw->inflight_count--;
+
+   private_data = this_write->private_data;
+   this_write->private_data = NULL;
+   this_write->in_progress = false;
+   pgaio_io_recycle(this_write->aio);
+
+   dlist_push_tail(&pgsw->available, &this_write->node);
+
+   /* call callback after all other handling so it can issue IO */
+   pgsw->on_completion(pgsw->private_data, private_data);
+
+   ereport(DEBUG3, errmsg("complete %zu", this_write - pgsw->all_items),
+           errhidestmt(true),
+           errhidecontext(true));
+}
+
+void
+pg_streaming_write_wait_all(pg_streaming_write *pgsw)
+{
+   while (!dlist_is_empty(&pgsw->issued))
+   {
+       PgStreamingWriteItem *this_write =
+           dlist_head_element(PgStreamingWriteItem, node, &pgsw->issued);
+
+       if (!this_write->in_progress)
+           continue;
+       pgaio_io_wait(this_write->aio);
+       Assert(!this_write->in_progress);
+   }
+
+   Assert(pgsw->inflight_count == 0);
+}
+
+void
+pg_streaming_write_free(pg_streaming_write *pgsw)
+{
+   for (uint32 off = 0; off < pgsw->iodepth; off++)
+   {
+       PgStreamingWriteItem *this_write = &pgsw->all_items[off];
+
+       Assert(!this_write->in_progress);
+       if (this_write->aio)
+           pgaio_io_release(this_write->aio);
+       this_write->aio = NULL;
+   }
+
+   pfree(pgsw);
+}
+
+typedef struct PgStreamingReadItem
+{
+   /* membership in PgStreamingRead->issued, PgStreamingRead->available */
+   dlist_node node;
+   /* membership in PgStreamingRead->in_order */
+   dlist_node sequence_node;
+
+   PgAioOnCompletionLocalContext on_completion;
+   PgStreamingRead *pgsr;
+   PgAioInProgress *aio;
+
+   /* is this IO still considered to be in progress */
+   bool in_progress;
+   /* is this item currently valid / used */
+   bool valid;
+   uintptr_t read_private;
+} PgStreamingReadItem;
+
+struct PgStreamingRead
+{
+   uint32 iodepth_max;
+   uint32 distance_max;
+   uint32 all_items_count;
+
+   uintptr_t pgsr_private;
+   PgStreamingReadDetermineNextCB determine_next_cb;
+   PgStreamingReadRelease release_cb;
+
+   uint32 current_window;
+
+   /* number of requests issued */
+   uint64 prefetched_total_count;
+   /* number of requests submitted to kernel */
+   uint64 submitted_total_count;
+   /* number of current requests completed */
+   uint32 completed_count;
+   /* number of current requests in flight */
+   int32 inflight_count;
+   /* number of requests not yet submitted */
+   int32 pending_count;
+   /* number of requests that didn't require IO (debugging only) */
+   int32 no_io_count;
+
+   bool hit_end;
+
+   /* submitted reads */
+   dlist_head issued;
+
+   /* available reads (unused or completed) */
+   dlist_head available;
+
+   /*
+    * IOs, be they completed or in progress, in the order that the callback
+    * returned them.
+    */
+   dlist_head in_order;
+
+   PgStreamingReadItem all_items[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static void pg_streaming_read_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+static void pg_streaming_read_prefetch(PgStreamingRead *pgsr);
+
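+/*
+ * Set up a streaming read helper. A rough usage sketch, with caller-chosen
+ * callbacks and private data (the names here are just placeholders):
+ *
+ *     pgsr = pg_streaming_read_alloc(iodepth, (uintptr_t) my_state,
+ *                                    my_next_cb, my_release_cb);
+ *     while ((item = pg_streaming_read_get_next(pgsr)) != 0)
+ *         consume(item);
+ *     pg_streaming_read_free(pgsr);
+ *
+ * determine_next_cb starts the next IO (or reports that none was needed, or
+ * that the end of the stream was reached); release_cb is called for items
+ * that were prefetched but never consumed by the time the helper is freed.
+ */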
+PgStreamingRead *
+pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private,
+                       PgStreamingReadDetermineNextCB determine_next_cb,
+                       PgStreamingReadRelease release_cb)
+{
+   PgStreamingRead *pgsr;
+
+   iodepth = Max(Min(iodepth, NBuffers / 128), 1);
+
+   pgsr = palloc0(offsetof(PgStreamingRead, all_items) +
+                  sizeof(PgStreamingReadItem) * iodepth * 2);
+
+   pgsr->iodepth_max = iodepth;
+   pgsr->distance_max = iodepth;
+   pgsr->all_items_count = pgsr->iodepth_max + pgsr->distance_max;
+   pgsr->pgsr_private = pgsr_private;
+   pgsr->determine_next_cb = determine_next_cb;
+   pgsr->release_cb = release_cb;
+
+   pgsr->current_window = 0;
+
+   dlist_init(&pgsr->available);
+   dlist_init(&pgsr->in_order);
+   dlist_init(&pgsr->issued);
+
+   for (int i = 0; i < pgsr->all_items_count; i++)
+   {
+       PgStreamingReadItem *this_read = &pgsr->all_items[i];
+
+       this_read->on_completion.callback = pg_streaming_read_complete;
+       this_read->pgsr = pgsr;
+       dlist_push_tail(&pgsr->available, &this_read->node);
+   }
+
+   return pgsr;
+}
+
+void
+pg_streaming_read_free(PgStreamingRead *pgsr)
+{
+   /*
+    * Prevent further read-ahead from being queued in the completion
+    * callback. Otherwise we'd not necessarily wait for them in this loop,
+    * leaving their completion callbacks pointing to already-freed memory.
+    */
+   pgsr->hit_end = true;
+
+   for (int i = 0; i < pgsr->all_items_count; i++)
+   {
+       PgStreamingReadItem *this_read = &pgsr->all_items[i];
+
+       if (this_read->in_progress)
+       {
+           Assert(this_read->valid);
+           pgaio_io_wait(this_read->aio);
+           Assert(!this_read->in_progress);
+       }
+
+       if (this_read->valid)
+           pgsr->release_cb(pgsr->pgsr_private, this_read->read_private);
+
+       if (this_read->aio)
+       {
+           pgaio_io_release(this_read->aio);
+           this_read->aio = NULL;
+       }
+   }
+
+   pfree(pgsr);
+}
+
+static void
+pg_streaming_read_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
+{
+   PgStreamingReadItem *this_read = pgaio_ocb_container(PgStreamingReadItem, on_completion, ocb);
+   PgStreamingRead *pgsr = this_read->pgsr;
+
+#if 0
+   if ((pgsr->prefetched_total_count % 10000) == 0)
+       ereport(LOG, errmsg("pgsr read completed: qd %d completed: %d",
+                           pgsr->inflight_count, pgsr->completed_count),
+               errhidestmt(true),
+               errhidecontext(true));
+#endif
+
+   Assert(this_read->in_progress);
+   Assert(this_read->valid);
+   Assert(this_read->aio == io);
+   Assert(pgsr->inflight_count > 0);
+   Assert(pgaio_io_done(io));
+   Assert(pgaio_io_success(io));
+
+   dlist_delete_from(&pgsr->issued, &this_read->node);
+   pgsr->inflight_count--;
+   pgsr->completed_count++;
+   this_read->in_progress = false;
+   pgaio_io_recycle(this_read->aio);
+
+   pg_streaming_read_prefetch(pgsr);
+}
+
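+/*
+ * Ask determine_next_cb for one more read. Depending on its answer the item
+ * either stays in flight, is counted as completed without needing IO, or is
+ * given back because the end of the stream was reached.
+ */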
+static void
+pg_streaming_read_prefetch_one(PgStreamingRead *pgsr)
+{
+   PgStreamingReadItem *this_read;
+   PgStreamingReadNextStatus status;
+
+   Assert(!dlist_is_empty(&pgsr->available));
+
+   this_read = dlist_container(PgStreamingReadItem, node, dlist_pop_head_node(&pgsr->available));
+   Assert(!this_read->valid);
+   Assert(!this_read->in_progress);
+   Assert(this_read->read_private == 0);
+
+   if (this_read->aio == NULL)
+   {
+       this_read->aio = pgaio_io_get();
+   }
+
+   pgaio_io_on_completion_local(this_read->aio, &this_read->on_completion);
+   this_read->in_progress = true;
+   this_read->valid = true;
+   dlist_push_tail(&pgsr->issued, &this_read->node);
+   dlist_push_tail(&pgsr->in_order, &this_read->sequence_node);
+   pgsr->pending_count++;
+   pgsr->inflight_count++;
+   pgsr->prefetched_total_count++;
+
+   status = pgsr->determine_next_cb(pgsr->pgsr_private, this_read->aio, &this_read->read_private);
+
+   if (status == PGSR_NEXT_END)
+   {
+       pgsr->pending_count--;
+       pgsr->inflight_count--;
+       pgsr->prefetched_total_count--;
+       pgsr->hit_end = true;
+       this_read->read_private = 0;
+       this_read->valid = false;
+       this_read->in_progress = false;
+       pgaio_io_recycle(this_read->aio);
+       dlist_delete_from(&pgsr->in_order, &this_read->sequence_node);
+       dlist_push_tail(&pgsr->available, &this_read->node);
+   }
+   else if (status == PGSR_NEXT_NO_IO)
+   {
+       Assert(this_read->read_private != 0);
+       pgsr->pending_count--;
+       pgsr->inflight_count--;
+       pgsr->no_io_count++;
+       pgsr->completed_count++;
+       this_read->in_progress = false;
+       pgaio_io_recycle(this_read->aio);
+       dlist_delete_from(&pgsr->issued, &this_read->node);
+   }
+   else
+   {
+       Assert(this_read->read_private != 0);
+   }
+}
+
+static void
+pg_streaming_read_prefetch(PgStreamingRead *pgsr)
+{
+   uint32 min_issue;
+
+   if (pgsr->hit_end)
+       return;
+
+   Assert(pgsr->inflight_count <= pgsr->current_window);
+   Assert(pgsr->completed_count <= (pgsr->iodepth_max + pgsr->distance_max));
+
+   /*
+    * XXX: Some issues:
+    *
+    * - We should probably do the window calculation based on the number of
+    *   buffers the user actually requested, i.e. only recompute this
+    *   whenever pg_streaming_read_get_next() is called. Otherwise we will
+    *   always read the whole prefetch window.
+    *
+    * - The algorithm here is pretty stupid. Should take distance / iodepth
+    *   properly into account in a distinct way. Grow slower.
+    *
+    * - It'd be good to have usage dependent iodepth management. After an
+    *   initial increase, we should only increase further if the window
+    *   proves too small (i.e. we don't manage to keep 'completed' close
+    *   to full).
+    *
+    * - If most requests don't trigger IO, we should probably reduce the
+    *   prefetch window.
+    */
+   if (pgsr->current_window < pgsr->iodepth_max)
+   {
+       if (pgsr->current_window == 0)
+           pgsr->current_window = 4;
+       else
+           pgsr->current_window *= 2;
+
+       if (pgsr->current_window > pgsr->iodepth_max)
+           pgsr->current_window = pgsr->iodepth_max;
+
+       min_issue = 1;
+   }
+   else
+   {
+       min_issue = Min(pgsr->iodepth_max, pgsr->current_window / 4);
+   }
+
+   Assert(pgsr->inflight_count <= pgsr->current_window);
+   Assert(pgsr->completed_count <= (pgsr->iodepth_max + pgsr->distance_max));
+
+   if (pgsr->completed_count >= pgsr->current_window)
+       return;
+
+   if (pgsr->inflight_count >= pgsr->current_window)
+       return;
+
+   while (!pgsr->hit_end &&
+          (pgsr->inflight_count < pgsr->current_window) &&
+          (pgsr->completed_count < pgsr->current_window))
+   {
+       pg_streaming_read_prefetch_one(pgsr);
+
+       if (pgsr->pending_count >= min_issue)
+       {
+           pgsr->submitted_total_count += pgsr->pending_count;
+           pgsr->pending_count = 0;
+           pgaio_submit_pending(true);
+       }
+
+       CHECK_FOR_INTERRUPTS();
+   }
+
+   if (pgsr->pending_count >= min_issue)
+   {
+       pgsr->submitted_total_count += pgsr->pending_count;
+       pgsr->pending_count = 0;
+       pgaio_submit_pending(true);
+   }
+}
+
+uintptr_t
+pg_streaming_read_get_next(PgStreamingRead *pgsr)
+{
+   if (pgsr->prefetched_total_count == 0)
+   {
+       pg_streaming_read_prefetch(pgsr);
+       Assert(pgsr->hit_end || pgsr->prefetched_total_count > 0);
+   }
+
+   if (dlist_is_empty(&pgsr->in_order))
+   {
+       Assert(pgsr->hit_end);
+       return 0;
+   }
+   else
+   {
+       PgStreamingReadItem *this_read;
+       uintptr_t   ret;
+
+       Assert(pgsr->prefetched_total_count > 0);
+
+       this_read = dlist_container(PgStreamingReadItem, sequence_node,
+                                   dlist_pop_head_node(&pgsr->in_order));
+       Assert(this_read->valid);
+
+       if (this_read->in_progress)
+       {
+           pgaio_io_wait(this_read->aio);
+           /* callback should have updated */
+           Assert(!this_read->in_progress);
+           Assert(this_read->valid);
+       }
+
+       Assert(this_read->read_private != 0);
+       ret = this_read->read_private;
+       this_read->read_private = 0;
+       this_read->valid = false;
+
+       pgsr->completed_count--;
+       dlist_push_tail(&pgsr->available, &this_read->node);
+       pg_streaming_read_prefetch(pgsr);
+
+       return ret;
+   }
+}
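
To make the control flow easier to follow, here is a minimal, hypothetical sketch of a caller of this helper. All demo_* names are invented for illustration; a real determine-next callback would start an asynchronous read on the passed-in aio handle and return PGSR_NEXT_IO instead of faking a cache hit.

#include "postgres.h"

#include "storage/aio.h"
#include "storage/block.h"

/* Hypothetical state driving the demo callbacks. */
typedef struct DemoReadState
{
	BlockNumber next;
	BlockNumber last;
} DemoReadState;

static PgStreamingReadNextStatus
demo_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
{
	DemoReadState *state = (DemoReadState *) pgsr_private;

	if (state->next > state->last)
		return PGSR_NEXT_END;

	/*
	 * A real callback would start an asynchronous read on "aio" here and
	 * return PGSR_NEXT_IO. This sketch pretends the data is already cached
	 * and only hands back a non-zero token (block number + 1).
	 */
	*read_private = (uintptr_t) state->next++ + 1;
	return PGSR_NEXT_NO_IO;
}

static void
demo_release(uintptr_t pgsr_private, uintptr_t read_private)
{
	/* nothing to release for the token used in this sketch */
}

static void
demo_scan(BlockNumber nblocks)
{
	DemoReadState state = {.next = 0, .last = nblocks - 1};
	PgStreamingRead *pgsr;
	uintptr_t	token;

	pgsr = pg_streaming_read_alloc(32 /* iodepth */ , (uintptr_t) &state,
								   demo_next, demo_release);

	/* 0 signals that the determine-next callback returned PGSR_NEXT_END */
	while ((token = pg_streaming_read_get_next(pgsr)) != 0)
	{
		BlockNumber blkno = (BlockNumber) token - 1;

		(void) blkno;			/* consume the block here */
	}

	pg_streaming_read_free(pgsr);
}
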
index f9bbe97b5075059691dff92b36e7e119d15908a0..2df32b6bffbb2517ea06ad156f8f594416ee4024 100644 (file)
@@ -34,6 +34,7 @@
 #include "replication/slot.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
 #include "storage/ipc.h"
@@ -149,6 +150,7 @@ CreateSharedMemoryAndSemaphores(void)
        size = add_size(size, BTreeShmemSize());
        size = add_size(size, SyncScanShmemSize());
        size = add_size(size, AsyncShmemSize());
+       size = add_size(size, AioShmemSize());
 #ifdef EXEC_BACKEND
        size = add_size(size, ShmemBackendArraySize());
 #endif
@@ -268,6 +270,8 @@ CreateSharedMemoryAndSemaphores(void)
    SyncScanShmemInit();
    AsyncShmemInit();
 
+   AioShmemInit();
+
 #ifdef EXEC_BACKEND
 
    /*
index 7d6c85a8ac2aa250b8803b4dd2b21b5398b88455..7032f5d66f634777013129c488d4dcba56ed786f 100644 (file)
@@ -175,7 +175,11 @@ static const char *const BuiltinTrancheNames[] = {
    /* LWTRANCHE_PARALLEL_APPEND: */
    "ParallelAppend",
    /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
-   "PerXactPredicateList"
+   "PerXactPredicateList",
+   /* LWTRANCHE_AIO_CONTEXT_SUBMISSION: */
+   "AioContextSubmission",
+   /* LWTRANCHE_AIO_CONTEXT_COMPLETION: */
+   "AioContextCompletion"
 };
 
 StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
index 774292fd942774b87f11dbeebb76f6c5e64f02f2..9e53e0a8fbae63e395d21f455a2389eff1fb0dd6 100644 (file)
@@ -53,3 +53,5 @@ XactTruncationLock                    44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock               46
 NotifyQueueTailLock                    47
+SharedAIOCtlLock                   48
+WALSyncLock                        49
index db0cfaa360031dca712f4092765c0a68e6bc0f87..f026bc87d0c8630b86ed46bc27272e946613f3a3 100644 (file)
@@ -44,6 +44,7 @@
 #include "replication/slot.h"
 #include "replication/syncrep.h"
 #include "replication/walsender.h"
+#include "storage/aio.h"
 #include "storage/condition_variable.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
@@ -464,6 +465,8 @@ InitProcess(void)
     */
    InitLWLockAccess();
    InitDeadLockChecking();
+
+   pgaio_postmaster_child_init();
 }
 
 /*
@@ -602,6 +605,8 @@ InitAuxiliaryProcess(void)
    Assert(MyProc->lockGroupLeader == NULL);
    Assert(dlist_is_empty(&MyProc->lockGroupMembers));
 
+   pgaio_postmaster_child_init();
+
    /*
     * We might be reusing a semaphore that belonged to a failed process. So
     * be careful and reinitialize its value here.  (This is not strictly
index 93cdd74449845f0a3f7ef5d9404dd675c6b0cfc8..7c5711ba8eb896c37747789dab593f4c4f70dfd7 100644 (file)
@@ -18,6 +18,7 @@
 #include "access/itup.h"
 #include "access/xlog.h"
 #include "pgstat.h"
+#include "storage/aio.h"
 #include "storage/checksum.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
@@ -1390,22 +1391,30 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
  * returned page and not refer to it again.
  */
 char *
-PageSetChecksumCopy(Page page, BlockNumber blkno)
+PageSetChecksumCopy(Page page, BlockNumber blkno, PgAioBounceBuffer **bb)
 {
-   static char *pageCopy = NULL;
+   static char *pageCopySync = NULL;
+   char *pageCopy;
+
+   if (bb)
+       *bb = NULL;
 
    /* If we don't need a checksum, just return the passed-in data */
    if (PageIsNew(page) || !DataChecksumsEnabled())
        return (char *) page;
 
-   /*
-    * We allocate the copy space once and use it over on each subsequent
-    * call.  The point of palloc'ing here, rather than having a static char
-    * array, is first to ensure adequate alignment for the checksumming code
-    * and second to avoid wasting space in processes that never call this.
-    */
-   if (pageCopy == NULL)
-       pageCopy = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0);
+   if (bb)
+   {
+       *bb = pgaio_bounce_buffer_get();
+       pageCopy = pgaio_bounce_buffer_buffer(*bb);
+   }
+   else if (!pageCopySync)
+   {
+       pageCopySync = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0);
+       pageCopy = pageCopySync;
+   }
+   else
+       pageCopy = pageCopySync;
 
    memcpy(pageCopy, (char *) page, BLCKSZ);
    ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
index aa66590cf3f2ba349fac3e5ea6b783cf4cdc8bad..e197b78df301cf6852c8dbd2caf98595548c9fdb 100644 (file)
@@ -32,6 +32,7 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/md.h"
@@ -140,6 +141,20 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
                              MdfdVec *seg);
 
+static inline int
+_mdfd_open_flags(ForkNumber forkNum)
+{
+   int     flags = O_RDWR | PG_BINARY;
+
+   /*
+    * XXX: not clear if direct IO ever is interesting for other forks?  The
+    * FSM fork currently often ends up very fragmented when using direct IO,
+    * for example.
+    */
+   if (io_data_direct /* && forkNum == MAIN_FORKNUM */)
+       flags |= PG_O_DIRECT;
+
+   return flags;
+}
 
 /*
  * mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -201,14 +216,14 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
    path = relpath(reln->smgr_rnode, forkNum);
 
-   fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+   fd = PathNameOpenFile(path, _mdfd_open_flags(forkNum) | O_CREAT | O_EXCL);
 
    if (fd < 0)
    {
        int         save_errno = errno;
 
        if (isRedo)
-           fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+           fd = PathNameOpenFile(path, _mdfd_open_flags(forkNum));
        if (fd < 0)
        {
            /* be sure to report the error reported by create, not open */
@@ -466,6 +481,130 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 }
 
+static void
+zeroextend_complete(void *pgsw_private, void *write_private)
+{
+   BlockNumber *latest = (BlockNumber *) pgsw_private;
+
+   /* remember the block number of the last completed zero-extend write */
+   *latest = *(BlockNumber *) write_private;
+}
+
+BlockNumber
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+            BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+   MdfdVec    *v;
+   PgAioBounceBuffer *bb;
+   char       *zerobuf;
+   pg_streaming_write *pgsw;
+   BlockNumber latest;
+   BlockNumber curblocknum = blocknum;
+   int         remblocks = nblocks;
+
+   Assert(nblocks > 0);
+
+   pgsw = pg_streaming_write_alloc(Min(256, nblocks), &latest, zeroextend_complete);
+
+   bb = pgaio_bounce_buffer_get();
+   zerobuf = pgaio_bounce_buffer_buffer(bb);
+   memset(zerobuf, 0, BLCKSZ);
+
+   /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+   Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+   /*
+    * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+    * more --- we mustn't create a block whose number actually is
+    * InvalidBlockNumber.
+    */
+   // FIXME
+#if 0
+   if (blocknum == InvalidBlockNumber)
+       ereport(ERROR,
+               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                errmsg("cannot extend file \"%s\" beyond %u blocks",
+                       relpath(reln->smgr_rnode, forknum),
+                       InvalidBlockNumber)));
+#endif
+
+   while (remblocks > 0)
+   {
+       int fd;
+       int ret;
+       int segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+       int segendblock = (curblocknum % ((BlockNumber) RELSEG_SIZE)) + remblocks;
+       off_t       seekpos;
+
+       if (segendblock > RELSEG_SIZE)
+           segendblock = RELSEG_SIZE;
+
+       v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+       Assert(segstartblock < RELSEG_SIZE);
+       Assert(segendblock <= RELSEG_SIZE);
+
+       seekpos = (off_t) BLCKSZ * segstartblock;
+
+       fd = FileGetRawDesc(v->mdfd_vfd);
+       ret = posix_fallocate(fd,
+                             seekpos,
+                             (off_t) BLCKSZ * (segendblock - segstartblock));
+
+       if (ret != 0)
+       {
+           /* posix_fallocate() returns the error, it does not set errno */
+           errno = ret;
+           elog(ERROR, "fallocate failed: %m");
+       }
+
+       for (BlockNumber i = segstartblock; i < segendblock; i++)
+       {
+           PgAioInProgress *aio = pg_streaming_write_get_io(pgsw);
+           AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = i};
+
+           pgaio_assoc_bounce_buffer(aio, bb);
+           FileStartWrite(aio, v->mdfd_vfd, zerobuf, BLCKSZ, i * BLCKSZ, &tag, InvalidBuffer, false);
+
+           pg_streaming_write_write(pgsw, aio, (void*) &i);
+
+           /* FIXME: ensure errors are handled equivalently */
+#if 0
+           int         nbytes;
+
+           if ((nbytes = FileWrite(v->mdfd_vfd, zerobuf, BLCKSZ,
+                                   i * BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+           {
+               if (nbytes < 0)
+                   ereport(ERROR,
+                           (errcode_for_file_access(),
+                            errmsg("could not extend file \"%s\": %m",
+                                   FilePathName(v->mdfd_vfd)),
+                            errhint("Check free disk space.")));
+               /* short write: complain appropriately */
+               ereport(ERROR,
+                       (errcode(ERRCODE_DISK_FULL),
+                        errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+                               FilePathName(v->mdfd_vfd),
+                               nbytes, BLCKSZ, blocknum),
+                        errhint("Check free disk space.")));
+           }
+#endif
+       }
+
+       if (!skipFsync && !SmgrIsTemp(reln))
+           register_dirty_segment(reln, forknum, v);
+
+       Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+       remblocks -= segendblock - segstartblock;
+       curblocknum += segendblock - segstartblock;
+   }
+
+   pgaio_bounce_buffer_release(bb);
+
+   pg_streaming_write_wait_all(pgsw);
+   pg_streaming_write_free(pgsw);
+
+   return blocknum + (nblocks - 1);
+}
+
+
 /*
  * mdopenfork() -- Open one fork of the specified relation.
  *
@@ -489,7 +628,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
 
    path = relpath(reln->smgr_rnode, forknum);
 
-   fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+   fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
 
    if (fd < 0)
    {
@@ -585,6 +724,8 @@ void
 mdwriteback(SMgrRelation reln, ForkNumber forknum,
            BlockNumber blocknum, BlockNumber nblocks)
 {
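+   /* with direct IO there is no kernel writeback to trigger */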
+   if (io_data_direct)
+       return;
    /*
     * Issue flush requests in as few requests as possible; have to split at
     * segment boundaries though, since those are actually separate files.
@@ -690,6 +831,37 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    }
 }
 
+/*
+ * mdstartread() -- Asynchronously start a read of the specified block.
+ */
+void
+mdstartread(PgAioInProgress *io, SMgrRelation reln,
+           ForkNumber forknum, BlockNumber blocknum,
+           char *buffer, int bufno, int mode)
+{
+   off_t       seekpos;
+   MdfdVec    *v;
+   AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = blocknum};
+
+   AssertPointerAlignment(buffer, 4096);
+
+   v = _mdfd_getseg(reln, forknum, blocknum, false,
+                    EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+   seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+   Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+   if (!FileStartRead(io, v->mdfd_vfd, buffer, BLCKSZ, seekpos, &tag, bufno, mode))
+   {
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not start read of block %u in file \"%s\": %m",
+                       blocknum, FilePathName(v->mdfd_vfd))));
+   }
+
+}
+
 /*
  * mdwrite() -- Write the supplied block at the appropriate location.
  *
@@ -756,6 +928,53 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        register_dirty_segment(reln, forknum, v);
 }
 
+/*
+ * mdstartwrite() -- Asynchronously start a write of the supplied block at
+ *  the appropriate location.
+ */
+void
+mdstartwrite(PgAioInProgress *io, SMgrRelation reln,
+            ForkNumber forknum, BlockNumber blocknum,
+            char *buffer, int bufno, bool skipFsync, bool release_lock)
+{
+   off_t       seekpos;
+   MdfdVec    *v;
+   AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = blocknum};
+
+   /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+   Assert(blocknum < mdnblocks(reln, forknum));
+#endif
+
+   TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+                                        reln->smgr_rnode.node.spcNode,
+                                        reln->smgr_rnode.node.dbNode,
+                                        reln->smgr_rnode.node.relNode,
+                                        reln->smgr_rnode.backend);
+
+   v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+                    EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+   seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+   Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+   /*
+    * XXX: In the synchronous case this is after the write - should be fine,
+    * I think?
+    */
+   if (!skipFsync && !SmgrIsTemp(reln))
+       register_dirty_segment(reln, forknum, v);
+
+   if (!FileStartWrite(io, v->mdfd_vfd, buffer, BLCKSZ, seekpos, &tag, bufno, release_lock))
+   {
+       ereport(ERROR,
+               (errcode_for_file_access(),
+                errmsg("could not start write of block %u in file \"%s\": %m",
+                       blocknum, FilePathName(v->mdfd_vfd))));
+   }
+}
+
 /*
  * mdnblocks() -- Get the number of blocks stored in a relation.
  *
@@ -999,6 +1218,21 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
    }
 }
 
+int
+mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
+{
+   MdfdVec    *v;
+
+   /* make sure the fork is open, then look up the relevant segment */
+   (void) mdopenfork(reln, forknum, EXTENSION_FAIL);
+
+   v = _mdfd_getseg(reln, forknum, blocknum, false,
+                    EXTENSION_FAIL);
+
+   *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+   Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);
+
+   return FileGetRawDesc(v->mdfd_vfd);
+}
+
 /*
  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
  */
@@ -1155,7 +1389,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
    fullpath = _mdfd_segpath(reln, forknum, segno);
 
    /* open the file */
-   fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+   fd = PathNameOpenFile(fullpath, _mdfd_open_flags(forknum) | oflags);
 
    pfree(fullpath);
 
@@ -1359,7 +1593,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
        strlcpy(path, p, MAXPGPATH);
        pfree(p);
 
-       file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+       file = PathNameOpenFile(path, _mdfd_open_flags(ftag->forknum));
        if (file < 0)
            return -1;
        need_to_close = true;
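
mdzeroextend() above is the first user of the pg_streaming_write helper declared in storage/aio.h. As a reading aid, here is a condensed, hypothetical sketch of that issue/complete pattern; the demo_* names are invented and the comments only describe what a caller would plausibly do, not what aio_util.c guarantees.

#include "postgres.h"

#include "storage/aio.h"

/* Hypothetical per-write completion callback. */
static void
demo_write_complete(void *pgsw_private, void *write_private)
{
	int		   *ncompleted = (int *) pgsw_private;

	(*ncompleted)++;
}

static void
demo_bulk_write(int nwrites)
{
	int			ncompleted = 0;
	pg_streaming_write *pgsw;

	pgsw = pg_streaming_write_alloc(32 /* iodepth */ , &ncompleted,
									demo_write_complete);

	for (int i = 0; i < nwrites; i++)
	{
		/* obtain an IO handle from the helper */
		PgAioInProgress *aio = pg_streaming_write_get_io(pgsw);

		/*
		 * A real caller would describe the write on "aio" here, e.g. with
		 * FileStartWrite() or pgaio_io_start_write_generic(), before handing
		 * it over.
		 */
		pg_streaming_write_write(pgsw, aio, NULL);
	}

	/* wait for everything that was issued before tearing the helper down */
	pg_streaming_write_wait_all(pgsw);
	pg_streaming_write_free(pgsw);
}
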
index 0f31ff38221f61f9b20f3cbce9bbf6514b9846b9..1c8cc7362c434c60d2578c1910571007650253d0 100644 (file)
@@ -50,10 +50,20 @@ typedef struct f_smgr
                                bool isRedo);
    void        (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                                BlockNumber blocknum, char *buffer, bool skipFsync);
+   BlockNumber (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+                                   BlockNumber blocknum, int nblocks, bool skipFsync);
    bool        (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
                                  BlockNumber blocknum);
    void        (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
                              BlockNumber blocknum, char *buffer);
+   void        (*smgr_startread) (struct PgAioInProgress *io,
+                                  SMgrRelation reln, ForkNumber forknum,
+                                  BlockNumber blocknum, char *buffer,
+                                  int bufno, int mode);
+   void        (*smgr_startwrite) (struct PgAioInProgress *io,
+                                   SMgrRelation reln, ForkNumber forknum,
+                                   BlockNumber blocknum, char *buffer,
+                                   int bufno, bool skipFsync, bool release_lock);
    void        (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
                               BlockNumber blocknum, char *buffer, bool skipFsync);
    void        (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
@@ -62,6 +72,7 @@ typedef struct f_smgr
    void        (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
                                  BlockNumber nblocks);
    void        (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+   int         (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
 } f_smgr;
 
 static const f_smgr smgrsw[] = {
@@ -75,13 +86,17 @@ static const f_smgr smgrsw[] = {
        .smgr_exists = mdexists,
        .smgr_unlink = mdunlink,
        .smgr_extend = mdextend,
+       .smgr_zeroextend = mdzeroextend,
        .smgr_prefetch = mdprefetch,
        .smgr_read = mdread,
+       .smgr_startread = mdstartread,
        .smgr_write = mdwrite,
+       .smgr_startwrite = mdstartwrite,
        .smgr_writeback = mdwriteback,
        .smgr_nblocks = mdnblocks,
        .smgr_truncate = mdtruncate,
        .smgr_immedsync = mdimmedsync,
+       .smgr_fd = mdfd,
    }
 };
 
@@ -476,6 +491,14 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
        reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 }
 
+BlockNumber
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+              int nblocks, bool skipFsync)
+{
+   return smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+                                                   nblocks, skipFsync);
+}
+
 /*
  * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
  *
@@ -504,6 +527,13 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
 }
 
+void
+smgrstartread(struct PgAioInProgress *io, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+             char *buffer, int bufno, int mode)
+{
+   return smgrsw[reln->smgr_which].smgr_startread(io, reln, forknum, blocknum, buffer, bufno, mode);
+}
+
 /*
  * smgrwrite() -- Write the supplied buffer out.
  *
@@ -527,6 +557,16 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                        buffer, skipFsync);
 }
 
+void
+smgrstartwrite(struct PgAioInProgress *io,
+              SMgrRelation reln, ForkNumber forknum,
+              BlockNumber blocknum, char *buffer,
+              int bufno, bool skipFsync, bool release_lock)
+{
+   smgrsw[reln->smgr_which].smgr_startwrite(io, reln, forknum, blocknum,
+                                            buffer, bufno, skipFsync, release_lock);
+}
+
 
 /*
  * smgrwriteback() -- Trigger kernel writeback for the supplied range of
@@ -644,6 +684,12 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
    smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
 }
 
+int
+smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
+{
+   return smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
+}
+
 /*
  * AtEOXact_SMgr
  *
index fdb9eab73a1407008d1fa69668ecdb9679ba3269..26272dc01a0a5a37ef8b9602b13105bf309fa77d 100644 (file)
@@ -65,6 +65,7 @@
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "rewrite/rewriteHandler.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/pmsignal.h"
@@ -3956,6 +3957,9 @@ PostgresMain(int argc, char *argv[],
 
        /* Initialize MaxBackends (if under postmaster, was done already) */
        InitializeMaxBackends();
+
+       /* AIO is needed during InitPostgres() */
+       pgaio_postmaster_init();
    }
 
    /* Early initialization */
index 0f67b99cc55348132bf927c4494422937dcdc138..9cd7c685aaf5ed3fef8e6a00036183a1256e8e74 100644 (file)
@@ -39,6 +39,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
 #include "postmaster/postmaster.h"
+#include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/latch.h"
@@ -124,6 +125,8 @@ InitPostmasterChild(void)
    InitLatch(MyLatch);
    InitializeLatchWaitSet();
 
+   pgaio_postmaster_child_init_local();
+
    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too. Not all processes will have
index 6e55fd0e5deeac24bc7cbe62ffef9451b5414707..d0b6b2c9066a9a0283f9f4dc5b00b5f62be8f3c4 100644 (file)
@@ -2048,6 +2048,42 @@ static struct config_bool ConfigureNamesBool[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"io_data_direct", PGC_SUSET, RESOURCES_DISK,
+           gettext_noop("Data file IO uses direct IO."),
+       },
+       &io_data_direct,
+       false,
+       NULL, NULL, NULL
+   },
+
+   {
+       {"io_data_force_async", PGC_SUSET, RESOURCES_DISK,
+           gettext_noop("Force synchronous data file IO to be executed as asynchronous IO."),
+       },
+       &io_data_force_async,
+       true,                   /* helpful for development */
+       NULL, NULL, NULL
+   },
+
+   {
+       {"io_wal_direct", PGC_SIGHUP, RESOURCES_DISK,
+           gettext_noop("WAL file IO uses direct IO."),
+       },
+       &io_wal_direct,
+       false,
+       NULL, NULL, NULL
+   },
+
+   {
+       {"io_wal_init_direct", PGC_SIGHUP, RESOURCES_DISK,
+           gettext_noop("WAL file initialization IO uses direct IO."),
+       },
+       &io_wal_init_direct,
+       false,
+       NULL, NULL, NULL
+   },
+
    /* End-of-list marker */
    {
        {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
index 10f15f6a3579a4f4d9329eb158ca1f8923c1643e..3e60145e24c36c9512030fff56a5173ed1d0da3a 100644 (file)
@@ -23,6 +23,7 @@
 #include "common/cryptohash.h"
 #include "common/hashfn.h"
 #include "jit/jit.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/predicate.h"
@@ -130,6 +131,7 @@ typedef struct ResourceOwnerData
    ResourceArray dsmarr;       /* dynamic shmem segments */
    ResourceArray jitarr;       /* JIT contexts */
    ResourceArray cryptohasharr;    /* cryptohash contexts */
+   ResourceArray aiobbarr;     /* AIO BB references */
 
    /* We can remember up to MAX_RESOWNER_LOCKS references to local locks. */
    int         nlocks;         /* number of owned locks */
@@ -178,6 +180,7 @@ static void PrintSnapshotLeakWarning(Snapshot snapshot);
 static void PrintFileLeakWarning(File file);
 static void PrintDSMLeakWarning(dsm_segment *seg);
 static void PrintCryptoHashLeakWarning(Datum handle);
+static void PrintAioBBLeakWarning(Datum handle);
 
 
 /*****************************************************************************
@@ -568,6 +571,18 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
                PrintCryptoHashLeakWarning(foundres);
            pg_cryptohash_free(context);
        }
+
+       /* Ditto for AIO BBs */
+       /* XXX: reconsider phase? */
+       while (ResourceArrayGetAny(&(owner->aiobbarr), &foundres))
+       {
+           PgAioBounceBuffer *bb = (PgAioBounceBuffer *) DatumGetPointer(foundres);
+
+           if (isCommit)
+               PrintAioBBLeakWarning(foundres);
+
+           pgaio_bounce_buffer_release(bb);
+       }
    }
    else if (phase == RESOURCE_RELEASE_LOCKS)
    {
@@ -1432,3 +1447,30 @@ PrintCryptoHashLeakWarning(Datum handle)
    elog(WARNING, "cryptohash context reference leak: context %p still referenced",
         DatumGetPointer(handle));
 }
+
+void
+ResourceOwnerEnlargeAioBB(ResourceOwner owner)
+{
+   ResourceArrayEnlarge(&(owner->aiobbarr));
+}
+
+void
+ResourceOwnerRememberAioBB(ResourceOwner owner, PgAioBounceBuffer *bb)
+{
+   ResourceArrayAdd(&(owner->aiobbarr), PointerGetDatum(bb));
+}
+
+void
+ResourceOwnerForgetAioBB(ResourceOwner owner, PgAioBounceBuffer *bb)
+{
+   if (!ResourceArrayRemove(&(owner->aiobbarr), PointerGetDatum(bb)))
+       elog(ERROR, "aio BB %p is not owned by resource owner %s",
+            bb, owner->name);
+}
+
+static void
+PrintAioBBLeakWarning(Datum handle)
+{
+   elog(WARNING, "aio BB reference leak: bb %p still referenced",
+        DatumGetPointer(handle));
+}
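
The new hooks follow the usual resource-owner protocol: enlarge before acquiring, remember after acquiring, forget before an explicit release. A hypothetical sketch of how a bounce-buffer user would be expected to call them; whether aio.c does this bookkeeping internally in pgaio_bounce_buffer_get()/_release() is not visible in this hunk, so the demo_* wrappers below are illustrative only.

#include "postgres.h"

#include "storage/aio.h"
#include "utils/resowner_private.h"

/*
 * Acquire a bounce buffer and make the current resource owner track it, so
 * the reference is cleaned up (with a leak warning on commit) if the caller
 * errors out before releasing it.
 */
static PgAioBounceBuffer *
demo_get_tracked_bb(void)
{
	PgAioBounceBuffer *bb;

	/* reserve array space first, so remembering cannot fail afterwards */
	ResourceOwnerEnlargeAioBB(CurrentResourceOwner);

	bb = pgaio_bounce_buffer_get();
	ResourceOwnerRememberAioBB(CurrentResourceOwner, bb);

	return bb;
}

static void
demo_release_tracked_bb(PgAioBounceBuffer *bb)
{
	/* drop the resource-owner reference before releasing the buffer */
	ResourceOwnerForgetAioBB(CurrentResourceOwner, bb);
	pgaio_bounce_buffer_release(bb);
}
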
index 9585ad17b3cfb5e8f9723f04b39aa8097be0cd4c..769499a771d256b2c61c2e69b89068c0f0e27a06 100644 (file)
@@ -315,6 +315,10 @@ extern XLogRecPtr RequestXLogSwitch(bool mark_unimportant);
 
 extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli);
 
+struct PgAioInProgress;
+extern void XLogWriteComplete(struct PgAioInProgress *aio, uint32 write_no);
+extern void XLogFlushComplete(struct PgAioInProgress *aio, uint32 flush_no);
+
 /*
  * Exported for the functions in timeline.c and xlogarchive.c.  Only valid
  * in the startup process.
index d7b55f57ea77f636477fc3f51462e834a6950db7..bc42064bc0e02bfc553ad7b6cef52774ab36d6f1 100644 (file)
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '811', descr => 'AIO backend stats',
+  proname => 'pg_stat_get_aio_backends', procost => '1000', prorows => '20', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int4,int8,int8,int8,int8,int8,int4,int4,int4,int4,int4,int4,int4}', proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid, executed_total_count, issued_total_count, submissions_total_count, foreign_completed_total_count, retry_total_count, inflight_count, unused_count, outstanding_count, pending_count, local_completed_count, foreign_completed_count,last_context}', prosrc => 'pg_stat_get_aio_backends' },
+
+{ oid => '812', descr => 'AIO stats',
+  proname => 'pg_stat_get_aios', procost => '1000', prorows => '20', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int4,text,text,int4,int4,int8,int8,text}', proargmodes => '{o,o,o,o,o,o,o,o}',
+  proargnames => '{id,operation,flags,ring,owner_pid,generation,result,action_desc}', prosrc => 'pg_stat_get_aios' },
+
 ]
index 2fc21b4f89b88619b826839198f02e4382fd4dd1..f415e1988a346ee13f8a788fc1230d2227d49623 100644 (file)
@@ -1060,12 +1060,19 @@ typedef enum
    WAIT_EVENT_WAL_READ,
    WAIT_EVENT_WAL_SYNC,
    WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,
+   WAIT_EVENT_WAL_WAIT_FLUSH,
    WAIT_EVENT_WAL_WAIT_INSERT,
+   WAIT_EVENT_WAL_WAIT_WRITE,
    WAIT_EVENT_WAL_WRITE,
    WAIT_EVENT_LOGICAL_CHANGES_READ,
    WAIT_EVENT_LOGICAL_CHANGES_WRITE,
    WAIT_EVENT_LOGICAL_SUBXACT_READ,
-   WAIT_EVENT_LOGICAL_SUBXACT_WRITE
+   WAIT_EVENT_LOGICAL_SUBXACT_WRITE,
+   WAIT_EVENT_AIO_SUBMIT,
+   WAIT_EVENT_AIO_IO_COMPLETE_ANY,
+   WAIT_EVENT_AIO_IO_COMPLETE_ONE,
+   WAIT_EVENT_AIO_REFS,
+   WAIT_EVENT_AIO_BACKPRESSURE
 } WaitEventIO;
 
 /* ----------
diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h
new file mode 100644 (file)
index 0000000..ed8027f
--- /dev/null
@@ -0,0 +1,184 @@
+/*-------------------------------------------------------------------------
+ *
+ * aio.h
+ *   aio
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/aio.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef AIO_H
+#define AIO_H
+
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/relfilenode.h"
+#include "lib/ilist.h"
+
+#include "access/xlogdefs.h"
+
+typedef struct PgAioInProgress PgAioInProgress;
+typedef struct PgAioBounceBuffer PgAioBounceBuffer;
+
+typedef struct PgAioIoRef
+{
+   uint32 aio_index;
+   uint32 generation_upper;
+   uint32 generation_lower;
+} PgAioIoRef;
+
+/* initialization */
+extern void pgaio_postmaster_init(void);
+extern Size AioShmemSize(void);
+extern void AioShmemInit(void);
+extern void pgaio_postmaster_child_init_local(void);
+extern void pgaio_postmaster_child_init(void);
+
+extern void pgaio_at_abort(void);
+extern void pgaio_at_commit(void);
+
+/*
+ * XXX: Add flags to the initiation functions that govern:
+ * - whether PgAioInProgress will be released once the IO has successfully
+ *   finished, even if done by another backend (e.g. useful when prefetching,
+ *   without wanting to look at the concrete buffer, or when doing buffer
+ *   writeback). In particular this'd also cause the buffer pin to be released
+ *   upon completion.
+ * - whether a failed request needs to be seen by the issuing backend. That's
+ *   not needed e.g. for prefetches, but is for buffer writes by checkpointer.
+ * - whether pending requests might be issued if sensible, or whether that's
+ *   not allowed.
+ *
+ *
+ * FIXME: Add indicator about which IO channel needs to be used (important
+ * e.g. for buffer writes interleaved with WAL writes, for queue depth
+ * management of checkpointer, for readahead)
+ */
+
+extern PgAioInProgress *pgaio_io_get(void);
+extern void pgaio_io_release(PgAioInProgress *io);
+
+extern dlist_node* pgaio_io_node(PgAioInProgress *io);
+
+typedef struct PgAioOnCompletionLocalContext PgAioOnCompletionLocalContext;
+typedef void (*PgAioOnCompletionLocalCB)(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+struct PgAioOnCompletionLocalContext
+{
+   PgAioOnCompletionLocalCB callback;
+};
+#define pgaio_ocb_container(type, membername, ptr)                             \
+   (AssertVariableIsOfTypeMacro(ptr, PgAioOnCompletionLocalContext *),         \
+    AssertVariableIsOfTypeMacro(((type *) NULL)->membername, PgAioOnCompletionLocalContext),   \
+    ((type *) ((char *) (ptr) - offsetof(type, membername))))
+extern void pgaio_io_on_completion_local(PgAioInProgress *io, PgAioOnCompletionLocalContext *ocb);
+
+extern void pgaio_io_wait(PgAioInProgress *io);
+extern void pgaio_io_wait_ref(PgAioIoRef *ref, bool call_local);
+extern bool pgaio_io_check_ref(PgAioIoRef *ref);
+
+extern bool pgaio_io_success(PgAioInProgress *io);
+extern bool pgaio_io_done(PgAioInProgress *io);
+extern void pgaio_io_recycle(PgAioInProgress *io);
+extern void pgaio_io_ref(PgAioInProgress *io, PgAioIoRef *ref);
+
+static inline bool
+pgaio_io_ref_valid(const PgAioIoRef *ref)
+{
+   return ref->aio_index != PG_UINT32_MAX;
+}
+
+static inline void
+pgaio_io_ref_clear(PgAioIoRef *ref)
+{
+   ref->aio_index = PG_UINT32_MAX;
+}
+
+
+typedef struct AioBufferTag
+{
+   RelFileNodeBackend rnode;           /* physical relation identifier */
+   ForkNumber  forkNum;
+   BlockNumber blockNum;       /* blknum relative to begin of reln */
+} AioBufferTag;
+struct buftag;
+
+extern void pgaio_io_start_flush_range(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes);
+extern void pgaio_io_start_nop(PgAioInProgress *io);
+extern void pgaio_io_start_fsync(PgAioInProgress *io, int fd, bool barrier);
+extern void pgaio_io_start_fdatasync(PgAioInProgress *io, int fd, bool barrier);
+
+extern void pgaio_io_start_read_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes,
+                                      char *bufdata, int buffno, int mode);
+extern void pgaio_io_start_write_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes,
+                                       char *bufdata, int buffno, bool release_lock);
+extern void pgaio_io_start_write_wal(PgAioInProgress *io, int fd,
+                                    uint32 offset, uint32 nbytes,
+                                    char *bufdata, bool no_reorder,
+                                    uint32 write_no);
+extern void pgaio_io_start_write_generic(PgAioInProgress *io, int fd,
+                                        uint64 offset, uint32 nbytes,
+                                        char *bufdata, bool no_reorder);
+extern void pgaio_io_start_fsync_wal(PgAioInProgress *io, int fd, bool barrier,
+                                    bool datasync_only, uint32 sync_no);
+
+extern void pgaio_io_retry(PgAioInProgress *io);
+extern void pgaio_submit_pending(bool drain);
+
+extern void pgaio_print_queues(void);
+struct StringInfoData;
+extern void pgaio_io_print(PgAioInProgress *io, struct StringInfoData *s);
+extern void pgaio_io_ref_print(PgAioIoRef *ref, struct StringInfoData *s);
+struct dlist_head;
+extern void pgaio_print_list(struct dlist_head *head, struct StringInfoData *s, size_t offset);
+
+
+extern void pgaio_assoc_bounce_buffer(PgAioInProgress *io, PgAioBounceBuffer *bb);
+extern PgAioBounceBuffer *pgaio_bounce_buffer_get(void);
+extern void pgaio_bounce_buffer_release(PgAioBounceBuffer *bb);
+extern char *pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb);
+
+
+/*
+ * Helpers. In aio_util.c.
+ */
+
+/*
+ * Helper to efficiently perform bulk writes.
+ */
+typedef struct pg_streaming_write pg_streaming_write;
+typedef void (*pg_streaming_write_completed)(void *pgsw_private, void *write_private);
+
+extern pg_streaming_write *pg_streaming_write_alloc(uint32 iodepth, void *private,
+                                                   pg_streaming_write_completed on_completion);
+extern PgAioInProgress *pg_streaming_write_get_io(pg_streaming_write *pgsw);
+extern uint32 pg_streaming_write_inflight(pg_streaming_write *pgsw);
+extern void pg_streaming_write_write(pg_streaming_write *pgsw, PgAioInProgress *io, void *private);
+extern void pg_streaming_write_wait_all(pg_streaming_write *pgsw);
+extern void pg_streaming_write_free(pg_streaming_write *pgsw);
+
+/*
+ * Helper for efficient reads (using readahead).
+ */
+
+typedef struct PgStreamingRead PgStreamingRead;
+typedef enum PgStreamingReadNextStatus
+{
+   PGSR_NEXT_END,
+   PGSR_NEXT_NO_IO,
+   PGSR_NEXT_IO
+} PgStreamingReadNextStatus;
+
+typedef PgStreamingReadNextStatus (*PgStreamingReadDetermineNextCB)(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private);
+typedef void (*PgStreamingReadRelease)(uintptr_t pgsr_private, uintptr_t read_private);
+extern PgStreamingRead *pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private,
+                                               PgStreamingReadDetermineNextCB determine_next_cb,
+                                               PgStreamingReadRelease release_cb);
+extern void pg_streaming_read_free(PgStreamingRead *pgsr);
+extern uintptr_t pg_streaming_read_get_next(PgStreamingRead *pgsr);
+
+#endif                         /* AIO_H */
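
The local completion callback interface is intrusive: the caller embeds a PgAioOnCompletionLocalContext in its own struct and recovers that struct with pgaio_ocb_container() when the callback fires, as aio_util.c's streaming read does. A stripped-down, hypothetical sketch with invented demo_* names; the step of actually starting and submitting the IO is only indicated in a comment.

#include "postgres.h"

#include "storage/aio.h"

typedef struct DemoIoState
{
	PgAioOnCompletionLocalContext on_completion;	/* must be embedded */
	bool		done;
} DemoIoState;

static void
demo_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
{
	/* recover the containing struct from the embedded callback context */
	DemoIoState *state = pgaio_ocb_container(DemoIoState, on_completion, ocb);

	Assert(pgaio_io_done(io));
	state->done = true;
}

static void
demo_issue(DemoIoState *state)
{
	PgAioInProgress *io = pgaio_io_get();

	state->on_completion.callback = demo_complete;
	state->done = false;

	/* have demo_complete() run in this backend once the IO finishes */
	pgaio_io_on_completion_local(io, &state->on_completion);

	/*
	 * A real caller would now start an IO on "io" (e.g. via smgrstartread())
	 * and eventually call pgaio_submit_pending() and pgaio_io_wait().
	 */
}
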
index 532711200a6cccc0f87d72d902e85dae11c85516..e7bd7eec4d00b79d63da07dd8da1e25bf734be46 100644 (file)
@@ -17,6 +17,7 @@
 
 #include "port/atomics.h"
 #include "storage/buf.h"
+#include "storage/aio.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/latch.h"
@@ -186,6 +187,7 @@ typedef struct BufferDesc
    int         wait_backend_pid;   /* backend PID of pin-count waiter */
    int         freeNext;       /* link in freelist chain */
 
+   PgAioIoRef  io_in_progress;
    LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;
 
@@ -305,6 +307,10 @@ extern void WritebackContextInit(WritebackContext *context, int *max_pending);
 extern void IssuePendingWritebacks(WritebackContext *context);
 extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
 
+extern void ReadBufferCompleteRead(Buffer buffer, const AioBufferTag *tag, char *bufdata, int mode, bool failed);
+extern void ReadBufferCompleteWrite(Buffer buffer, bool failed, bool release_lock);
+
+
 /* freelist.c */
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
                                     uint32 *buf_state);
index 25008d38f905feb91942b9b4338adcfd196fbc53..2fd1b0c076b99e367b5125e6db42a6dbd73d73e9 100644 (file)
@@ -76,6 +76,11 @@ extern int   checkpoint_flush_after;
 extern int backend_flush_after;
 extern int bgwriter_flush_after;
 
+extern bool io_data_direct;
+extern bool io_data_force_async;
+extern bool io_wal_direct;
+extern bool io_wal_init_direct;
+
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
 
@@ -171,7 +176,8 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * prototypes for functions in bufmgr.c
  */
-extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln,
+extern PrefetchBufferResult PrefetchSharedBuffer(Relation reln,
+                                                struct SMgrRelationData *smgr_reln,
                                                 ForkNumber forkNum,
                                                 BlockNumber blockNum);
 extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum,
@@ -183,6 +189,13 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
                                        ForkNumber forkNum, BlockNumber blockNum,
                                        ReadBufferMode mode, BufferAccessStrategy strategy);
+struct PgAioInProgress;
+extern Buffer ReadBufferAsync(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+                             ReadBufferMode mode, BufferAccessStrategy strategy,
+                             bool *already_valid, struct PgAioInProgress** aio);
+extern Buffer BulkExtendBuffered(Relation relation, ForkNumber forkNum,
+                                int extendby,
+                                BufferAccessStrategy strategy);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
index 359b749f7f4086b2e6a00530027c55dcbf26300c..be23daccc31e301cf898277089811d8266574559 100644 (file)
@@ -450,7 +450,8 @@ extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
 extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offset);
 extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
                                    Item newtup, Size newsize);
-extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
+struct PgAioBounceBuffer;
+extern char *PageSetChecksumCopy(Page page, BlockNumber blkno, struct PgAioBounceBuffer **bb);
 extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
 
 #endif                         /* BUFPAGE_H */
index f2662a96fd7bbbd0d1f31ca7fb4aa64fdf68c908..ec6c186609f5931a483214a8d7c1f3625cf39e01 100644 (file)
@@ -81,8 +81,13 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
 extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
-extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern int FileRead(File file, char *buffer, int amount, off_t offset,
+                   uint32 wait_event_info);
+struct PgAioInProgress;
+struct AioBufferTag;
+extern bool FileStartRead(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const struct AioBufferTag *tag, int bufid, int mode);
 extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern bool FileStartWrite(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const struct AioBufferTag *tag, int bufid, bool release_lock);
 extern int FileSync(File file, uint32 wait_event_info);
 extern off_t FileSize(File file);
 extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
index 46209e02422fe750f8bf795bf5564f6bddf35ef3..5c1c1b2be5d851c6a90085609726635212f65580 100644 (file)
@@ -219,6 +219,8 @@ typedef enum BuiltinTrancheIds
    LWTRANCHE_SHARED_TIDBITMAP,
    LWTRANCHE_PARALLEL_APPEND,
    LWTRANCHE_PER_XACT_PREDICATE_LIST,
+   LWTRANCHE_AIO_CONTEXT_SUBMISSION,
+   LWTRANCHE_AIO_CONTEXT_COMPLETION,
    LWTRANCHE_FIRST_USER_DEFINED
 }          BuiltinTrancheIds;
 
index 752b440864d843f0d9156c156a148c98eabb7989..39e8d53824d59a8a1e42342d2c50d3f671012e71 100644 (file)
@@ -28,10 +28,18 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
 extern void mdextend(SMgrRelation reln, ForkNumber forknum,
                     BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+                               BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
                       BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                   char *buffer);
+extern void mdstartread(struct PgAioInProgress *,
+                       SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+                       char *buffer, int bufno, int mode);
+extern void mdstartwrite(struct PgAioInProgress *,
+                        SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+                        char *buffer, int bufno, bool skipFsync, bool release_lock);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
                    BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
@@ -40,6 +48,7 @@ extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
                       BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
 
 extern void ForgetDatabaseSyncRequests(Oid dbid);
 extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
index ebf4a199dcb4d15beaf7b0ed8c7a27cc99175adc..c98d53fb6f2472cad3a0742105d26f96cb601a5e 100644 (file)
@@ -90,10 +90,21 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                       BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+                                 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
                         BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
                     BlockNumber blocknum, char *buffer);
+struct PgAioInProgress;
+extern void smgrstartread(struct PgAioInProgress* io,
+                         SMgrRelation reln, ForkNumber forknum,
+                         BlockNumber blocknum, char *buffer,
+                         int bufno, int mode);
+extern void smgrstartwrite(struct PgAioInProgress* io,
+                          SMgrRelation reln, ForkNumber forknum,
+                          BlockNumber blocknum, char *buffer,
+                          int bufno, bool skipFsync, bool release_lock);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
                      BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
@@ -102,6 +113,7 @@ extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
                         int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern int smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
 extern void AtEOXact_SMgr(void);
 
 #endif                         /* SMGR_H */
index c480a1a24bec7d5652c44a7b2d562de4b2eeff41..0e71d0b537f7f1f27c1584363a741c209f8c715d 100644 (file)
@@ -102,4 +102,12 @@ extern void ResourceOwnerRememberCryptoHash(ResourceOwner owner,
 extern void ResourceOwnerForgetCryptoHash(ResourceOwner owner,
                                          Datum handle);
 
+/* support for AIO bounce buffer management */
+struct PgAioBounceBuffer;
+extern void ResourceOwnerEnlargeAioBB(ResourceOwner owner);
+extern void ResourceOwnerRememberAioBB(ResourceOwner owner,
+                                      struct PgAioBounceBuffer *bb);
+extern void ResourceOwnerForgetAioBB(ResourceOwner owner,
+                                    struct PgAioBounceBuffer *bb);
+
 #endif                         /* RESOWNER_PRIVATE_H */
index a687e99d1e4fe18d0c0cc4c71351b432b29080d3..1183bdac8976480c24e72c4bdd39531baf5f4595 100644 (file)
@@ -1765,6 +1765,29 @@ pg_stat_activity| SELECT s.datid,
    FROM ((pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, sslcompression, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid)
      LEFT JOIN pg_database d ON ((s.datid = d.oid)))
      LEFT JOIN pg_authid u ON ((s.usesysid = u.oid)));
+pg_stat_aio_backends| SELECT s.pid,
+    s.executed_total_count,
+    s.issued_total_count,
+    s.submissions_total_count,
+    s.foreign_completed_total_count,
+    s.retry_total_count,
+    s.inflight_count,
+    s.unused_count,
+    s.outstanding_count,
+    s.pending_count,
+    s.local_completed_count,
+    s.foreign_completed_count,
+    s.last_context
+   FROM pg_stat_get_aio_backends() s(pid, executed_total_count, issued_total_count, submissions_total_count, foreign_completed_total_count, retry_total_count, inflight_count, unused_count, outstanding_count, pending_count, local_completed_count, foreign_completed_count, last_context);
+pg_stat_aios| SELECT s.id,
+    s.operation,
+    s.flags,
+    s.ring,
+    s.owner_pid,
+    s.generation,
+    s.result,
+    s.action_desc
+   FROM pg_stat_get_aios() s(id, operation, flags, ring, owner_pid, generation, result, action_desc);
 pg_stat_all_indexes| SELECT c.oid AS relid,
     i.oid AS indexrelid,
     n.nspname AS schemaname,