diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml index 58d0d90fece0..2f906e9f018d 100644 --- a/doc/src/sgml/acronyms.sgml +++ b/doc/src/sgml/acronyms.sgml @@ -9,6 +9,15 @@ + + AIO + + + Asynchronous I/O + + + + ACL @@ -354,6 +363,15 @@ + + I/O + + + Input/Output + + + + ISO diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index c0f812e3f5e8..b88cac598e90 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -81,6 +81,31 @@ + + Asynchronous I/O + AIO + + Asynchronous I/O + + + + Asynchronous I/O (AIO) describes + performing I/O in a non-blocking way (asynchronously), + in contrast to synchronous I/O, which blocks for the + entire duration of the I/O. + + + With AIO, starting an I/O operation + is separated from waiting for the result of the operation, allowing + multiple I/O operations to be initiated concurrently, + as well as performing CPU-heavy operations + concurrently with I/O. The price for that increased + concurrency is increased complexity. + + + + + Atomic @@ -938,6 +963,20 @@ + + Input/Output + I/O + + + Input/Output (I/O) describes the communication between + a program and peripheral devices. In the context of database systems, + I/O commonly, but not exclusively, refers to + interaction with storage devices or the network. + + + + + Insert diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 3f5a306247e6..e9a59af8c34b 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -51,6 +51,11 @@ + + pg_aios + In-use asynchronous I/O handles + + pg_available_extensions available extensions @@ -231,6 +236,295 @@ + + <structname>pg_aios</structname> + + + pg_aios + + + + The pg_aios view lists all I/O handles that are currently in use. An I/O handle + is used to reference an I/O operation that is being prepared, executed, or + in the process of completing. pg_aios contains + one row for each I/O handle. + + + + This view is mainly useful for developers of + PostgreSQL, but may also be useful when tuning + PostgreSQL. + + + + <structname>pg_aios</structname> Columns + + + + + Column Type + + + Description + + + + + + + + pid int4 + + + Process ID of the server process that is issuing this I/O. + + + + + + io_id int4 + + + Identifier of the I/O handle. Handles are reused once the I/O + has completed (or if the handle is released before the I/O is started). On reuse, + + pg_aios.io_generation + + is incremented. + + + + + + io_generation int8 + + + Generation of the I/O handle. + + + + + + state text + + + State of the I/O handle: + + + + HANDED_OUT, referenced by code but not yet used + + + + + DEFINED, information necessary for execution is known + + + + + STAGED, ready for execution + + + + + SUBMITTED, submitted for execution + + + + + COMPLETED_IO, finished, but result has not yet been processed + + + + + COMPLETED_SHARED, shared completion processing completed + + + + + COMPLETED_LOCAL, backend local completion processing completed + + + + + + + + + operation text + + + Operation performed using the I/O handle: + + + + invalid, not yet known + + + + + readv, a vectored read + + + + + writev, a vectored write + + + + + + + + + off int8 + + + Offset of the I/O operation. + + + + + + length int8 + + + Length of the I/O operation. + + + + + + target text + + + What kind of object is the I/O targeting: + + + + smgr, I/O on relations + + + + + + + + + handle_data_len int2 + + + Length of the data associated with the I/O operation. 
For I/O to/from + shared buffers and temporary buffers, this indicates the number of buffers the + I/O is operating on. + + + + + + raw_result int4 + + + Low-level result of the I/O operation, or NULL if the operation has not + yet completed. + + + + + + result text + + + High-level result of the I/O operation: + + + + UNKNOWN means that the result of the + operation is not yet known. + + + + + OK means the I/O completed successfully. + + + + + PARTIAL means that the I/O completed without + error, but did not process all data. Commonly callers will need to + retry and perform the remainder of the work in a separate I/O. + + + + + WARNING means that the I/O completed without + error, but that execution of the I/O triggered a warning, e.g., when + encountering a corrupted buffer with zero_damaged_pages enabled. + + + + + ERROR means the I/O failed with an error. + + + + + + + + + target_desc text + + + Description of what the I/O operation is targeting. + + + + + + f_sync bool + + + Flag indicating whether the I/O is executed synchronously. + + + + + + f_localmem bool + + + Flag indicating whether the I/O references process local memory. + + + + + + f_buffered bool + + + Flag indicating whether the I/O is buffered I/O. + + + + + +
+ + + The pg_aios view is read-only. + + + + By default, the pg_aios view can be read only by + superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_available_extensions</structname> diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0d..e554504e1f0d 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -361,8 +361,15 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) BaseInit(); bootstrap_signals(); + + /* need a resowner for IO during BootStrapXLOG() */ + CreateAuxProcessResourceOwner(); + BootStrapXLOG(bootstrap_data_checksum_version); + ReleaseAuxProcessResources(true); + CurrentResourceOwner = NULL; + /* * To ensure that src/common/link-canary.c is linked into the backend, we * must call it from somewhere. Here is as good as anywhere. diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 31d269b7ee0c..64a7240aa772 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1391,3 +1391,10 @@ CREATE VIEW pg_stat_subscription_stats AS CREATE VIEW pg_wait_events AS SELECT * FROM pg_get_wait_events(); + +CREATE VIEW pg_aios AS + SELECT * FROM pg_get_aios(); +REVOKE ALL ON pg_aios FROM PUBLIC; +GRANT SELECT ON pg_aios TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_aios() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_aios() TO pg_read_all_stats; diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..a700b02d5a18 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -620,7 +620,7 @@ CreateAnonymousSegment(Size *size) allocsize += hugepagesize - (allocsize % hugepagesize); ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | mmap_flags, -1, 0); + PG_MMAP_FLAGS | MAP_POPULATE | mmap_flags, -1, 0); mmap_errno = errno; if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 72f5acceec78..6e8801a39e32 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -38,11 +38,13 @@ #include "postmaster/auxprocess.h" #include "postmaster/bgwriter.h" #include "postmaster/interrupt.h" +#include "storage/aio.h" #include "storage/aio_subsys.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/lwlock.h" #include "storage/proc.h" #include "storage/procsignal.h" @@ -90,6 +92,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; + IOQueue *ioq; WritebackContext wb_context; Assert(startup_data_len == 0); @@ -131,6 +134,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(bgwriter_context); + ioq = io_queue_create(128, 0); WritebackContextInit(&wb_context, &bgwriter_flush_after); /* @@ -228,12 +232,22 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) /* Clear any already-pending wakeups */ ResetLatch(MyLatch); + /* + * FIXME: this is theoretically racy, but I didn't want to copy + * ProcessMainLoopInterrupts() remaining body here. + */ + if (ShutdownRequestPending) + { + io_queue_wait_all(ioq); + io_queue_free(ioq); + } + ProcessMainLoopInterrupts(); /* * Do one cycle of dirty-buffer writing. 
*/ - can_hibernate = BgBufferSync(&wb_context); + can_hibernate = BgBufferSync(ioq, &wb_context); /* Report pending statistics to the cumulative stats system */ pgstat_report_bgwriter(); @@ -250,6 +264,9 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) smgrdestroyall(); } + /* finish IO before sleeping, to avoid blocking other backends */ + io_queue_wait_all(ioq); + /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index fda91ffd1ce2..904fe167eb4d 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -49,10 +49,12 @@ #include "postmaster/bgwriter.h" #include "postmaster/interrupt.h" #include "replication/syncrep.h" +#include "storage/aio.h" #include "storage/aio_subsys.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pmsignal.h" @@ -766,7 +768,7 @@ ImmediateCheckpointRequested(void) * fraction between 0.0 meaning none, and 1.0 meaning all done. */ void -CheckpointWriteDelay(int flags, double progress) +CheckpointWriteDelay(IOQueue *ioq, int flags, double progress) { static int absorb_counter = WRITES_PER_ABSORB; @@ -800,6 +802,13 @@ CheckpointWriteDelay(int flags, double progress) /* Report interim statistics to the cumulative stats system */ pgstat_report_checkpointer(); + /* + * Ensure all pending IO is submitted to avoid unnecessary delays for + * other processes. + */ + io_queue_wait_all(ioq); + + /* * This sleep used to be connected to bgwriter_delay, typically 200ms. * That resulted in more frequent wakeups if not much work to do. diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index a8d2e024d344..72c32f5c32e8 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -386,6 +386,12 @@ CreateInitDecodingContext(const char *plugin, slot->data.plugin = plugin_name; SpinLockRelease(&slot->mutex); + if (CurrentResourceOwner == NULL) + { + Assert(am_walsender); + CurrentResourceOwner = AuxProcessResourceOwner; + } + if (XLogRecPtrIsInvalid(restart_lsn)) ReplicationSlotReserveWal(); else diff --git a/src/backend/storage/aio/Makefile b/src/backend/storage/aio/Makefile index c06c50771e02..86fa4276fda9 100644 --- a/src/backend/storage/aio/Makefile +++ b/src/backend/storage/aio/Makefile @@ -11,9 +11,11 @@ include $(top_builddir)/src/Makefile.global OBJS = \ aio.o \ aio_callback.o \ + aio_funcs.o \ aio_init.o \ aio_io.o \ aio_target.o \ + io_queue.o \ method_io_uring.o \ method_sync.o \ method_worker.o \ diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md new file mode 100644 index 000000000000..534aaad22bc3 --- /dev/null +++ b/src/backend/storage/aio/README.md @@ -0,0 +1,451 @@ +# Asynchronous & Direct IO + +## Motivation + +### Why Asynchronous IO + +Until the introduction of asynchronous IO Postgres relied on the operating +system to hide the cost of synchronous IO from Postgres. While this worked +surprisingly well in a lot of workloads, it does not do as good a job on +prefetching and controlled writeback as we would like. + +There are important expensive operations like `fdatasync()` where the operating +system cannot hide the storage latency. 
This is particularly important for WAL +writes, where the ability to asynchronously issue `fdatasync()` or O_DSYNC +writes can yield significantly higher throughput. + + +### Why Direct / unbuffered IO + +The main reasons to want to use Direct IO are: + +- Lower CPU usage / higher throughput. Particularly on modern storage buffered + writes are bottlenecked by the operating system having to copy data from the + kernel's page cache to postgres' buffer pool using the CPU, whereas direct IO + can often move the data directly between the storage devices and postgres' + buffer cache, using DMA. While that transfer is ongoing, the CPU is free to + perform other work. +- Reduced latency - Direct IO can have substantially lower latency than + buffered IO, which can be impactful for OLTP workloads bottlenecked by WAL + write latency. +- Avoiding double buffering between operating system cache and postgres' + shared_buffers. +- Better control over the timing and pace of dirty data writeback. + + +The main reasons *not* to use Direct IO are: + +- Without AIO, Direct IO is unusably slow for most purposes. +- Even with AIO, many parts of postgres need to be modified to perform + explicit prefetching. +- In situations where shared_buffers cannot be set appropriately large, + e.g. because there are many different postgres instances hosted on shared + hardware, performance will often be worse than when using buffered IO. + + +## AIO Usage Example + +In many cases code that can benefit from AIO does not directly have to +interact with the AIO interface, but can use AIO via higher-level +abstractions. See [Helpers](#helpers). + +In this example, a buffer will be read into shared buffers. + +```C +/* + * Result of the operation, only to be accessed in this backend. + */ +PgAioReturn ioret; + +/* + * Acquire an AIO Handle, ioret will get result upon completion. + * + * Note that ioret needs to stay alive until the IO completes or + * CurrentResourceOwner is released (i.e. an error is thrown). + */ +PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret); + +/* + * Reference that can be used to wait for the IO we initiate below. This + * reference can reside in local or shared memory and be waited upon by any + * process. An arbitrary number of references can be made for each IO. + */ +PgAioWaitRef iow; + +pgaio_io_get_wref(ioh, &iow); + +/* + * Arrange for shared buffer completion callbacks to be called upon completion + * of the IO. This callback will update the buffer descriptors associated with + * the AioHandle, which e.g. allows other backends to access the buffer. + * + * A callback can be passed a small bit of data, e.g. to indicate whether to + * zero a buffer if it is invalid. + * + * Multiple completion callbacks can be registered for each handle. + */ +pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); + +/* + * The completion callback needs to know which buffers to update when the IO + * completes. As the AIO subsystem does not know about buffers, we have to + * associate this information with the AioHandle, for use by the completion + * callback registered above. + * + * In this example we're reading only a single buffer, hence the 1. + */ +pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); + +/* + * Pass the AIO handle to lower-level function. When operating on the level of + * buffers, we don't know how exactly the IO is performed, that is the + * responsibility of the storage manager implementation. + * + * E.g. 
md.c needs to translate block numbers into offsets in segments. + * + * Once the IO handle has been handed off to smgrstartreadv(), it may not + * be used further, as the IO may immediately get executed below + * smgrstartreadv() and the handle reused for another IO. + * + * To issue multiple IOs in an efficient way, a caller can call + * pgaio_enter_batchmode() before starting multiple IOs, and end that batch + * with pgaio_exit_batchmode(). Note that one needs to be careful while there + * may be unsubmitted IOs, as another backend may need to wait for one of the + * unsubmitted IOs. If this backend then had to wait for the other backend, + * it'd end in an undetected deadlock. See pgaio_enter_batchmode() for more + * details. + * + * Note that even while in batchmode an IO might get submitted immediately, + * e.g. due to reaching a limit on the number of unsubmitted IOs, and even + * complete before smgrstartreadv() returns. + */ +smgrstartreadv(ioh, operation->smgr, forknum, blkno, + BufferGetBlock(buffer), 1); + +/* + * To benefit from AIO, it is advantageous to perform other work, including + * submitting other IOs, before waiting for the IO to complete. Otherwise + * we could just have used synchronous, blocking IO. + */ +perform_other_work(); + +/* + * We did some other work and now need the IO operation to have completed to + * continue. + */ +pgaio_wref_wait(&iow); + +/* + * At this point the IO has completed. We do not yet know whether it succeeded + * or failed, however. The buffer's state has been updated, which allows other + * backends to use the buffer (if the IO succeeded), or retry the IO (if it + * failed). + * + * Note that in case the IO has failed, a LOG message may have been emitted, + * but no ERROR has been raised. This is crucial, as another backend waiting + * for this IO should not see an ERROR. + * + * To check whether the operation succeeded, and to raise an ERROR, or if more + * appropriate LOG, the PgAioReturn we passed to pgaio_io_acquire() is used. + */ +if (ioret.result.status == PGAIO_RS_ERROR) + pgaio_result_report(ioret.result, &ioret.target_data, ERROR); + +/* + * Besides having succeeded completely, the IO could also have a) partially + * completed or b) succeeded with a warning (e.g. due to zero_damaged_pages). + * If we e.g. tried to read many blocks at once, the read might have + * only succeeded for the first few blocks. + * + * If the IO partially succeeded and this backend needs all blocks to have + * completed, this backend needs to reissue the IO for the remaining buffers. + * The AIO subsystem cannot handle this retry transparently. + * + * As this example is already long, and we only read a single block, we'll just + * error out if there's a partial read or a warning. + */ +if (ioret.result.status != PGAIO_RS_OK) + pgaio_result_report(ioret.result, &ioret.target_data, ERROR); + +/* + * The IO succeeded, so we can use the buffer now. + */ +``` + + +## Design Criteria & Motivation + +### Deadlock and Starvation Dangers due to AIO + +Using AIO in a naive way can easily lead to deadlocks in an environment where +the source/target of AIO are shared resources, like pages in postgres' +shared_buffers. + +Consider one backend performing readahead on a table, initiating IO for a +number of buffers ahead of the current "scan position". If that backend then +performs some operation that blocks, or even just is slow, the IO completion +for the asynchronously initiated read may not be processed. 
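To make the hazard concrete, here is a schematic sketch. It is not code from this patch: the `pgaio_*` / `smgrstartreadv()` calls are the interfaces described in this README, while `reln`, `blkno`, `bufs`, `some_contended_lock` and `iow_for_block` are hypothetical.

```C
/*
 * Backend A: start reads for several blocks ahead of the scan position,
 * then block on an unrelated lock before those reads complete.
 */
for (int i = 0; i < 8; i++)
{
    PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, NULL);

    pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);
    smgrstartreadv(ioh, reln, MAIN_FORKNUM, blkno + i,
                   BufferGetBlock(bufs[i]), 1);
}
LWLockAcquire(some_contended_lock, LW_EXCLUSIVE);	/* A may sleep here */

/*
 * Backend B: waits on one of the buffers whose read A initiated. If
 * completions could only ever be processed by A, B would sleep until A
 * woke up - and if A's lock were held by B, neither would ever wake up.
 */
pgaio_wref_wait(&iow_for_block);
```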
+ +This AIO implementation solves this problem by requiring that AIO methods +either allow AIO completions to be processed by any backend in the system +(e.g. io_uring), or to guarantee that AIO processing will happen even when the +issuing backend is blocked (e.g. worker mode, which offloads completion +processing to the AIO workers). + + +### IO can be started in critical sections + +Using AIO for WAL writes can reduce the overhead of WAL logging substantially: + +- AIO allows starting WAL writes eagerly, so they complete before needing to + wait +- AIO allows multiple WAL flushes to be in progress at the same time +- AIO makes it more realistic to use O\_DIRECT + O\_DSYNC, which can reduce + the number of roundtrips to storage on some OSs and storage HW (buffered IO + and direct IO without O_DSYNC needs to issue a write and after the write's + completion a cache flush, whereas O\_DIRECT + O\_DSYNC can use a single + Force Unit Access (FUA) write). + +The need to be able to execute IO in critical sections has substantial design +implications for the AIO subsystem. Mainly because completing IOs (see prior +section) needs to be possible within a critical section, even if the +to-be-completed IO itself was not issued in a critical section. Consider +e.g. the case of a backend first starting a number of writes from shared +buffers and then starting to flush the WAL. Because only a limited amount of +IO can be in-progress at the same time, initiating IO for flushing the WAL may +require first completing IO that was started earlier. + + +### State for AIO needs to live in shared memory + +Because postgres uses a process model and because AIOs need to be +complete-able by any backend, much of the state of the AIO subsystem needs to +live in shared memory. + +In an `EXEC_BACKEND` build, a backend's executable code and other process +local state is not necessarily mapped to the same addresses in each process +due to ASLR. This means that the shared memory cannot contain pointers to +callbacks. + + +## Design of the AIO Subsystem + + +### AIO Methods + +To achieve portability and performance, multiple methods of performing AIO are +implemented and others are likely worth adding in the future. + + +#### Synchronous Mode + +`io_method=sync` does not actually perform AIO but allows the AIO API to be used +while performing synchronous IO. This can be useful for debugging. The code +for the synchronous mode is also used as a fallback; e.g., the [worker +mode](#worker) uses it to execute IO that cannot be executed by workers. + + +#### Worker + +`io_method=worker` is available on every platform postgres runs on, and +implements asynchronous IO - from the view of the issuing process - by +dispatching the IO to one of several worker processes performing the IO in a +synchronous manner. + + +#### io_uring + +`io_method=io_uring` is available on Linux 5.1+. In contrast to worker mode it +dispatches all IO from within the process, lowering context switch rate / +latency. + + +### AIO Handles + +The central API piece of postgres' AIO abstraction is the AIO handle. To +execute an IO one first has to acquire an IO handle (`pgaio_io_acquire()`) and +then "define" it, i.e. associate an IO operation with the handle. + +Often AIO handles are acquired on a higher level and then passed to a lower +level to be fully defined. E.g., for IO to/from shared buffers, bufmgr.c +routines acquire the handle, which is then passed through smgr.c, md.c to be +finally fully defined in fd.c. 
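In code, that layering looks roughly like the following sketch (condensed to the handle flow; `ioret`, `pages` and `nblocks` are placeholders):

```C
/* bufmgr.c - knows about buffers: acquires the handle ... */
PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);
pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);

/* ... and passes it down, without knowing how the IO will be performed */
smgrstartreadv(ioh, operation->smgr, forknum, blkno, pages, nblocks);
/*  -> smgr.c: dispatches to the storage manager implementation       */
/*  -> md.c:   maps (fork, block) to a segment file and an offset     */
/*  -> fd.c:   fully defines the IO, e.g. via pgaio_io_start_readv()  */
```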
+ +The functions used at the lowest level to define the operation are +`pgaio_io_start_*()`. + +Because acquisition of an IO handle +[must always succeed](#io-can-be-started-in-critical-sections) +and the number of AIO Handles +[has to be limited](#state-for-aio-needs-to-live-in-shared-memory), +AIO handles can be reused as soon as they have completed. Obviously code needs +to be able to react to IO completion. State can be updated using +[AIO Completion callbacks](#aio-callbacks) +and the issuing backend can provide a backend local variable to receive the +result of the IO, as described in +[AIO Result](#aio-results). +An IO can be waited for, by both the issuing and any other backend, using +[AIO References](#aio-wait-references). + + +Because an AIO Handle is not executable just after calling +`pgaio_io_acquire()` and because `pgaio_io_acquire()` needs to always succeed +(absent a PANIC), only a single AIO Handle may be acquired (i.e. returned by +`pgaio_io_acquire()`) without causing the IO to have been defined (by, +potentially indirectly, causing `pgaio_io_start_*()` to have been +called). Otherwise a backend could trivially self-deadlock by using up all AIO +Handles without the ability to wait for some of the IOs to complete. + +If it turns out that an AIO Handle is not needed, e.g., because the handle was +acquired before holding a contended lock, it can be released without being +defined using `pgaio_io_release()`. + + +### AIO Callbacks + +Commonly several layers need to react to completion of an IO. E.g. for a read +md.c needs to check if the IO outright failed or was shorter than needed, +bufmgr.c needs to verify the page looks valid and bufmgr.c needs to update the +BufferDesc to update the buffer's state. + +The fact that several layers / subsystems need to react to IO completion poses +a few challenges: + +- Upper layers should not need to know details of lower layers. E.g. bufmgr.c + should not assume the IO will pass through md.c. Therefore upper levels + cannot know what lower layers would consider an error. + +- Lower layers should not need to know about upper layers. E.g. smgr APIs are + used going through shared buffers but are also used bypassing shared + buffers. This means that e.g. md.c is not in a position to validate + checksums. + +- Having code in the AIO subsystem for every possible combination of layers + would lead to a lot of duplication. + +The "solution" to this is the ability to associate multiple completion +callbacks with a handle. E.g. bufmgr.c can have a callback to update the +BufferDesc state and to verify the page and md.c can have another callback to +check if the IO operation was successful. + +As [mentioned](#state-for-aio-needs-to-live-in-shared-memory), shared memory +currently cannot contain function pointers. Because of that, completion +callbacks are not directly identified by function pointers but by IDs +(`PgAioHandleCallbackID`). A substantial added benefit is that it +allows callbacks to be identified by a much smaller amount of memory (a single +byte currently). + +In addition to completion, AIO callbacks also are called to "stage" an +IO. This is, e.g., used to increase buffer reference counts to account for the +AIO subsystem referencing the buffer, which is required to handle the case +where the issuing backend errors out and releases its own pins while the IO is +still ongoing. + +As [explained earlier](#io-can-be-started-in-critical-sections) IO completions +need to be safe to execute in critical sections. 
To allow the backend that +issued the IO to error out in case of failure [AIO Result](#aio-results) can +be used. + + +### AIO Targets + +In addition to the completion callbacks describe above, each AIO Handle has +exactly one "target". Each target has some space inside an AIO Handle with +information specific to the target and can provide callbacks to allow to +reopen the underlying file (required for worker mode) and to describe the IO +operation (used for debug logging and error messages). + +I.e., if two different uses of AIO can describe the identity of the file being +operated on the same way, it likely makes sense to use the same +target. E.g. different smgr implementations can describe IO with +RelFileLocator, ForkNumber and BlockNumber and can thus share a target. In +contrast, IO for a WAL file would be described with TimeLineID and XLogRecPtr +and it would not make sense to use the same target for smgr and WAL. + + +### AIO Wait References + +As [described above](#aio-handles), AIO Handles can be reused immediately +after completion and therefore cannot be used to wait for completion of the +IO. Waiting is enabled using AIO wait references, which do not just identify +an AIO Handle but also include the handles "generation". + +A reference to an AIO Handle can be acquired using `pgaio_io_get_wref()` and +then waited upon using `pgaio_wref_wait()`. + + +### AIO Results + +As AIO completion callbacks +[are executed in critical sections](#io-can-be-started-in-critical-sections) +and [may be executed by any backend](#deadlock-and-starvation-dangers-due-to-aio) +completion callbacks cannot be used to, e.g., make the query that triggered an +IO ERROR out. + +To allow to react to failing IOs the issuing backend can pass a pointer to a +`PgAioReturn` in backend local memory. Before an AIO Handle is reused the +`PgAioReturn` is filled with information about the IO. This includes +information about whether the IO was successful (as a value of +`PgAioResultStatus`) and enough information to raise an error in case of a +failure (via `pgaio_result_report()`, with the error details encoded in +`PgAioResult`). + + +### AIO Errors + +It would be very convenient to have shared completion callbacks encode the +details of errors as an `ErrorData` that could be raised at a later +time. Unfortunately doing so would require allocating memory. While elog.c can +guarantee (well, kinda) that logging a message will not run out of memory, +that only works because a very limited number of messages are in the process +of being logged. With AIO a large number of concurrently issued AIOs might +fail. + +To avoid the need for preallocating a potentially large amount of memory (in +shared memory no less!), completion callbacks instead have to encode errors in +a more compact format that can be converted into an error message. + + +### AIO Bounce Buffers + +For some uses of AIO there is no convenient memory location as the source / +destination of an AIO. E.g. when data checksums are enabled, writes from +shared buffers currently cannot be done directly from shared buffers, as a +shared buffer lock still allows some modification, e.g., for hint bits (see +`FlushBuffer()`). If the write were done in-place, such modifications can +cause the checksum to fail. + +For synchronous IO this is solved by copying the buffer to separate memory +before computing the checksum and using that copy as the source buffer for the +AIO. 
However, for AIO that is not a workable solution: +- Instead of a single buffer, many buffers are required, as many IOs might be + in flight +- When using the [worker method](#worker), the source/target of IO needs to be + in shared memory, otherwise the workers won't be able to access the memory. + +The AIO subsystem addresses this by providing a limited number of bounce +buffers that can be used as the source / target for IO. A bounce buffer can be +acquired with `pgaio_bounce_buffer_get()` and multiple bounce buffers can be +associated with an AIO Handle with `pgaio_io_assoc_bounce_buffer()`. + +Bounce buffers are automatically released when the IO completes. + + +## Helpers + +Using the low-level AIO API directly all over the tree would introduce too +much complexity. Most uses of AIO should be done via reusable, higher-level +helpers. + + +### Read Stream + +A common and very beneficial use of AIO is reads where a substantial number +of to-be-read locations are known ahead of time. E.g., for a sequential scan +the set of blocks that need to be read can be determined solely by knowing the +current position and checking the buffer mapping table. + +The [Read Stream](../../../include/storage/read_stream.h) interface makes it +comparatively easy to use AIO for such use cases. diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index e3ed087e8a2b..cff48964d076 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -24,6 +24,8 @@ * * - read_stream.c - helper for reading buffered relation data * + * - README.md - higher-level overview over AIO + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -60,6 +62,8 @@ static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation static const char *pgaio_io_state_get_name(PgAioHandleState s); static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation); +static void pgaio_bounce_buffer_wait_for_free(void); + /* Options for io_method. 
*/ const struct config_enum_entry io_method_options[] = { @@ -74,6 +78,7 @@ const struct config_enum_entry io_method_options[] = { /* GUCs */ int io_method = DEFAULT_IO_METHOD; int io_max_concurrency = -1; +int io_bounce_buffers = -1; /* global control for AIO */ PgAioCtl *pgaio_ctl; @@ -660,6 +665,21 @@ pgaio_io_reclaim(PgAioHandle *ioh) if (ioh->state != PGAIO_HS_HANDED_OUT) dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node); + /* reclaim all associated bounce buffers */ + if (!slist_is_empty(&ioh->bounce_buffers)) + { + slist_mutable_iter it; + + slist_foreach_modify(it, &ioh->bounce_buffers) + { + PgAioBounceBuffer *bb = slist_container(PgAioBounceBuffer, node, it.cur); + + slist_delete_current(&it); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + } + } + if (ioh->resowner) { ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); @@ -1044,6 +1064,166 @@ pgaio_submit_staged(void) +/* -------------------------------------------------------------------------------- + * Functions primarily related to PgAioBounceBuffer + * -------------------------------------------------------------------------------- + */ + +PgAioBounceBuffer * +pgaio_bounce_buffer_get(void) +{ + PgAioBounceBuffer *bb = NULL; + slist_node *node; + + if (pgaio_my_backend->handed_out_bb != NULL) + elog(ERROR, "can only hand out one BB"); + + /* + * XXX: It probably is not a good idea to have bounce buffers be per + * backend, that's a fair bit of memory. + */ + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + pgaio_bounce_buffer_wait_for_free(); + } + + node = slist_pop_head_node(&pgaio_my_backend->idle_bbs); + bb = slist_container(PgAioBounceBuffer, node, node); + + pgaio_my_backend->handed_out_bb = bb; + + bb->resowner = CurrentResourceOwner; + ResourceOwnerRememberAioBounceBuffer(bb->resowner, &bb->resowner_node); + + return bb; +} + +void +pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only assign handed out BB"); + pgaio_my_backend->handed_out_bb = NULL; + + /* + * There can be many bounce buffers assigned in case of vectorized IOs. 
+ */ + slist_push_head(&ioh->bounce_buffers, &bb->node); + + /* once associated with an IO, the IO has ownership */ + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +uint32 +pgaio_bounce_buffer_id(PgAioBounceBuffer *bb) +{ + return bb - pgaio_ctl->bounce_buffers; +} + +void +pgaio_bounce_buffer_release(PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only release handed out BB"); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + pgaio_my_backend->handed_out_bb = NULL; + + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +void +pgaio_bounce_buffer_release_resowner(dlist_node *bb_node, bool on_error) +{ + PgAioBounceBuffer *bb = dlist_container(PgAioBounceBuffer, resowner_node, bb_node); + + Assert(bb->resowner); + + if (!on_error) + elog(WARNING, "leaked AIO bounce buffer"); + + pgaio_bounce_buffer_release(bb); +} + +char * +pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb) +{ + return bb->buffer; +} + +static void +pgaio_bounce_buffer_wait_for_free(void) +{ + static uint32 lastpos = 0; + + if (pgaio_my_backend->num_staged_ios > 0) + { + pgaio_debug(DEBUG2, "submitting %d, while acquiring free bb", + pgaio_my_backend->num_staged_ios); + pgaio_submit_staged(); + } + + for (uint32 i = lastpos; i < lastpos + io_max_concurrency; i++) + { + uint32 thisoff = pgaio_my_backend->io_handle_off + (i % io_max_concurrency); + PgAioHandle *ioh = &pgaio_ctl->io_handles[thisoff]; + + switch (ioh->state) + { + case PGAIO_HS_IDLE: + case PGAIO_HS_HANDED_OUT: + continue; + case PGAIO_HS_DEFINED: /* should have been submitted above */ + case PGAIO_HS_STAGED: + elog(ERROR, "shouldn't get here with io:%d in state %d", + pgaio_io_get_id(ioh), ioh->state); + break; + case PGAIO_HS_COMPLETED_IO: + case PGAIO_HS_SUBMITTED: + if (!slist_is_empty(&ioh->bounce_buffers)) + { + pgaio_debug_io(DEBUG2, ioh, + "waiting for IO to reclaim BB with %d in flight", + dclist_count(&pgaio_my_backend->in_flight_ios)); + + /* see comment in pgaio_io_wait_for_free() about raciness */ + pgaio_io_wait(ioh, ioh->generation); + + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + elog(WARNING, "empty after wait"); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + } + break; + case PGAIO_HS_COMPLETED_SHARED: + case PGAIO_HS_COMPLETED_LOCAL: + /* reclaim */ + pgaio_io_reclaim(ioh); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + break; + } + } + + /* + * The submission above could have caused the IO to complete at any time. 
+ */ + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + elog(PANIC, "no more bbs"); +} + + + /* -------------------------------------------------------------------------------- * Other * -------------------------------------------------------------------------------- diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index bf42778a48c7..21665e7eccb3 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -41,10 +41,13 @@ static const PgAioHandleCallbacksEntry aio_handle_cbs[] = { CALLBACK_ENTRY(PGAIO_HCB_INVALID, aio_invalid_cb), CALLBACK_ENTRY(PGAIO_HCB_MD_READV, aio_md_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_MD_WRITEV, aio_md_writev_cb), CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_READV, aio_shared_buffer_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_WRITEV, aio_shared_buffer_writev_cb), CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_READV, aio_local_buffer_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_WRITEV, aio_local_buffer_writev_cb), #undef CALLBACK_ENTRY }; diff --git a/src/backend/storage/aio/aio_funcs.c b/src/backend/storage/aio/aio_funcs.c new file mode 100644 index 000000000000..584e683371a3 --- /dev/null +++ b/src/backend/storage/aio/aio_funcs.c @@ -0,0 +1,230 @@ +/*------------------------------------------------------------------------- + * + * aio_funcs.c + * AIO - SQL interface for AIO + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/aio/aio_funcs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "funcapi.h" +#include "nodes/execnodes.h" +#include "port/atomics.h" +#include "storage/aio_internal.h" +#include "storage/lock.h" +#include "storage/proc.h" +#include "storage/procnumber.h" +#include "utils/builtins.h" +#include "utils/fmgrprotos.h" +#include "utils/tuplestore.h" + + +/* + * Byte length of an iovec. + */ +static size_t +iov_byte_length(const struct iovec *iov, int cnt) +{ + size_t len = 0; + + for (int i = 0; i < cnt; i++) + { + len += iov[i].iov_len; + } + + return len; +} + +Datum +pg_get_aios(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + +#define PG_GET_AIOS_COLS 15 + + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *live_ioh = &pgaio_ctl->io_handles[i]; + uint32 ioh_id = pgaio_io_get_id(live_ioh); + Datum values[PG_GET_AIOS_COLS] = {0}; + bool nulls[PG_GET_AIOS_COLS] = {0}; + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandleState start_state; + uint64 start_generation; + PgAioHandle ioh_copy; + struct iovec iov_copy[PG_IOV_MAX]; + + + /* + * There is no lock that could prevent the state of the IO to advance + * concurrently - and we don't want to introduce one, as that would + * introduce atomics into a very common path. Instead we + * + * 1) Determine the state + generation of the IO. + * + * 2) Copy the IO to local memory. + * + * 3) Check if state or generation of the IO changed. If the state + * changed, retry, if the generation changed don't display the IO. + */ + + /* 1) from above */ + start_generation = live_ioh->generation; + + /* + * Retry at this point, so we can accept changing states, but not + * changing generations. 
+ */ +retry: + pg_read_barrier(); + start_state = live_ioh->state; + + if (start_state == PGAIO_HS_IDLE) + continue; + + /* 2) from above */ + memcpy(&ioh_copy, live_ioh, sizeof(PgAioHandle)); + + /* + * Safe to copy even if no iovec is used - we always reserve the + * required space. + */ + memcpy(&iov_copy, &pgaio_ctl->iovecs[ioh_copy.iovec_off], + PG_IOV_MAX * sizeof(struct iovec)); + + /* + * Copy information about owner before 3) below, if the process exited + * it'd have to wait for the IO to finish first, which we would detect + * in 3). + */ + owner = ioh_copy.owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + /* 3) from above */ + pg_read_barrier(); + + /* + * The IO completed and a new one was started with the same ID. Don't + * display it - it really started after this function was called. + * There be a risk of a livelock if we just retried endlessly, if IOs + * complete very quickly. + */ + if (live_ioh->generation != start_generation) + continue; + + /* + * The IO's state changed while we were "rendering" it. Just start + * from scratch. There's no risk of a livelock here, as an IO has a + * limited sets of states it can be in, and state changes go only in a + * single direction. + */ + if (live_ioh->state != start_state) + goto retry; + + /* + * Now that we have copied the IO into local memory and checked that + * it's still in the same state, we are not allowed to access "live" + * memory anymore. To make it slightly easier to catch such cases, set + * the "live" pointers to NULL. + */ + live_ioh = NULL; + owner_proc = NULL; + + + /* column: owning pid */ + if (owner_pid != 0) + values[0] = Int32GetDatum(owner_pid); + else + nulls[0] = false; + + /* column: IO's id */ + values[1] = ioh_id; + + /* column: IO's generation */ + values[2] = Int64GetDatum(start_generation); + + /* column: IO's state */ + values[3] = CStringGetTextDatum(pgaio_io_get_state_name(&ioh_copy)); + + /* + * If the IO is in PGAIO_HS_HANDED_OUT state, none of the following + * fields are valid yet (or are in the process of being set). + * Therefore we don't want to display any other columns. + */ + if (start_state == PGAIO_HS_HANDED_OUT) + { + memset(nulls + 4, 1, (lengthof(nulls) - 4) * sizeof(bool)); + goto display; + } + + /* column: IO's operation */ + values[4] = CStringGetTextDatum(pgaio_io_get_op_name(&ioh_copy)); + + /* columns: details about the IO's operation (offset, length) */ + switch (ioh_copy.op) + { + case PGAIO_OP_INVALID: + nulls[5] = true; + nulls[6] = true; + break; + case PGAIO_OP_READV: + values[5] = Int64GetDatum(ioh_copy.op_data.read.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.read.iov_length)); + break; + case PGAIO_OP_WRITEV: + values[5] = Int64GetDatum(ioh_copy.op_data.write.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.write.iov_length)); + break; + } + + /* column: IO's target */ + values[7] = CStringGetTextDatum(pgaio_io_get_target_name(&ioh_copy)); + + /* column: length of IO's data array */ + values[8] = Int16GetDatum(ioh_copy.handle_data_len); + + /* column: raw result (i.e. 
some form of syscall return value) */ + if (start_state == PGAIO_HS_COMPLETED_IO + || start_state == PGAIO_HS_COMPLETED_SHARED + || start_state == PGAIO_HS_COMPLETED_LOCAL) + values[9] = Int32GetDatum(ioh_copy.result); + else + nulls[9] = true; + + /* + * column: result in the higher level representation (unknown if not + * finished) + */ + values[10] = + CStringGetTextDatum(pgaio_result_status_string(ioh_copy.distilled_result.status)); + + /* column: target description */ + values[11] = CStringGetTextDatum(pgaio_io_get_target_description(&ioh_copy)); + + /* columns: one for each flag */ + values[12] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_SYNCHRONOUS); + values[13] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_REFERENCES_LOCAL); + values[14] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_BUFFERED); + +display: + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index 885c3940c662..95b10933fedb 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -88,6 +88,32 @@ AioHandleDataShmemSize(void) io_max_concurrency)); } +static Size +AioBounceBufferDescShmemSize(void) +{ + Size sz; + + /* PgAioBounceBuffer itself */ + sz = mul_size(sizeof(PgAioBounceBuffer), + mul_size(AioProcs(), io_bounce_buffers)); + + return sz; +} + +static Size +AioBounceBufferDataShmemSize(void) +{ + Size sz; + + /* and the associated buffer */ + sz = mul_size(BLCKSZ, + mul_size(io_bounce_buffers, AioProcs())); + /* memory for alignment */ + sz += BLCKSZ; + + return sz; +} + /* * Choose a suitable value for io_max_concurrency. * @@ -113,6 +139,33 @@ AioChooseMaxConcurrency(void) return Min(max_proportional_pins, 64); } +/* + * Choose a suitable value for io_bounce_buffers. + * + * It's very unlikely to be useful to allocate more bounce buffers for each + * backend than the backend is allowed to pin. Additionally, bounce buffers + * currently are used for writes, and it seems very uncommon for more than 10% + * of shared_buffers to be written out concurrently. + * + * XXX: This quickly can take up significant amounts of memory, the logic + * should probably be fine-tuned. + */ +static int +AioChooseBounceBuffers(void) +{ + uint32 max_backends; + int max_proportional_pins; + + /* Similar logic to LimitAdditionalPins() */ + max_backends = MaxBackends + NUM_AUXILIARY_PROCS; + max_proportional_pins = (NBuffers / 10) / max_backends; + + max_proportional_pins = Max(max_proportional_pins, 1); + + /* apply upper limit */ + return Min(max_proportional_pins, 256); +} + Size AioShmemSize(void) { @@ -136,11 +189,31 @@ AioShmemSize(void) PGC_S_OVERRIDE); } + + /* + * If io_bounce_buffers is -1, we automatically choose a suitable value. + * + * See also comment above. + */ + if (io_bounce_buffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", AioChooseBounceBuffers()); + SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + if (io_bounce_buffers == -1) /* failed to apply it? */ + SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + sz = add_size(sz, AioCtlShmemSize()); sz = add_size(sz, AioBackendShmemSize()); sz = add_size(sz, AioHandleShmemSize()); sz = add_size(sz, AioHandleIOVShmemSize()); sz = add_size(sz, AioHandleDataShmemSize()); + sz = add_size(sz, AioBounceBufferDescShmemSize()); + sz = add_size(sz, AioBounceBufferDataShmemSize()); /* Reserve space for method specific resources. 
*/ if (pgaio_method_ops->shmem_size) @@ -156,6 +229,9 @@ AioShmemInit(void) uint32 io_handle_off = 0; uint32 iovec_off = 0; uint32 per_backend_iovecs = io_max_concurrency * io_max_combine_limit; + uint32 bounce_buffers_off = 0; + uint32 per_backend_bb = io_bounce_buffers; + char *bounce_buffers_data; pgaio_ctl = (PgAioCtl *) ShmemInitStruct("AioCtl", AioCtlShmemSize(), &found); @@ -167,6 +243,7 @@ AioShmemInit(void) pgaio_ctl->io_handle_count = AioProcs() * io_max_concurrency; pgaio_ctl->iovec_count = AioProcs() * per_backend_iovecs; + pgaio_ctl->bounce_buffers_count = AioProcs() * per_backend_bb; pgaio_ctl->backend_state = (PgAioBackend *) ShmemInitStruct("AioBackend", AioBackendShmemSize(), &found); @@ -179,6 +256,40 @@ AioShmemInit(void) pgaio_ctl->handle_data = (uint64 *) ShmemInitStruct("AioHandleData", AioHandleDataShmemSize(), &found); + pgaio_ctl->bounce_buffers = (PgAioBounceBuffer *) + ShmemInitStruct("AioBounceBufferDesc", AioBounceBufferDescShmemSize(), + &found); + + bounce_buffers_data = + ShmemInitStruct("AioBounceBufferData", AioBounceBufferDataShmemSize(), + &found); + bounce_buffers_data = + (char *) TYPEALIGN(BLCKSZ, (uintptr_t) bounce_buffers_data); + pgaio_ctl->bounce_buffers_data = bounce_buffers_data; + + + /* Initialize IO handles. */ + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *ioh = &pgaio_ctl->io_handles[i]; + + ioh->op = PGAIO_OP_INVALID; + ioh->target = PGAIO_TID_INVALID; + ioh->state = PGAIO_HS_IDLE; + + slist_init(&ioh->bounce_buffers); + } + + /* Initialize Bounce Buffers. */ + for (uint64 i = 0; i < pgaio_ctl->bounce_buffers_count; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[i]; + + bb->buffer = bounce_buffers_data; + bounce_buffers_data += BLCKSZ; + } + + for (int procno = 0; procno < AioProcs(); procno++) { PgAioBackend *bs = &pgaio_ctl->backend_state[procno]; @@ -186,9 +297,13 @@ AioShmemInit(void) bs->io_handle_off = io_handle_off; io_handle_off += io_max_concurrency; + bs->bounce_buffers_off = bounce_buffers_off; + bounce_buffers_off += per_backend_bb; + dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); dclist_init(&bs->in_flight_ios); + slist_init(&bs->idle_bbs); /* initialize per-backend IOs */ for (int i = 0; i < io_max_concurrency; i++) @@ -210,6 +325,14 @@ AioShmemInit(void) dclist_push_tail(&bs->idle_ios, &ioh->node); iovec_off += io_max_combine_limit; } + + /* initialize per-backend bounce buffers */ + for (int i = 0; i < per_backend_bb; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[bs->bounce_buffers_off + i]; + + slist_push_head(&bs->idle_bbs, &bb->node); + } } out: diff --git a/src/backend/storage/aio/io_queue.c b/src/backend/storage/aio/io_queue.c new file mode 100644 index 000000000000..526aa1d5e06d --- /dev/null +++ b/src/backend/storage/aio/io_queue.c @@ -0,0 +1,204 @@ +/*------------------------------------------------------------------------- + * + * io_queue.c + * AIO - Mechanism for tracking many IOs + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/aio/io_queue.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/ilist.h" +#include "storage/aio.h" +#include "storage/io_queue.h" +#include "utils/resowner.h" + + + +typedef struct TrackedIO +{ + PgAioWaitRef iow; + dlist_node node; +} TrackedIO; + 
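/*
 * Example usage of an IOQueue, as an illustrative sketch only:
 * define_write() stands in for whatever defines the IO (e.g. an smgr
 * write); everything else is the io_queue API below.
 *
 *     IOQueue    *ioq = io_queue_create(128, 0);
 *
 *     while (have_more_work)
 *     {
 *         PgAioHandle *ioh = io_queue_acquire_io(ioq);
 *         PgAioWaitRef iow;
 *
 *         pgaio_io_get_wref(ioh, &iow);
 *         define_write(ioh, ...);
 *         io_queue_track(ioq, &iow);
 *     }
 *
 *     io_queue_wait_all(ioq);
 *     io_queue_free(ioq);
 *
 * io_queue_track() submits staged IO in small batches; io_queue_wait_all()
 * submits the remainder and waits for all tracked IOs to complete.
 */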
+struct IOQueue +{ + int depth; + int unsubmitted; + + bool has_reserved; + + dclist_head idle; + dclist_head in_progress; + + TrackedIO tracked_ios[FLEXIBLE_ARRAY_MEMBER]; +}; + + +IOQueue * +io_queue_create(int depth, int flags) +{ + size_t sz; + IOQueue *ioq; + + sz = offsetof(IOQueue, tracked_ios) + + sizeof(TrackedIO) * depth; + + ioq = palloc0(sz); + + ioq->depth = 0; + + for (int i = 0; i < depth; i++) + { + TrackedIO *tio = &ioq->tracked_ios[i]; + + pgaio_wref_clear(&tio->iow); + dclist_push_tail(&ioq->idle, &tio->node); + } + + return ioq; +} + +void +io_queue_wait_one(IOQueue *ioq) +{ + /* submit all pending IO before waiting */ + pgaio_submit_staged(); + + while (!dclist_is_empty(&ioq->in_progress)) + { + /* FIXME: Should we really pop here already? */ + dlist_node *node = dclist_pop_head_node(&ioq->in_progress); + TrackedIO *tio = dclist_container(TrackedIO, node, node); + + pgaio_wref_wait(&tio->iow); + dclist_push_head(&ioq->idle, &tio->node); + } +} + +void +io_queue_reserve(IOQueue *ioq) +{ + if (ioq->has_reserved) + return; + + if (dclist_is_empty(&ioq->idle)) + io_queue_wait_one(ioq); + + Assert(!dclist_is_empty(&ioq->idle)); + + ioq->has_reserved = true; +} + +PgAioHandle * +io_queue_acquire_io(IOQueue *ioq) +{ + PgAioHandle *ioh; + + io_queue_reserve(ioq); + + Assert(!dclist_is_empty(&ioq->idle)); + + if (!io_queue_is_empty(ioq)) + { + ioh = pgaio_io_acquire_nb(CurrentResourceOwner, NULL); + if (ioh == NULL) + { + /* + * Need to wait for all IOs, blocking might not be legal in the + * context. + * + * XXX: This doesn't make a whole lot of sense, we're also + * blocking here. What was I smoking when I wrote the above? + */ + io_queue_wait_all(ioq); + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + } + else + { + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + + return ioh; +} + +void +io_queue_track(IOQueue *ioq, const struct PgAioWaitRef *iow) +{ + dlist_node *node; + TrackedIO *tio; + + Assert(ioq->has_reserved); + ioq->has_reserved = false; + + Assert(!dclist_is_empty(&ioq->idle)); + + node = dclist_pop_head_node(&ioq->idle); + tio = dclist_container(TrackedIO, node, node); + + tio->iow = *iow; + + dclist_push_tail(&ioq->in_progress, &tio->node); + + ioq->unsubmitted++; + + /* + * XXX: Should have some smarter logic here. We don't want to wait too + * long to submit, that'll mean we're more likely to block. But we also + * don't want to have the overhead of submitting every IO individually. 
+ */ + if (ioq->unsubmitted >= 4) + { + pgaio_submit_staged(); + ioq->unsubmitted = 0; + } +} + +void +io_queue_wait_all(IOQueue *ioq) +{ + /* submit all pending IO before waiting */ + pgaio_submit_staged(); + + while (!dclist_is_empty(&ioq->in_progress)) + { + /* wait for the last IO to minimize unnecessary wakeups */ + dlist_node *node = dclist_tail_node(&ioq->in_progress); + TrackedIO *tio = dclist_container(TrackedIO, node, node); + + if (!pgaio_wref_check_done(&tio->iow)) + { + ereport(DEBUG3, + errmsg("io_queue_wait_all for io:%d", + pgaio_wref_get_id(&tio->iow)), + errhidestmt(true), + errhidecontext(true)); + + pgaio_wref_wait(&tio->iow); + } + + dclist_delete_from(&ioq->in_progress, &tio->node); + dclist_push_head(&ioq->idle, &tio->node); + } +} + +bool +io_queue_is_empty(IOQueue *ioq) +{ + return dclist_is_empty(&ioq->in_progress); +} + +void +io_queue_free(IOQueue *ioq) +{ + io_queue_wait_all(ioq); + + pfree(ioq); +} diff --git a/src/backend/storage/aio/meson.build b/src/backend/storage/aio/meson.build index 2f0f03d80712..270c4a64428b 100644 --- a/src/backend/storage/aio/meson.build +++ b/src/backend/storage/aio/meson.build @@ -3,9 +3,11 @@ backend_sources += files( 'aio.c', 'aio_callback.c', + 'aio_funcs.c', 'aio_init.c', 'aio_io.c', 'aio_target.c', + 'io_queue.c', 'method_io_uring.c', 'method_sync.c', 'method_worker.c', diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 0bcdab14ae7e..c719ba2727a8 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -302,14 +302,41 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) return num_staged_ios; } +static void +pgaio_uring_completion_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + /* No need for context if a backend is completing the IO for itself */ + if (ioh->owner_procno == MyProcNumber) + return; + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("completing I/O on behalf of process %d", owner_pid); +} + static void pgaio_uring_drain_locked(PgAioUringContext *context) { int ready; int orig_ready; + ErrorContextCallback errcallback = {0}; Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE)); + errcallback.callback = pgaio_uring_completion_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* * Don't drain more events than available right now. 
Otherwise it's * plausible that one backend could get stuck, for a while, receiving CQEs @@ -337,9 +364,11 @@ pgaio_uring_drain_locked(PgAioUringContext *context) PgAioHandle *ioh; ioh = io_uring_cqe_get_data(cqe); + errcallback.arg = ioh; io_uring_cqe_seen(&context->io_uring_ring, cqe); pgaio_io_process_completion(ioh, cqe->res); + errcallback.arg = NULL; } END_CRIT_SECTION(); @@ -348,6 +377,8 @@ pgaio_uring_drain_locked(PgAioUringContext *context) "drained %d/%d, now expecting %d", ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring)); } + + error_context_stack = errcallback.previous; } static void diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 4a7853d13fac..31d94ac82c54 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -357,11 +357,33 @@ pgaio_worker_register(void) on_shmem_exit(pgaio_worker_die, 0); } +static void +pgaio_worker_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + Assert(ioh->owner_procno != MyProcNumber); + Assert(MyBackendType == B_IO_WORKER); + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("I/O worker executing I/O on behalf of process %d", owner_pid); +} + void IoWorkerMain(const void *startup_data, size_t startup_data_len) { sigjmp_buf local_sigjmp_buf; PgAioHandle *volatile error_ioh = NULL; + ErrorContextCallback errcallback = {0}; volatile int error_errno = 0; char cmd[128]; @@ -388,6 +410,10 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) sprintf(cmd, "%d", MyIoWorkerId); set_ps_display(cmd); + errcallback.callback = pgaio_worker_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* see PostgresMain() */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { @@ -471,6 +497,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) ioh = &pgaio_ctl->io_handles[io_index]; error_ioh = ioh; + errcallback.arg = ioh; pgaio_debug_io(DEBUG4, ioh, "worker %d processing IO", @@ -511,6 +538,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) pgaio_io_perform_synchronously(ioh); RESUME_INTERRUPTS(); + errcallback.arg = NULL; } else { @@ -522,6 +550,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) CHECK_FOR_INTERRUPTS(); } + error_context_stack = errcallback.previous; proc_exit(0); } diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 36c54fb695b0..cec93129f582 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -404,6 +404,30 @@ read_stream_start_pending_read(ReadStream *stream) static void read_stream_look_ahead(ReadStream *stream) { + /* + * Batch-submitting multiple IOs is more efficient than doing so + * one-by-one. If we just ramp up to the max, we'll only be allowed to + * submit one io_combine_limit sized IO. Defer submitting IO in that case. + * + * FIXME: This needs better heuristics. 
+ */ +#if 1 + if (!stream->sync_mode && stream->distance > (io_combine_limit * 8)) + { + if (stream->pinned_buffers + stream->pending_read_nblocks > ((stream->distance * 3) / 4)) + { +#if 0 + ereport(LOG, + errmsg("reduce reduce reduce: pinned: %d, pending: %d, distance: %d", + stream->pinned_buffers, + stream->pending_read_nblocks, + stream->distance)); +#endif + return; + } + } +#endif + /* * Allow amortizing the cost of submitting IO over multiple IOs. This * requires that we don't do any operations that could lead to a deadlock diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f9681d09e1e7..d9a362d75539 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -52,6 +52,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" @@ -76,6 +77,7 @@ /* Bits in SyncOneBuffer's return value */ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 +#define BUF_CANT_MERGE 0x04 #define RELS_BSEARCH_THRESHOLD 20 @@ -515,13 +517,7 @@ static void UnpinBuffer(BufferDesc *buf); static void UnpinBufferNoOwner(BufferDesc *buf); static void BufferSync(int flags); static uint32 WaitBufHdrUnlocked(BufferDesc *buf); -static int SyncOneBuffer(int buf_id, bool skip_recently_used, - WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); -static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); -static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, - uint32 set_flag_bits, bool forget_owner, - bool release_aio); static void AbortBufferIO(Buffer buffer); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); @@ -536,6 +532,7 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context); + static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, @@ -3325,6 +3322,57 @@ UnpinBufferNoOwner(BufferDesc *buf) } } +typedef struct BuffersToWrite +{ + int nbuffers; + BufferTag start_at_tag; + uint32 max_combine; + + XLogRecPtr max_lsn; + + PgAioHandle *ioh; + PgAioWaitRef iow; + + uint64 total_writes; + + Buffer buffers[IOV_MAX]; + PgAioBounceBuffer *bounce_buffers[IOV_MAX]; + const void *data_ptrs[IOV_MAX]; +} BuffersToWrite; + +static int PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf, + bool skip_recently_used, + IOQueue *ioq, WritebackContext *wb_context); + +static void WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context); + +static void +BuffersToWriteInit(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + to_write->total_writes = 0; + to_write->nbuffers = 0; + to_write->ioh = NULL; + pgaio_wref_clear(&to_write->iow); + to_write->max_lsn = InvalidXLogRecPtr; + + pgaio_enter_batchmode(); +} + +static void +BuffersToWriteEnd(BuffersToWrite *to_write) +{ + if (to_write->ioh != NULL) + { + pgaio_io_release(to_write->ioh); + to_write->ioh = NULL; + } + + pgaio_exit_batchmode(); +} + + #define ST_SORT sort_checkpoint_bufferids #define ST_ELEMENT_TYPE CkptSortItem #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) @@ -3356,7 +3404,10 @@ BufferSync(int flags) binaryheap *ts_heap; int 
i;
	int			mask = BM_DIRTY;
+	IOQueue    *ioq;
 	WritebackContext wb_context;
+	BuffersToWrite to_write;
+	int			max_combine;
 
 	/*
 	 * Unless this is a shutdown checkpoint or we have been explicitly told,
@@ -3418,7 +3469,9 @@ BufferSync(int flags)
 	if (num_to_scan == 0)
 		return;					/* nothing to do */
 
+	ioq = io_queue_create(512, 0);
 	WritebackContextInit(&wb_context, &checkpoint_flush_after);
+	max_combine = Min(io_bounce_buffers, io_combine_limit);
 
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
 
@@ -3526,48 +3579,91 @@
 	 */
 	num_processed = 0;
 	num_written = 0;
+
+	BuffersToWriteInit(&to_write, ioq, &wb_context);
+
 	while (!binaryheap_empty(ts_heap))
 	{
 		BufferDesc *bufHdr = NULL;
 		CkptTsStatus *ts_stat = (CkptTsStatus *)
 			DatumGetPointer(binaryheap_first(ts_heap));
+		bool		batch_continue = true;
 
-		buf_id = CkptBufferIds[ts_stat->index].buf_id;
-		Assert(buf_id != -1);
-
-		bufHdr = GetBufferDescriptor(buf_id);
-
-		num_processed++;
+		Assert(ts_stat->num_scanned <= ts_stat->num_to_scan);
 
 		/*
-		 * We don't need to acquire the lock here, because we're only looking
-		 * at a single bit. It's possible that someone else writes the buffer
-		 * and clears the flag right after we check, but that doesn't matter
-		 * since SyncOneBuffer will then do nothing. However, there is a
-		 * further race condition: it's conceivable that between the time we
-		 * examine the bit here and the time SyncOneBuffer acquires the lock,
-		 * someone else not only wrote the buffer but replaced it with another
-		 * page and dirtied it. In that improbable case, SyncOneBuffer will
-		 * write the buffer though we didn't need to. It doesn't seem worth
-		 * guarding against this, though.
+		 * Collect a batch of buffers to write out from the current
+		 * tablespace. That causes some imbalance between the tablespaces, but
+		 * that's more than outweighed by the efficiency gain due to batching.
 		 */
-		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+		while (batch_continue &&
+			   to_write.nbuffers < max_combine &&
+			   ts_stat->num_scanned < ts_stat->num_to_scan)
 		{
-			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+			buf_id = CkptBufferIds[ts_stat->index].buf_id;
+			Assert(buf_id != -1);
+
+			bufHdr = GetBufferDescriptor(buf_id);
+
+			num_processed++;
+
+			/*
+			 * We don't need to acquire the lock here, because we're only
+			 * looking at a single bit. It's possible that someone else writes
+			 * the buffer and clears the flag right after we check, but that
+			 * doesn't matter since PrepareToWriteBuffer will then do nothing.
+			 * However, there is a further race condition: it's conceivable
+			 * that between the time we examine the bit here and the time
+			 * PrepareToWriteBuffer acquires the lock, someone else not only
+			 * wrote the buffer but replaced it with another page and dirtied
+			 * it. In that improbable case, PrepareToWriteBuffer will write
+			 * the buffer though we didn't need to. It doesn't seem worth
+			 * guarding against this, though.
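+			 *
+			 * (With batched writes there is a second line of defense:
+			 * PrepareToWriteBuffer re-checks, after pinning the buffer,
+			 * whether the write can still be merged; see CanMergeWrite().)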
+			 */
+			if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
 			{
-				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
-				PendingCheckpointerStats.buffers_written++;
-				num_written++;
+				int			result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+														  ioq, &wb_context);
+
+				if (result & BUF_CANT_MERGE)
+				{
+					Assert(to_write.nbuffers > 0);
+					WriteBuffers(&to_write, ioq, &wb_context);
+
+					result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+												  ioq, &wb_context);
+					Assert(result != BUF_CANT_MERGE);
+				}
+
+				if (result & BUF_WRITTEN)
+				{
+					TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
+					PendingCheckpointerStats.buffers_written++;
+					num_written++;
+				}
+				else
+				{
+					batch_continue = false;
+				}
 			}
+			else
+			{
+				if (to_write.nbuffers > 0)
+					WriteBuffers(&to_write, ioq, &wb_context);
+			}
+
+			/*
+			 * Measure progress independent of actually having to flush the
+			 * buffer - otherwise writing becomes unbalanced.
+			 */
+			ts_stat->progress += ts_stat->progress_slice;
+			ts_stat->num_scanned++;
+			ts_stat->index++;
 		}
 
-		/*
-		 * Measure progress independent of actually having to flush the buffer
-		 * - otherwise writing become unbalanced.
-		 */
-		ts_stat->progress += ts_stat->progress_slice;
-		ts_stat->num_scanned++;
-		ts_stat->index++;
+		if (to_write.nbuffers > 0)
+			WriteBuffers(&to_write, ioq, &wb_context);
+
 		/* Have all the buffers from the tablespace been processed? */
 		if (ts_stat->num_scanned == ts_stat->num_to_scan)
@@ -3585,15 +3681,23 @@
 		 *
 		 * (This will check for barrier events even if it doesn't sleep.)
 		 */
-		CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
+		CheckpointWriteDelay(ioq, flags, (double) num_processed / num_to_scan);
 	}
 
+	Assert(to_write.nbuffers == 0);
+	io_queue_wait_all(ioq);
+
 	/*
 	 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
 	 * IOContext will always be IOCONTEXT_NORMAL.
 	 */
 	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
 
+	io_queue_wait_all(ioq);		/* IssuePendingWritebacks might have added
+								 * more */
+	io_queue_free(ioq);
+
+	BuffersToWriteEnd(&to_write);
+
 	pfree(per_ts_stat);
 	per_ts_stat = NULL;
 	binaryheap_free(ts_heap);
@@ -3619,7 +3723,7 @@
  * bgwriter_lru_maxpages to 0.)
  */
 bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSync(IOQueue *ioq, WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
 	int			strategy_buf_id;
@@ -3662,6 +3766,9 @@ BgBufferSync(WritebackContext *wb_context)
 	long		new_strategy_delta;
 	uint32		new_recent_alloc;
 
+	BuffersToWrite to_write;
+	int			max_combine;
+
 	/*
 	 * Find out where the freelist clock sweep currently is, and how many
 	 * buffer allocations have happened since our last call.
@@ -3682,6 +3789,8 @@ BgBufferSync(WritebackContext *wb_context)
 		return true;
 	}
 
+	max_combine = Min(io_bounce_buffers, io_combine_limit);
+
 	/*
 	 * Compute strategy_delta = how many buffers have been scanned by the
	 * clock sweep since last time. If first time through, assume none. Then
@@ -3838,11 +3947,25 @@
 	num_written = 0;
 	reusable_buffers = reusable_buffers_est;
 
+	BuffersToWriteInit(&to_write, ioq, wb_context);
+
 	/* Execute the LRU scan */
 	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
 	{
-		int			sync_state = SyncOneBuffer(next_to_clean, true,
-											   wb_context);
+		int			sync_state;
+
+		sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1,
+										  true, ioq, wb_context);
+		if (sync_state & BUF_CANT_MERGE)
+		{
+			Assert(to_write.nbuffers > 0);
+
+			WriteBuffers(&to_write, ioq, wb_context);
+
+			sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1,
+											  true, ioq, wb_context);
+			Assert(sync_state != BUF_CANT_MERGE);
+		}
 
 		if (++next_to_clean >= NBuffers)
 		{
@@ -3853,6 +3976,13 @@
 
 		if (sync_state & BUF_WRITTEN)
 		{
+			Assert(sync_state & BUF_REUSABLE);
+
+			if (to_write.nbuffers == max_combine)
+			{
+				WriteBuffers(&to_write, ioq, wb_context);
+			}
+
 			reusable_buffers++;
 			if (++num_written >= bgwriter_lru_maxpages)
 			{
@@ -3864,6 +3994,11 @@
 			reusable_buffers++;
 	}
 
+	if (to_write.nbuffers > 0)
+		WriteBuffers(&to_write, ioq, wb_context);
+
+	BuffersToWriteEnd(&to_write);
+
 	PendingBgWriterStats.buf_written_clean += num_written;
 
 #ifdef BGW_DEBUG
@@ -3902,8 +4037,66 @@
 	return (bufs_to_lap == 0 && recent_alloc == 0);
 }
 
+static inline bool
+BufferTagsSameRel(const BufferTag *tag1, const BufferTag *tag2)
+{
+	return (tag1->spcOid == tag2->spcOid) &&
+		(tag1->dbOid == tag2->dbOid) &&
+		(tag1->relNumber == tag2->relNumber) &&
+		(tag1->forkNum == tag2->forkNum)
+		;
+}
+
+static bool
+CanMergeWrite(BuffersToWrite *to_write, BufferDesc *cur_buf_hdr)
+{
+	BlockNumber cur_block = cur_buf_hdr->tag.blockNum;
+
+	Assert(to_write->nbuffers > 0); /* can't merge with nothing */
+	Assert(to_write->start_at_tag.relNumber != InvalidOid);
+	Assert(to_write->start_at_tag.blockNum != InvalidBlockNumber);
+
+	Assert(to_write->ioh != NULL);
+
+	/*
+	 * First check if the block number is one that we could actually merge;
+	 * that's cheaper than checking the tablespace/db/relnumber/fork match.
+	 */
+	if (to_write->start_at_tag.blockNum + to_write->nbuffers != cur_block)
+		return false;
+
+	if (!BufferTagsSameRel(&to_write->start_at_tag, &cur_buf_hdr->tag))
+		return false;
+
+	/*
+	 * Need to check with smgr how large a write we're allowed to make. To
+	 * reduce the overhead of the smgr check, only inquire once, when
+	 * processing the first to-be-merged buffer. That avoids the overhead in
+	 * the common case of writing out buffers that are definitely not
+	 * mergeable.
+	 */
+	if (to_write->nbuffers == 1)
+	{
+		SMgrRelation smgr;
+
+		smgr = smgropen(BufTagGetRelFileLocator(&to_write->start_at_tag), INVALID_PROC_NUMBER);
+
+		to_write->max_combine = smgrmaxcombine(smgr,
+											   to_write->start_at_tag.forkNum,
+											   to_write->start_at_tag.blockNum);
+	}
+	else
+	{
+		Assert(to_write->max_combine > 0);
+	}
+
+	if (to_write->start_at_tag.blockNum + to_write->max_combine <= cur_block)
+		return false;
+
+	return true;
+}
+
 /*
- * SyncOneBuffer -- process a single buffer during syncing.
+ * PrepareToWriteBuffer -- process a single buffer during syncing.
  *
  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
  * buffers marked recently used, as these are not replacement candidates.
  *
@@ -3912,22 +4105,50 @@ BgBufferSync(WritebackContext *wb_context)
 *	BUF_WRITTEN: we wrote the buffer.
* BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. + * BUF_CANT_MERGE: can't combine this write with prior writes, caller needs + * to issue those first * * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean * after locking it, but we don't care all that much.) */ static int -SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) +PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf, + bool skip_recently_used, + IOQueue *ioq, WritebackContext *wb_context) { - BufferDesc *bufHdr = GetBufferDescriptor(buf_id); - int result = 0; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(buf - 1); uint32 buf_state; - BufferTag tag; + int result = 0; + XLogRecPtr cur_buf_lsn; + LWLock *content_lock; + bool may_block; + + /* + * Check if this buffer can be written out together with already prepared + * writes. We check before we have pinned the buffer, so the buffer can be + * written out and replaced between this check and us pinning the buffer - + * we'll recheck below. The reason for the pre-check is that we don't want + * to pin the buffer just to find out that we can't merge the IO. + */ + if (to_write->nbuffers != 0) + { + if (!CanMergeWrite(to_write, cur_buf_hdr)) + { + result |= BUF_CANT_MERGE; + return result; + } + } + else + { + to_write->start_at_tag = cur_buf_hdr->tag; + } /* Make sure we can handle the pin */ ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); + /* XXX: Should also check if we are allowed to pin one more buffer */ + /* * Check whether buffer needs writing. * @@ -3937,7 +4158,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) * don't worry because our checkpoint.redo points before log record for * upcoming changes and so we are not required to write such dirty buffer. */ - buf_state = LockBufHdr(bufHdr); + buf_state = LockBufHdr(cur_buf_hdr); if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 && BUF_STATE_GET_USAGECOUNT(buf_state) == 0) @@ -3946,40 +4167,300 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) } else if (skip_recently_used) { +#if 0 + elog(LOG, "at block %d: skip recent with nbuffers %d", + cur_buf_hdr->tag.blockNum, to_write->nbuffers); +#endif /* Caller told us not to write recently-used buffers */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(cur_buf_hdr, buf_state); return result; } if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY)) { /* It's clean, so nothing to do */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(cur_buf_hdr, buf_state); return result; } + /* pin the buffer, from now on its identity can't change anymore */ + PinBuffer_Locked(cur_buf_hdr); + /* - * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the - * buffer is clean by the time we've locked it.) + * Acquire IO, if needed, now that it's likely that we'll need to write. */ - PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + if (to_write->ioh == NULL) + { + /* otherwise we should already have acquired a handle */ + Assert(to_write->nbuffers == 0); - FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + to_write->ioh = io_queue_acquire_io(ioq); + pgaio_io_get_wref(to_write->ioh, &to_write->iow); + } - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + /* + * If we are merging, check if the buffer's identity possibly changed + * while we hadn't yet pinned it. 
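+	 *
+	 * (Once we hold the pin the buffer's tag can no longer change, so if
+	 * the re-check succeeds it stays valid until we unpin.)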
+	 *
+	 * XXX: It might be worth checking if we still want to write the buffer
+	 * out, e.g. it could have been replaced with a buffer that doesn't have
+	 * BM_CHECKPOINT_NEEDED set.
+	 */
+	if (to_write->nbuffers != 0)
+	{
+		if (!CanMergeWrite(to_write, cur_buf_hdr))
+		{
+			elog(LOG, "changed identity");
+			UnpinBuffer(cur_buf_hdr);
-	tag = bufHdr->tag;
+			result |= BUF_CANT_MERGE;
-	UnpinBuffer(bufHdr);
+			return result;
+		}
+	}
+
+	may_block = to_write->nbuffers == 0
+		&& !pgaio_have_staged()
+		&& io_queue_is_empty(ioq)
+		;
+	content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+	if (!may_block)
+	{
+		if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+		{
+			/* done */
+		}
+		else if (to_write->nbuffers == 0)
+		{
+			/*
+			 * Need to wait for all prior IO to finish before blocking for
+			 * lock acquisition, to avoid the risk of a deadlock due to us
+			 * waiting for another backend that is waiting for our
+			 * unsubmitted IO to complete.
+			 */
+			pgaio_submit_staged();
+			io_queue_wait_all(ioq);
+
+			elog(DEBUG2, "at block %u: can't block, nbuffers = 0",
+				 cur_buf_hdr->tag.blockNum
+				);
+
+			may_block = to_write->nbuffers == 0
+				&& !pgaio_have_staged()
+				&& io_queue_is_empty(ioq)
+				;
+			Assert(may_block);
+
+			LWLockAcquire(content_lock, LW_SHARED);
+		}
+		else
+		{
+			elog(DEBUG2, "at block %d: can't block, nbuffers = %d",
+				 cur_buf_hdr->tag.blockNum,
+				 to_write->nbuffers);
+
+			UnpinBuffer(cur_buf_hdr);
+			result |= BUF_CANT_MERGE;
+			Assert(to_write->nbuffers > 0);
+
+			return result;
+		}
+	}
+	else
+	{
+		LWLockAcquire(content_lock, LW_SHARED);
+	}
+
+	if (!may_block)
+	{
+		if (!StartBufferIO(cur_buf_hdr, false, !may_block))
+		{
+			pgaio_submit_staged();
+			io_queue_wait_all(ioq);
+
+			may_block = io_queue_is_empty(ioq) && to_write->nbuffers == 0 && !pgaio_have_staged();
+
+			if (!StartBufferIO(cur_buf_hdr, false, !may_block))
+			{
+				elog(DEBUG2, "at block %d: non-waitable StartBufferIO returns false, %d",
+					 cur_buf_hdr->tag.blockNum,
+					 may_block);
+
+				/*
+				 * FIXME: can't tell whether this is because the buffer has
+				 * been cleaned
+				 */
+				if (!may_block)
+				{
+					result |= BUF_CANT_MERGE;
+					Assert(to_write->nbuffers > 0);
+				}
+				LWLockRelease(content_lock);
+				UnpinBuffer(cur_buf_hdr);
+
+				return result;
+			}
+		}
+	}
+	else
+	{
+		if (!StartBufferIO(cur_buf_hdr, false, false))
+		{
+			elog(DEBUG2, "waitable StartBufferIO returns false");
+			LWLockRelease(content_lock);
+			UnpinBuffer(cur_buf_hdr);
+
+			/*
+			 * FIXME: Historically we returned BUF_WRITTEN in this case,
+			 * which seems wrong
+			 */
+			return result;
+		}
+	}
 
 	/*
-	 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
-	 * IOContext will always be IOCONTEXT_NORMAL.
+	 * Run PageGetLSN while holding header lock, since we don't have the
+	 * buffer locked exclusively in all cases.
 	 */
-	ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
+	buf_state = LockBufHdr(cur_buf_hdr);
 
-	return result | BUF_WRITTEN;
+	cur_buf_lsn = BufferGetLSN(cur_buf_hdr);
+
+	/* To check if block content changes while flushing.
- vadim 01/17/97 */ + buf_state &= ~BM_JUST_DIRTIED; + + UnlockBufHdr(cur_buf_hdr, buf_state); + + to_write->buffers[to_write->nbuffers] = buf; + to_write->nbuffers++; + + if (buf_state & BM_PERMANENT && + (to_write->max_lsn == InvalidXLogRecPtr || to_write->max_lsn < cur_buf_lsn)) + { + to_write->max_lsn = cur_buf_lsn; + } + + result |= BUF_WRITTEN; + + return result; +} + +static void +WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + SMgrRelation smgr; + Buffer first_buf; + BufferDesc *first_buf_hdr; + bool needs_checksum; + + Assert(to_write->nbuffers > 0 && to_write->nbuffers <= io_combine_limit); + + first_buf = to_write->buffers[0]; + first_buf_hdr = GetBufferDescriptor(first_buf - 1); + + smgr = smgropen(BufTagGetRelFileLocator(&first_buf_hdr->tag), INVALID_PROC_NUMBER); + + /* + * Force XLOG flush up to buffer's LSN. This implements the basic WAL + * rule that log updates must hit disk before any of the data-file changes + * they describe do. + * + * However, this rule does not apply to unlogged relations, which will be + * lost after a crash anyway. Most unlogged relation pages do not bear + * LSNs since we never emit WAL records for them, and therefore flushing + * up through the buffer LSN would be useless, but harmless. However, + * GiST indexes use LSNs internally to track page-splits, and therefore + * unlogged GiST pages bear "fake" LSNs generated by + * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake + * LSN counter could advance past the WAL insertion point; and if it did + * happen, attempting to flush WAL through that location would fail, with + * disastrous system-wide consequences. To make sure that can't happen, + * skip the flush if the buffer isn't permanent. + */ + if (to_write->max_lsn != InvalidXLogRecPtr) + XLogFlush(to_write->max_lsn); + + /* + * Now it's safe to write the buffer to disk. Note that no one else should + * have been able to write it, while we were busy with log flushing, + * because we got the exclusive right to perform I/O by setting the + * BM_IO_IN_PROGRESS bit. + */ + + for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++) + { + Buffer cur_buf = to_write->buffers[nbuf]; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1); + Block bufBlock; + char *bufToWrite; + + bufBlock = BufHdrGetBlock(cur_buf_hdr); + needs_checksum = PageNeedsChecksumCopy((Page) bufBlock); + + /* + * Update page checksum if desired. Since we have only shared lock on + * the buffer, other processes might be updating hint bits in it, so + * we must copy the page to a bounce buffer if we do checksumming. 
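+		 * Computing the checksum directly on the shared page while hint
+		 * bits may be changing underneath us could leave an on-disk
+		 * checksum that does not match the written bytes; cf.
+		 * PageSetChecksumCopy().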
+		 */
+		if (needs_checksum)
+		{
+			PgAioBounceBuffer *bb = pgaio_bounce_buffer_get();
+
+			pgaio_io_assoc_bounce_buffer(to_write->ioh, bb);
+
+			bufToWrite = pgaio_bounce_buffer_buffer(bb);
+			memcpy(bufToWrite, bufBlock, BLCKSZ);
+			PageSetChecksumInplace((Page) bufToWrite, cur_buf_hdr->tag.blockNum);
+		}
+		else
+		{
+			bufToWrite = bufBlock;
+		}
+
+		to_write->data_ptrs[nbuf] = bufToWrite;
+	}
+
+	pgaio_io_set_handle_data_32(to_write->ioh,
+								(uint32 *) to_write->buffers,
+								to_write->nbuffers);
+	pgaio_io_register_callbacks(to_write->ioh, PGAIO_HCB_SHARED_BUFFER_WRITEV, 0);
+
+	smgrstartwritev(to_write->ioh, smgr,
+					BufTagGetForkNum(&first_buf_hdr->tag),
+					first_buf_hdr->tag.blockNum,
+					to_write->data_ptrs,
+					to_write->nbuffers,
+					false);
+	pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+					   IOOP_WRITE, 1, BLCKSZ * to_write->nbuffers);
+
+
+	for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++)
+	{
+		Buffer		cur_buf = to_write->buffers[nbuf];
+		BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1);
+
+		UnpinBuffer(cur_buf_hdr);
+	}
+
+	io_queue_track(ioq, &to_write->iow);
+	to_write->total_writes++;
+
+	/* clear state for next write */
+	to_write->nbuffers = 0;
+	to_write->start_at_tag.relNumber = InvalidOid;
+	to_write->start_at_tag.blockNum = InvalidBlockNumber;
+	to_write->max_combine = 0;
+	to_write->max_lsn = InvalidXLogRecPtr;
+	to_write->ioh = NULL;
+	pgaio_wref_clear(&to_write->iow);
+
+	/*
+	 * FIXME: Implement issuing writebacks (note wb_context isn't used here).
+	 * Possibly needs to be integrated with io_queue.c.
+	 */
+}
 
 /*
@@ -4353,6 +4834,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	error_context_stack = errcallback.previous;
 }
 
+
 /*
  * RelationGetNumberOfBlocksInFork
  *	Determines the current number of pages in the specified relation fork.
@@ -5540,7 +6022,15 @@ LockBuffer(Buffer buffer, int mode)
 	else if (mode == BUFFER_LOCK_SHARE)
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
+	{
+		/*
+		 * FIXME: Wait for AIO writes, otherwise there would be a risk of
+		 * deadlock. This isn't entirely trivial to do in a race-free way; IO
+		 * could be started between us checking whether there is IO and
+		 * enqueueing ourselves for the lock.
+		 */
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+	}
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -5555,6 +6045,19 @@ ConditionalLockBuffer(Buffer buffer)
 {
 	BufferDesc *buf;
 
+	/*
+	 * FIXME: Wait for AIO writes. Some code does not deal well with
+	 * ConditionalLockBuffer() continuously failing, e.g.
+	 * spginsert()->spgdoinsert() ends up busy-looping (easily reproducible by
+	 * just making this function always fail and running the regression
+	 * tests). While that code could be fixed, it'd be hard to find all
+	 * problematic places.
+	 *
+	 * It would be OK to wait for the IO as waiting for IO completion does not
+	 * need to wait for any locks that could lead to an undetected deadlock or
+	 * such.
+	 */
+
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
 		return true;			/* act as though we got it */
@@ -5618,10 +6121,8 @@ LockBufferForCleanup(Buffer buffer)
 	CheckBufferIsPinnedOnce(buffer);
 
 	/*
-	 * We do not yet need to be worried about in-progress AIOs holding a pin,
-	 * as we, so far, only support doing reads via AIO and this function can
-	 * only be called once the buffer is valid (i.e. no read can be in
-	 * flight).
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
 	 */
 
 	/* Nobody else to wait for */
@@ -5634,6 +6135,11 @@
 	{
 		uint32		buf_state;
 
+		/*
+		 * FIXME: LockBuffer()'s handling of in-progress writes (once
+		 * implemented) should suffice to deal with deadlock risk.
+		 */
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		buf_state = LockBufHdr(bufHdr);
@@ -5781,7 +6287,13 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: Should wait for IO for the same reason as in
+	 * ConditionalLockBuffer(). Needs to happen before the
+	 * ConditionalLockBuffer() call below, as we'd never reach the
+	 * ConditionalLockBuffer() call due to the buffer pin held for the
+	 * duration of the IO.
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -5838,7 +6350,10 @@ IsBufferCleanupOK(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -5962,7 +6477,7 @@ WaitIO(BufferDesc *buf)
 * find out if they can perform the I/O as part of a larger operation, without
 * waiting for the answer or distinguishing the reasons why not.
 */
-static bool
+bool
 StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 {
 	uint32		buf_state;
@@ -6019,7 +6534,7 @@
 * resource owner. (forget_owner=false is used when the resource owner itself
 * is being released)
 */
-static void
+void
 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
 				  bool forget_owner, bool release_aio)
 {
@@ -7144,12 +7659,129 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
 			affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
 }
 
+/*
+ * Helper for AIO writev completion callbacks, supporting both shared and temp
+ * buffers. Gets called once for each buffer in a multi-page write.
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete_one(uint8 buf_off, Buffer buffer, uint8 flags,
+						   bool failed, bool is_temp)
+{
+	BufferDesc *buf_hdr = is_temp ?
+		GetLocalBufferDescriptor(-buffer - 1)
+		: GetBufferDescriptor(buffer - 1);
+	PgAioResult result = {.status = PGAIO_RS_OK};
+	bool		clear_dirty;
+	uint32		set_flag_bits;
+
+#ifdef USE_ASSERT_CHECKING
+	{
+		uint32		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+		Assert(buf_state & BM_VALID);
+		Assert(buf_state & BM_TAG_VALID);
+		/* temp buffers don't use BM_IO_IN_PROGRESS */
+		if (!is_temp)
+			Assert(buf_state & BM_IO_IN_PROGRESS);
+		Assert(buf_state & BM_DIRTY);
+	}
+#endif
+
+	clear_dirty = failed ? false : true;
+	set_flag_bits = failed ? BM_IO_ERROR : 0;
+
+	if (is_temp)
+		TerminateLocalBufferIO(buf_hdr, clear_dirty, set_flag_bits, true);
+	else
+		TerminateBufferIO(buf_hdr, clear_dirty, set_flag_bits, false, true);
+
+	/*
+	 * The initiator of the IO is not managing the lock (it called
+	 * LWLockDisown()); we are.
+	 */
+	if (!is_temp)
+		LWLockReleaseDisowned(BufferDescriptorGetContentLock(buf_hdr),
+							  LW_SHARED);
+
+	/* FIXME: tracepoint */
+
+	return result;
+}
+
+/*
+ * Perform completion handling of a single AIO write. This write may cover
+ * multiple blocks / buffers.
+ *
+ * Shared between shared and local buffers, to reduce code duplication.
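+ *
+ * As with the readv counterpart, the constant is_temp argument is expected
+ * to be constant-folded away - hence pg_attribute_always_inline.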
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result,
+					   uint8 cb_data, bool is_temp)
+{
+	PgAioResult result = prior_result;
+	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
+	uint64	   *io_data;
+	uint8		handle_data_len;
+
+	if (is_temp)
+	{
+		Assert(td->smgr.is_temp);
+		Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
+	}
+	else
+		Assert(!td->smgr.is_temp);
+
+	/*
+	 * Iterate over all the buffers affected by this IO and call the
+	 * appropriate per-buffer completion function for each buffer.
+	 */
+	io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
+	for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
+	{
+		Buffer		buf = io_data[buf_off];
+		PgAioResult buf_result;
+		bool		failed;
+
+		Assert(BufferIsValid(buf));
+
+		/*
+		 * If the entire IO failed at a lower level, each buffer needs to be
+		 * marked as failed. In case of a partial write, some buffers may be
+		 * ok.
+		 */
+		failed =
+			prior_result.status == PGAIO_RS_ERROR
+			|| prior_result.result <= buf_off;
+
+		buf_result = buffer_writev_complete_one(buf_off, buf, cb_data, failed,
+												is_temp);
+
+		/*
+		 * If there wasn't any prior error and the IO for this page failed in
+		 * some form, set the whole IO's result to the page's result.
+		 */
+		if (result.status != PGAIO_RS_ERROR && buf_result.status != PGAIO_RS_OK)
+		{
+			result = buf_result;
+			pgaio_result_report(result, td, LOG);
+		}
+	}
+
+	return result;
+}
+
 static void
 shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 {
 	buffer_stage_common(ioh, false, false);
 }
 
+static void
+shared_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data)
+{
+	buffer_stage_common(ioh, true, false);
+}
+
 static PgAioResult
 shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 							 uint8 cb_data)
@@ -7195,6 +7827,13 @@ shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
 	return prior_result;
 }
 
+static PgAioResult
+shared_buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result,
+							  uint8 cb_data)
+{
+	return buffer_writev_complete(ioh, prior_result, cb_data, false);
+}
+
 static void
 local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 {
@@ -7208,6 +7847,17 @@ local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 	return buffer_readv_complete(ioh, prior_result, cb_data, true);
 }
 
+static void
+local_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data)
+{
+	/*
+	 * Currently this is unreachable as the only write support is for
+	 * checkpointer / bgwriter, which don't deal with local buffers.
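+	 * If local-buffer writes are ever issued asynchronously, this will
+	 * presumably need to mirror shared_buffer_writev_stage() and call
+	 * buffer_stage_common(ioh, true, true).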
+ */ + elog(ERROR, "should be unreachable"); +} + + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .stage = shared_buffer_readv_stage, @@ -7217,6 +7867,11 @@ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .report = buffer_readv_report, }; +const PgAioHandleCallbacks aio_shared_buffer_writev_cb = { + .stage = shared_buffer_writev_stage, + .complete_shared = shared_buffer_writev_complete, +}; + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .stage = local_buffer_readv_stage, @@ -7230,3 +7885,7 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +const PgAioHandleCallbacks aio_local_buffer_writev_cb = { + .stage = local_buffer_writev_stage, +}; diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 336715b6c634..b72a5957a206 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -555,7 +555,12 @@ GetAccessStrategy(BufferAccessStrategyType btype) return NULL; case BAS_BULKREAD: - ring_size_kb = 256; + + /* + * FIXME: Temporary increase to allow large enough streaming reads + * to actually benefit from AIO. This needs a better solution. + */ + ring_size_kb = 2 * 1024; break; case BAS_BULKWRITE: ring_size_kb = 16 * 1024; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index bf89076bb109..ed56202af14f 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -57,7 +57,6 @@ static int NLocalPinnedBuffers = 0; static void InitLocalBuffers(void); static Block GetLocalBufferStorage(void); static Buffer GetLocalVictimBuffer(void); -static void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); /* @@ -597,7 +596,7 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit * * See also InvalidateBuffer(). 
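+ *
+ * No longer static, presumably so that the new AIO write paths and the
+ * test_aio module can invalidate a local buffer directly.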
 */
-static void
+void
 InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
 {
 	Buffer		buffer = BufferDescriptorGetBuffer(bufHdr);
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e8299dd5564..2138d47dab93 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2348,6 +2348,34 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
 	return returnCode;
 }
 
+int
+FileStartWriteV(PgAioHandle *ioh, File file,
+				int iovcnt, off_t offset,
+				uint32 wait_event_info)
+{
+	int			returnCode;
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileStartWriteV: %d (%s) " INT64_FORMAT " %d",
+			   file, VfdCache[file].fileName,
+			   (int64) offset,
+			   iovcnt));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	vfdP = &VfdCache[file];
+
+	/* FIXME: think about / reimplement temp_file_limit */
+
+	pgaio_io_start_writev(ioh, vfdP->fd, iovcnt, offset);
+
+	return 0;
+}
+
 int
 FileSync(File file, uint32 wait_event_info)
 {
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 82457bacc62f..d4bbd2cdf00e 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1490,6 +1490,16 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 	return true;
 }
 
+bool
+PageNeedsChecksumCopy(Page page)
+{
+	if (PageIsNew(page))
+		return false;
+
+	/* otherwise a copy is needed only if checksums are enabled */
+	return DataChecksumsEnabled();
+}
+
 /*
  * Set checksum for a page in shared buffers.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index b55892b94051..bacfe5f121ba 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -155,12 +155,19 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
 static void md_readv_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
+static PgAioResult md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
+static void md_writev_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
 
 const PgAioHandleCallbacks aio_md_readv_cb = {
 	.complete_shared = md_readv_complete,
 	.report = md_readv_report,
 };
 
+const PgAioHandleCallbacks aio_md_writev_cb = {
+	.complete_shared = md_writev_complete,
+	.report = md_writev_report,
+};
+
 
 static inline int
 _mdfd_open_flags(void)
@@ -910,9 +917,30 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 * is ON or we are InRecovery, we should instead return zeroes
 				 * without complaining. This allows, for example, the case of
 				 * trying to update a block that was later truncated away.
+				 *
+				 * NB: We think that this codepath is unreachable in recovery
+				 * and incomplete with zero_damaged_pages, as missing segments
+				 * are not created. Putting blocks into the buffer pool that
+				 * do not exist on disk is rather problematic, as they will
+				 * not be found by scans that rely on smgrnblocks(), since
+				 * they are beyond EOF. It also can cause weird problems with
+				 * relation extension, as relation extension does not expect
+				 * blocks beyond EOF to exist.
+				 *
+				 * Therefore we do not want to copy the logic into
+				 * mdstartreadv(), where it would have to be more complicated
+				 * due to potential differences in the zero_damaged_pages
+				 * setting between the definer and completor of IO.
+				 *
+				 * For PG 18, we are putting an Assert(false) into
+				 * mdreadv() (triggering failures in assertion-enabled builds,
+				 * but continuing to work in production builds). Afterwards we
+				 * plan to remove this code entirely.
 				 */
 				if (zero_damaged_pages || InRecovery)
 				{
+					Assert(false);	/* see comment above */
+
 					for (BlockNumber i = transferred_this_segment / BLCKSZ;
 						 i < nblocks_this_segment;
 						 ++i)
@@ -1007,6 +1035,13 @@ mdstartreadv(PgAioHandle *ioh,
 	/*
 	 * The error checks corresponding to the post-read checks in mdreadv() are
 	 * in md_readv_complete().
+	 *
+	 * However, we chose, at least for now, not to implement the
+	 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv(),
+	 * that logic is rather problematic, and we want to get rid of it. Here
+	 * equivalent logic would have to be more complicated due to potential
+	 * differences in the zero_damaged_pages setting between the definer and
+	 * completor of IO.
 	 */
 }
 
@@ -1115,6 +1150,64 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	}
 }
 
+/*
+ * mdstartwritev() -- Asynchronous version of mdwritev().
+ */
+void
+mdstartwritev(PgAioHandle *ioh,
+			  SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			  const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	off_t		seekpos;
+	MdfdVec    *v;
+	BlockNumber nblocks_this_segment;
+	struct iovec *iov;
+	int			iovcnt;
+	int			ret;
+
+	v = _mdfd_getseg(reln, forknum, blocknum, false,
+					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+	nblocks_this_segment =
+		Min(nblocks,
+			RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+
+	if (nblocks_this_segment != nblocks)
+		elog(ERROR, "write crossing segment boundary");
+
+	iovcnt = pgaio_io_get_iovec(ioh, &iov);
+
+	Assert(nblocks <= iovcnt);
+
+	iovcnt = buffers_to_iovec(iov, unconstify(void **, buffers), nblocks_this_segment);
+
+	Assert(iovcnt <= nblocks_this_segment);
+
+	if (!(io_direct_flags & IO_DIRECT_DATA))
+		pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
+
+	pgaio_io_set_target_smgr(ioh,
+							 reln,
+							 forknum,
+							 blocknum,
+							 nblocks,
+							 skipFsync);
+	pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_WRITEV, 0);
+
+	ret = FileStartWriteV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
+	if (ret != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not start writing blocks %u..%u in file \"%s\": %m",
+						blocknum,
+						blocknum + nblocks_this_segment - 1,
+						FilePathName(v->mdfd_vfd))));
+}
+
 
 /*
  * mdwriteback() -- Tell the kernel to write pages back to storage.
@@ -1503,6 +1596,40 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	}
 }
 
+/*
+ * Like register_dirty_segment(), except for use by AIO. In the completion
+ * callback we don't have access to the MdfdVec (the completion callback might
+ * be executed in a different backend than the issuing backend), therefore we
+ * have to implement this slightly differently.
+ */
+static void
+register_dirty_segment_aio(RelFileLocator locator, ForkNumber forknum, uint64 segno)
+{
+	FileTag		tag;
+
+	INIT_MD_FILETAG(tag, locator, forknum, segno);
+
+	/*
+	 * Can't block here waiting for checkpointer to accept our sync request,
+	 * as checkpointer might be waiting for this AIO to finish if offloaded to
+	 * a worker.
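+	 *
+	 * Hence retryOnError = false below: if the request queue is full we
+	 * fall back to performing the fsync ourselves, via mdsyncfiletag().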
+	 */
+	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
+	{
+		char		path[MAXPGPATH];
+
+		ereport(DEBUG1,
+				(errmsg_internal("could not forward fsync request because request queue is full")));
+
+		/* reuse mdsyncfiletag() to avoid duplicating code */
+		if (mdsyncfiletag(&tag, path))
+			ereport(data_sync_elevel(ERROR),
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							path)));
+	}
+}
+
 /*
  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
  */
@@ -1953,7 +2080,7 @@ md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 		 * might not process the query result immediately (because it is busy
 		 * doing another part of query processing) or at all (e.g. if it was
 		 * cancelled or errored out due to another IO also failing). The
-		 * issuer of the IO will emit an ERROR when processing the IO's
+		 * definer of the IO will emit an ERROR when processing the IO's
 		 * results
 		 */
 		pgaio_result_report(result, td, LOG_SERVER_ONLY);
@@ -2037,3 +2164,103 @@ md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
 						   td->smgr.nblocks * (size_t) BLCKSZ));
 	}
 }
+
+/*
+ * AIO completion callback for mdstartwritev().
+ */
+static PgAioResult
+md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
+{
+	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
+	PgAioResult result = prior_result;
+
+	if (prior_result.result < 0)
+	{
+		result.status = PGAIO_RS_ERROR;
+		result.id = PGAIO_HCB_MD_WRITEV;
+		/* For "hard" errors, track the error number in error_data */
+		result.error_data = -prior_result.result;
+		result.result = 0;
+
+		pgaio_result_report(result, td, LOG);
+
+		return result;
+	}
+
+	/*
+	 * As explained above smgrstartwritev(), the smgr API operates on the
+	 * level of blocks, rather than bytes. Convert.
+	 */
+	result.result /= BLCKSZ;
+
+	Assert(result.result <= td->smgr.nblocks);
+
+	if (result.result == 0)
+	{
+		/* consider 0 blocks written a failure */
+		result.status = PGAIO_RS_ERROR;
+		result.id = PGAIO_HCB_MD_WRITEV;
+		result.error_data = 0;
+
+		pgaio_result_report(result, td, LOG);
+
+		return result;
+	}
+
+	if (result.status != PGAIO_RS_ERROR &&
+		result.result < td->smgr.nblocks)
+	{
+		/* partial writes should be retried at upper level */
+		result.status = PGAIO_RS_PARTIAL;
+		result.id = PGAIO_HCB_MD_WRITEV;
+	}
+
+	if (!td->smgr.skip_fsync)
+		register_dirty_segment_aio(td->smgr.rlocator, td->smgr.forkNum,
+								   td->smgr.blockNum / ((BlockNumber) RELSEG_SIZE));
+
+	return result;
+}
+
+/*
+ * AIO error reporting callback for mdstartwritev().
+ */
+static void
+md_writev_report(PgAioResult result, const PgAioTargetData *td, int elevel)
+{
+	RelPathStr	path;
+
+	path = relpathbackend(td->smgr.rlocator,
+						  td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
+						  td->smgr.forkNum);
+
+	if (result.error_data != 0)
+	{
+		errno = result.error_data;	/* for errcode_for_file_access() */
+
+		ereport(elevel,
+				errcode_for_file_access(),
+				errmsg("could not write blocks %u..%u in file \"%s\": %m",
+					   td->smgr.blockNum,
+					   td->smgr.blockNum + td->smgr.nblocks - 1,
+					   path.str)
+			);
+	}
+	else
+	{
+		/*
+		 * NB: This will typically only be output in debug messages, while
+		 * retrying a partial IO.
+ */ + ereport(elevel, + errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not write blocks %u..%u in file \"%s\": wrote only %zu of %zu bytes", + td->smgr.blockNum, + td->smgr.blockNum + td->smgr.nblocks - 1, + path.str, + result.result * (size_t) BLCKSZ, + td->smgr.nblocks * (size_t) BLCKSZ + ) + ); + } +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 320dc04d83ae..85daf686e08a 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -115,6 +115,11 @@ typedef struct f_smgr BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); + void (*smgr_startwritev) (PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); @@ -142,6 +147,7 @@ static const f_smgr smgrsw[] = { .smgr_readv = mdreadv, .smgr_startreadv = mdstartreadv, .smgr_writev = mdwritev, + .smgr_startwritev = mdstartwritev, .smgr_writeback = mdwriteback, .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, @@ -738,6 +744,14 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * blocks not successfully read might bear unspecified modifications, up to * the full nblocks). This maintains the abstraction that smgr operates on the * level of blocks, rather than bytes. + * + * Compared to smgrreadv(), more responsibilities fall on the caller: + * - Partial reads need to be handle by the caller re-issuing IO for the + * unread blocks + * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are + * responsible for pgaio_result_report() to mirror that news to the user (if + * the IO results in PGAIO_RS_WARNING) or abort the (sub)transaction (if + * PGAIO_RS_ERROR). */ void smgrstartreadv(PgAioHandle *ioh, @@ -787,6 +801,29 @@ smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, RESUME_INTERRUPTS(); } +/* + * smgrstartwritev() -- asynchronous version of smgrwritev() + * + * This starts an asynchronous writev IO using the IO handle `ioh`. Other than + * `ioh` all parameters are the same as smgrwritev(). + * + * Completion callbacks above smgr will be passed the result as the number of + * successfully written blocks if the write [partially] succeeds. This + * maintains the abstraction that smgr operates on the level of blocks, rather + * than bytes. + */ +void +smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + HOLD_INTERRUPTS(); + smgrsw[reln->smgr_which].smgr_startwritev(ioh, + reln, forknum, blocknum, buffers, + nblocks, skipFsync); + RESUME_INTERRUPTS(); +} + /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of * blocks. diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 7958ea11b735..222e24bcb08b 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -785,8 +785,12 @@ InitPostgres(const char *in_dbname, Oid dboid, * We don't yet have an aux-process resource owner, but StartupXLOG * and ShutdownXLOG will need one. Hence, create said resource owner * (and register a callback to clean it up after ShutdownXLOG runs). + * + * In bootstrap mode CreateAuxProcessResourceOwner() was already + * called in BootstrapModeMain(). 
 	 */
-	CreateAuxProcessResourceOwner();
+	if (!bootstrap)
+		CreateAuxProcessResourceOwner();
 
 	StartupXLOG();
 
 	/* Release (and warn about) any buffer pins leaked in StartupXLOG */
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 4eaeca89f2c7..bd49b302293c 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3293,6 +3293,19 @@ struct config_int ConfigureNamesInt[] =
 		check_io_max_concurrency, NULL, NULL
 	},
 
+	{
+		{"io_bounce_buffers",
+			PGC_POSTMASTER,
+			RESOURCES_IO,
+			gettext_noop("Number of IO bounce buffers reserved for each backend."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&io_bounce_buffers,
+		-1, -1, 4096,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"io_workers",
 			PGC_SIGHUP,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ff56a1f0732c..2c6456e907f5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -211,6 +211,8 @@
 					# -1 sets based on shared_buffers
 					# (change requires restart)
 #io_workers = 3				# 1-32;
+#io_bounce_buffers = -1			# -1 sets based on shared_buffers
+					# (change requires restart)
 
 # - Worker Processes -
 
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index d39f3e1b655c..81e7e27965a6 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -159,10 +159,11 @@ struct ResourceOwnerData
 	LOCALLOCK  *locks[MAX_RESOWNER_LOCKS];	/* list of owned locks */
 
 	/*
-	 * AIO handles need be registered in critical sections and therefore
-	 * cannot use the normal ResourceElem mechanism.
+	 * AIO handles & bounce buffers need to be registered in critical
+	 * sections and therefore cannot use the normal ResourceElem mechanism.
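+	 *
+	 * (dlist manipulation does not allocate memory, which is what makes
+	 * these lists usable in critical sections.)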
 	 */
 	dlist_head	aio_handles;
+	dlist_head	aio_bounce_buffers;
 };
 
@@ -434,6 +435,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 	}
 
 	dlist_init(&owner->aio_handles);
+	dlist_init(&owner->aio_bounce_buffers);
 
 	return owner;
 }
@@ -742,6 +744,13 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
 
 			pgaio_io_release_resowner(node, !isCommit);
 		}
+
+		while (!dlist_is_empty(&owner->aio_bounce_buffers))
+		{
+			dlist_node *node = dlist_head_node(&owner->aio_bounce_buffers);
+
+			pgaio_bounce_buffer_release_resowner(node, !isCommit);
+		}
 	}
 	else if (phase == RESOURCE_RELEASE_LOCKS)
 	{
@@ -1111,3 +1120,15 @@ ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
 {
 	dlist_delete_from(&owner->aio_handles, ioh_node);
 }
+
+void
+ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node)
+{
+	dlist_push_tail(&owner->aio_bounce_buffers, bb_node);
+}
+
+void
+ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node)
+{
+	dlist_delete_from(&owner->aio_bounce_buffers, bb_node);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 8b68b16d79da..d9c41fa426b3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12493,4 +12493,13 @@
   proargtypes => 'int4',
   prosrc => 'gist_stratnum_common' },
 
+# AIO related functions
+{ oid => '9200', descr => 'information about in-progress asynchronous IOs',
+  proname => 'pg_get_aios', prorows => '100', proretset => 't',
+  provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int4,int4,int8,text,text,int8,int8,text,int2,int4,text,text,bool,bool,bool}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}',
+  prosrc => 'pg_get_aios' },
+
 ]
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 800ecbfd13b3..a8081d411b68 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -31,7 +31,8 @@ pg_noreturn extern void BackgroundWriterMain(const void *startup_data, size_t st
 pg_noreturn extern void CheckpointerMain(const void *startup_data, size_t startup_data_len);
 
 extern void RequestCheckpoint(int flags);
-extern void CheckpointWriteDelay(int flags, double progress);
+struct IOQueue;
+extern void CheckpointWriteDelay(struct IOQueue *ioq, int flags, double progress);
 
 extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
 
diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h
index 9fe9d9ad9fa4..72d5680e767e 100644
--- a/src/include/storage/aio.h
+++ b/src/include/storage/aio.h
@@ -194,13 +194,16 @@ typedef enum PgAioHandleCallbackID
 	PGAIO_HCB_INVALID = 0,
 
 	PGAIO_HCB_MD_READV,
+	PGAIO_HCB_MD_WRITEV,
 
 	PGAIO_HCB_SHARED_BUFFER_READV,
+	PGAIO_HCB_SHARED_BUFFER_WRITEV,
 
 	PGAIO_HCB_LOCAL_BUFFER_READV,
+	PGAIO_HCB_LOCAL_BUFFER_WRITEV,
 } PgAioHandleCallbackID;
 
-#define PGAIO_HCB_MAX	PGAIO_HCB_LOCAL_BUFFER_READV
+#define PGAIO_HCB_MAX	PGAIO_HCB_LOCAL_BUFFER_WRITEV
 StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS),
 				 "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS");
 
@@ -352,6 +355,20 @@
 extern bool pgaio_have_staged(void);
 
 
+/* --------------------------------------------------------------------------------
+ * Bounce Buffers
+ * --------------------------------------------------------------------------------
+ */
+
+extern PgAioBounceBuffer *pgaio_bounce_buffer_get(void);
+extern void pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb);
+extern uint32 pgaio_bounce_buffer_id(PgAioBounceBuffer *bb);
+extern void pgaio_bounce_buffer_release(PgAioBounceBuffer *bb);
+extern char *pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb);
+extern void pgaio_bounce_buffer_release_resowner(struct dlist_node *bb_node, bool on_error);
+
+
 /* --------------------------------------------------------------------------------
  * Other
  * --------------------------------------------------------------------------------
@@ -364,6 +381,7 @@ extern void pgaio_closing_fd(int fd);
 /* GUCs */
 extern PGDLLIMPORT int io_method;
 extern PGDLLIMPORT int io_max_concurrency;
+extern PGDLLIMPORT int io_bounce_buffers;
 
 #endif							/* AIO_H */
diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h
index 7f18da2c8565..833f97361a1f 100644
--- a/src/include/storage/aio_internal.h
+++ b/src/include/storage/aio_internal.h
@@ -127,6 +127,12 @@ struct PgAioHandle
 	/* raw result of the IO operation */
 	int32		result;
 
+	/*
+	 * List of bounce_buffers owned by IO. It would suffice to use an
+	 * index-based linked list here.
+	 */
+	slist_head	bounce_buffers;
+
 	/**
 	 * In which list the handle is registered, depends on the state:
 	 * - IDLE, in per-backend list
@@ -182,11 +188,24 @@
 };
 
 
+/* typedef is in aio_types.h */
+struct PgAioBounceBuffer
+{
+	slist_node	node;
+	struct ResourceOwnerData *resowner;
+	dlist_node	resowner_node;
+	char	   *buffer;
+};
+
+
 typedef struct PgAioBackend
 {
 	/* index into PgAioCtl->io_handles */
 	uint32		io_handle_off;
 
+	/* index into PgAioCtl->bounce_buffers */
+	uint32		bounce_buffers_off;
+
 	/* IO Handles that currently are not used */
 	dclist_head idle_ios;
 
@@ -217,6 +236,12 @@
 	 * IOs being appended at the end.
 	 */
 	dclist_head in_flight_ios;
+
+	/* Bounce Buffers that currently are not used */
+	slist_head	idle_bbs;
+
+	/* see handed_out_io */
+	PgAioBounceBuffer *handed_out_bb;
 } PgAioBackend;
 
 
@@ -244,6 +269,15 @@ typedef struct PgAioCtl
 	uint32		io_handle_count;
 	PgAioHandle *io_handles;
+
+	/*
+	 * To perform AIO on pages that cannot be used directly from shared
+	 * memory (either because the buffer does not reside in shared memory or
+	 * because we need to operate on a copy, as is e.g.
the case for writes when checksums are in use) + */ + uint32 bounce_buffers_count; + PgAioBounceBuffer *bounce_buffers; + char *bounce_buffers_data; } PgAioCtl; diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h index 181833660778..3c18dade49cb 100644 --- a/src/include/storage/aio_types.h +++ b/src/include/storage/aio_types.h @@ -134,4 +134,6 @@ typedef struct PgAioReturn } PgAioReturn; +typedef struct PgAioBounceBuffer PgAioBounceBuffer; + #endif /* AIO_TYPES_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 72b36a4af26f..45c2b70b7360 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -21,6 +21,8 @@ #include "storage/buf.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" +#include "storage/io_queue.h" +#include "storage/latch.h" #include "storage/lwlock.h" #include "storage/procnumber.h" #include "storage/shmem.h" @@ -434,6 +436,12 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag); +/* solely to make it easier to write tests */ +extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); +extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, + bool forget_owner, bool release_aio); + + /* freelist.c */ extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, @@ -478,6 +486,7 @@ extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio); extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait); extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln); +extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); extern void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f2192ceb2719..89e0ca11288b 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -174,7 +174,9 @@ extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; extern const PgAioHandleCallbacks aio_shared_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_shared_buffer_writev_cb; extern const PgAioHandleCallbacks aio_local_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_local_buffer_writev_cb; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -295,7 +297,8 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); -extern bool BgBufferSync(struct WritebackContext *wb_context); +struct IOQueue; +extern bool BgBufferSync(struct IOQueue *ioq, struct WritebackContext *wb_context); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index aeb67c498c59..000a3ab23f90 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -507,5 +507,6 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize); extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern bool PageNeedsChecksumCopy(Page 
page); #endif /* BUFPAGE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index b77d8e5e30e3..2cc7c5a47617 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -112,6 +112,7 @@ extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event extern ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileStartReadV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); +extern int FileStartWriteV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info); extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); diff --git a/src/include/storage/io_queue.h b/src/include/storage/io_queue.h new file mode 100644 index 000000000000..92b1e9afe6f5 --- /dev/null +++ b/src/include/storage/io_queue.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * io_queue.h + * Mechanism for tracking many IOs + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/io_queue.h + * + *------------------------------------------------------------------------- + */ +#ifndef IO_QUEUE_H +#define IO_QUEUE_H + +#include "storage/aio_types.h" + +struct IOQueue; +typedef struct IOQueue IOQueue; + +struct PgAioWaitRef; + +extern IOQueue *io_queue_create(int depth, int flags); +extern void io_queue_track(IOQueue *ioq, const PgAioWaitRef *iow); +extern void io_queue_wait_one(IOQueue *ioq); +extern void io_queue_wait_all(IOQueue *ioq); +extern bool io_queue_is_empty(IOQueue *ioq); +extern void io_queue_reserve(IOQueue *ioq); +extern PgAioHandle *io_queue_acquire_io(IOQueue *ioq); +extern void io_queue_free(IOQueue *ioq); + +#endif /* IO_QUEUE_H */ diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 9d7131eff438..47ae6c36c94f 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -21,6 +21,7 @@ #include "storage/sync.h" extern const PgAioHandleCallbacks aio_md_readv_cb; +extern const PgAioHandleCallbacks aio_md_writev_cb; /* md storage manager functionality */ extern void mdinit(void); @@ -45,6 +46,10 @@ extern void mdstartreadv(PgAioHandle *ioh, extern void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void mdstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 856ebcda350e..f00b3763ac99 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -108,6 +108,11 @@ extern void smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber 
blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index aede4bfc820a..7e2ec2241693 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -168,5 +168,7 @@ extern void ResourceOwnerForgetLock(ResourceOwner owner, struct LOCALLOCK *local struct dlist_node; extern void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node); extern void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node); +extern void ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node); +extern void ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node); #endif /* RESOWNER_H */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 4e4be3fa511a..aa1d27bbed31 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -14,6 +14,7 @@ SUBDIRS = \ oauth_validator \ plsample \ spgist_name_ops \ + test_aio \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 2b0574514731..9de0057bd1d4 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -13,6 +13,7 @@ subdir('oauth_validator') subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') +subdir('test_aio') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_aio/.gitignore b/src/test/modules/test_aio/.gitignore new file mode 100644 index 000000000000..716e17f5a2ad --- /dev/null +++ b/src/test/modules/test_aio/.gitignore @@ -0,0 +1,2 @@ +# Generated subdirectories +/tmp_check/ diff --git a/src/test/modules/test_aio/Makefile b/src/test/modules/test_aio/Makefile new file mode 100644 index 000000000000..f53cc64671a8 --- /dev/null +++ b/src/test/modules/test_aio/Makefile @@ -0,0 +1,26 @@ +# src/test/modules/test_aio/Makefile + +PGFILEDESC = "test_aio - test code for AIO" + +MODULE_big = test_aio +OBJS = \ + $(WIN32RES) \ + test_aio.o + +EXTENSION = test_aio +DATA = test_aio--1.0.sql + +TAP_TESTS = 1 + +export enable_injection_points + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_aio +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_aio/meson.build b/src/test/modules/test_aio/meson.build new file mode 100644 index 000000000000..73d2fd68eaa1 --- /dev/null +++ b/src/test/modules/test_aio/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2024-2025, PostgreSQL Global Development Group + +test_aio_sources = files( + 'test_aio.c', +) + +if host_system == 'windows' + test_aio_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_aio', + '--FILEDESC', 'test_aio - test code for AIO',]) +endif + +test_aio = shared_module('test_aio', + test_aio_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_aio + +test_install_data += files( + 'test_aio.control', + 'test_aio--1.0.sql', +) + +tests += { + 'name': 'test_aio', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_aio.pl', + 't/002_io_workers.pl', + ], + }, +} diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl new file mode 100644 index 000000000000..58a56c8cec24 --- /dev/null +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -0,0 +1,1504 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + + +### +# Test io_method=worker +### +my $node_worker = create_node('worker'); +$node_worker->start(); + +test_generic('worker', $node_worker); +SKIP: +{ + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject_worker('worker', $node_worker); +} + +$node_worker->stop(); + + +### +# Test io_method=io_uring +### + +if (have_io_uring()) +{ + my $node_uring = create_node('io_uring'); + $node_uring->start(); + test_generic('io_uring', $node_uring); + $node_uring->stop(); +} + + +### +# Test io_method=sync +### + +my $node_sync = create_node('sync'); + +# just to have one test not use the default auto-tuning + +$node_sync->append_conf( + 'postgresql.conf', qq( +io_max_concurrency=4 +)); + +$node_sync->start(); +test_generic('sync', $node_sync); +$node_sync->stop(); + +done_testing(); + + +### +# Test Helpers +### + +sub create_node +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $io_method = shift; + + my $node = PostgreSQL::Test::Cluster->new($io_method); + + # Want to test initdb for each IO method; otherwise we could just reuse + # the cluster. + # + # Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the + # options specified by ->extra; if somebody puts -c io_method=xyz in + # PG_TEST_INITDB_EXTRA_OPTS, it would break this test. Fix that up if we + # detect it.
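+	#
+	# For example (hypothetical value): with
+	# PG_TEST_INITDB_EXTRA_OPTS='-c io_method=sync' in the environment, every
+	# node created here would otherwise silently run with io_method=sync
+	# instead of the method under test. Re-appending our own -c io_method
+	# relies on later duplicate settings taking precedence.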
+ local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS}; + if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS} + && $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/) + { + $ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method"; + } + + $node->init(extra => [ '-c', "io_method=$io_method" ]); + + $node->append_conf( + 'postgresql.conf', qq( +shared_preload_libraries=test_aio +log_min_messages = 'DEBUG3' +log_statement=all +log_error_verbosity=default +restart_after_crash=false +temp_buffers=100 +)); + + ok(1, "$io_method: initdb"); + + return $node; +} + +sub have_io_uring +{ + # To detect if io_uring is supported, we look at the error message for + # assigning an invalid value to an enum GUC, which lists all the valid + # options. We need to use -C to deal with running as administrator on + # Windows; the superuser check is omitted if -C is used. + my ($stdout, $stderr) = + run_command [qw(postgres -C invalid -c io_method=invalid)]; + die "can't determine supported io_method values" + unless $stderr =~ m/Available values: ([^\.]+)\./; + my $methods = $1; + note "supported io_method values are: $methods"; + + return ($methods =~ m/io_uring/) ? 1 : 0; +} + +sub psql_like +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $expected_stdout = shift; + my $expected_stderr = shift; + my ($cmdret, $output); + + ($output, $cmdret) = $psql->query($sql); + + like($output, $expected_stdout, "$io_method: $name: expected stdout"); + like($psql->{stderr}, $expected_stderr, + "$io_method: $name: expected stderr"); + $psql->{stderr} = ''; + + return $output; +} + +sub query_wait_block +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $node = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $waitfor = shift; + + my $pid = $psql->query_safe('SELECT pg_backend_pid()'); + + $psql->{stdin} .= qq($sql;\n); + $psql->{run}->pump_nb(); + ok(1, "$io_method: $name: issued sql"); + + $node->poll_query_until('postgres', + qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid), + $waitfor); + ok(1, "$io_method: $name: observed $waitfor wait event"); +} + +# Returns count of checksum failures for the specified database or for shared +# relations, if $datname is undefined.
+sub checksum_failures +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $psql = shift; + my $datname = shift; + my $checksum_count; + my $checksum_last_failure; + + if (defined $datname) + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname'; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname'; +)); + } + else + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL; +)); + } + + return $checksum_count, $checksum_last_failure; +} + +### +# Sub-tests +### + +# Sanity checks for the IO handle API +sub test_handle +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning: implicit xact + psql_like( + $io_method, + $psql, + "handle_get() leak in implicit xact", + qq(SELECT handle_get()), + qr/^$/, + qr/leaked AIO handle/, + "$io_method: leaky handle_get() warns"); + + # leak warning: explicit xact + psql_like( + $io_method, $psql, + "handle_get() leak in explicit xact", + qq(BEGIN; SELECT handle_get(); COMMIT), + qr/^$/, qr/leaked AIO handle/); + + + # leak warning: explicit xact, rollback + psql_like( + $io_method, + $psql, + "handle_get() leak in explicit xact, rollback", + qq(BEGIN; SELECT handle_get(); ROLLBACK;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning: subtrans + psql_like( + $io_method, + $psql, + "handle_get() leak in subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning + error: released in different command (thus resowner) + psql_like( + $io_method, + $psql, + "handle_release() in different command", + qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/leaked AIO handle.*release in unexpected state/ms); + + # no leak, release in same command + psql_like( + $io_method, + $psql, + "handle_release() in same command", + qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/^$/); + + # normal handle use + psql_like($io_method, $psql, "handle_get_release()", + qq(SELECT handle_get_release()), + qr/^$/, qr/^$/); + + # should error out, API violation + psql_like( + $io_method, + $psql, + "handle_get_twice()", + qq(SELECT handle_get_twice()), + qr/^$/, + qr/ERROR: API violation: Only one IO can be handed out$/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in implicit xact", + qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in explicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in explicit xact", + qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in subtrans + psql_like( + $io_method, + $psql, + "handle error recovery in explicit subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + $psql->quit(); +} + +# Sanity checks for the batchmode API +sub test_batchmode +{ + my $io_method = shift; + my $node
= shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning & recovery: implicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in implicit xact", + qq(SELECT batch_start()), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + # leak warning & recovery: explicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in explicit xact", + qq(BEGIN; SELECT batch_start(); COMMIT;), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + + # leak warning & recovery: explicit xact, rollback + # + # XXX: This doesn't fail right now, due to not getting a chance to do + # something at transaction command commit. That's not a correctness + # issue; it just means it's a bit harder to find buggy code. + #psql_like($io_method, $psql, + # "batch_start() leak & cleanup after abort", + # qq(BEGIN; SELECT batch_start(); ROLLBACK;), + # qr/^$/, + # qr/open AIO batch at end/, "$io_method: leaky batch_start() warns"); + + # no warning, batch closed in same command + psql_like( + $io_method, + $psql, + "batch_start(), batch_end() works", + qq(SELECT batch_start() UNION ALL SELECT batch_end()), + qr/^$/, + qr/^$/, + "$io_method: batch_start(), batch_end()"); + + $psql->quit(); +} + +# Test that simple cases of invalid pages are reported +sub test_io_error +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $psql->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_corr(data int not null); +INSERT INTO tmp_corr SELECT generate_series(1, 10000); +SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true); +)); + + foreach my $tblname (qw(tbl_corr tmp_corr)) + { + my $invalid_page_re = + $tblname eq 'tbl_corr' + ?
qr/invalid page in block 1 of relation base\/\d+\/\d+/ + : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/; + + # verify the error is reported in custom C code + psql_like( + $io_method, + $psql, + "read_rel_block_ll() of $tblname page", + qq(SELECT read_rel_block_ll('$tblname', 1)), + qr/^$/, + $invalid_page_re); + + # verify the error is reported for bufmgr reads, seq scan + psql_like( + $io_method, $psql, + "sequential scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname), + qr/^$/, $invalid_page_re); + + # verify the error is reported for bufmgr reads, tid scan + psql_like( + $io_method, + $psql, + "tid scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'), + qr/^$/, + $invalid_page_re); + } + + $psql->quit(); +} + +# Test interplay between StartBufferIO and TerminateBufferIO +sub test_startwait_io +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + + ### Verify behavior for normal tables + + # create a buffer we can play around with + my $buf_id = psql_like( + $io_method, $psql_a, + "creation of toy buffer succeeds", + qq(SELECT buffer_create_toy('tbl_ok', 1)), + qr/^\d+$/, qr/^$/); + + # check that one backend can perform StartBufferIO + psql_like( + $io_method, + $psql_a, + "first StartBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # but not twice on the same buffer (non-waiting) + psql_like( + $io_method, + $psql_a, + "second StartBufferIO fails, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + psql_like( + $io_method, + $psql_b, + "second StartBufferIO fails, other session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO without marking it as a success; this should allow the + # waiting session to start the IO + psql_like( + $io_method, + $psql_a, + "blocking start buffer io, terminating io, not valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + + # Because the IO was terminated, but not marked as valid, second session should get the right to start io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/); + ok(1, "$io_method: blocking start buffer io, can start io"); + + # terminate the IO again + $psql_b->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + + + # same as the above scenario, but mark IO as having succeeded + psql_like( + $io_method, + $psql_a, + "blocking buffer io w/ success: first start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, marking it as success + psql_like( + $io_method, + $psql_a, +
"blocking start buffer io, terminating io, valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + # Because the IO was terminated, and marked as valid, second session should complete but not need io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/); + ok(1, "$io_method: blocking start buffer io, no need to start io"); + + # buffer is valid now, make it invalid again + $psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);)); + + + ### Verify behavior for temporary tables + + # Can't unfortunately share the code with the normal table case, there are + # too many behavioral differences. + + # create a buffer we can play around with + $psql_a->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_ok(data int not null); +INSERT INTO tmp_ok SELECT generate_series(1, 10000); +)); + $buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);)); + + # check that one backend can perform StartLocalBufferIO + psql_like( + $io_method, + $psql_a, + "first StartLocalBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer + # succeeds as well. This test mostly serves as a documentation of that + # fact. If we had actually started IO, it'd be different. + psql_like( + $io_method, + $psql_a, + "second StartLocalBufferIO succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, without marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after not marking valid succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);) + ); + + # Now another StartLocalBufferIO should fail, this time because the buffer + # is already valid. + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after marking valid fails", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^f$/, + qr/^$/); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that if the backend issuing a read doesn't wait for the IO's +# completion, another backend can complete the IO +sub test_complete_foreign +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + # Issue IO without waiting for completion, then sleep + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql_b, + "completing read started by sleeping backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Issue IO without waiting for completion, then exit. + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + $psql_a->reconnect_and_clear(); + + # Check that another backend can read the relevant block. 
This verifies + # that the exiting backend left the AIO in a sane state. + psql_like( + $io_method, + $psql_b, + "read buffer started by exited backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Read a tbl_corr block, then sleep. The other session will retry the IO + # and also fail. The easiest way to verify that seems to be to check + # that both messages are in the log. + my $log_location = -s $node->logfile; + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);)); + + psql_like( + $io_method, + $psql_b, + "completing read of tbl_corr block started by other backend", + qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1), + qr/^$/, + qr/invalid page in block/); + + # The message issued for the read_rel_block_ll() should be logged at LOG level + $node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: LOG message for background read" + ); + + # But for the SELECT, it should be an ERROR + $log_location = + $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read" + ); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we deal correctly with FDs being closed while IO is in progress +sub test_close_fd +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + psql_like( + $io_method, + $psql, + "close all FDs after read, waiting for results", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>true, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>false, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting, query works", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + $psql->quit(); +} + +# Tests using injection points. Mostly to exercise hard IO errors that are +# difficult to trigger without injection points.
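+#
+# Note on inj_io_short_read_attach(): judging by its uses below, its argument
+# is interpreted as an injected I/O result, i.e. a positive value is taken as
+# the number of bytes "read" and a negative value as a negated errno (see
+# errno_from_string()). This summary is inferred from the test code itself,
+# not from separate documentation.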
+sub test_inject +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # injected result matching a full-block read, so no failure expected + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, $psql, + "injection point not triggering failure", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^1$/, qr/^$/); + + # injected a read shorter than a single block, expecting error + $psql->query_safe(qq(SELECT inj_io_short_read_attach(17);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, + $psql, + "single block short read fails", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/ + ); + + # shorten multi-block read to a single block, should retry + my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0); +SELECT invalidate_rel_block('tbl_ok', 1); +SELECT invalidate_rel_block('tbl_ok', 2); +SELECT invalidate_rel_block('tbl_ok', 3); +/* gap */ +SELECT invalidate_rel_block('tbl_ok', 5); +SELECT invalidate_rel_block('tbl_ok', 6); +SELECT invalidate_rel_block('tbl_ok', 7); +SELECT invalidate_rel_block('tbl_ok', 8);); + + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + psql_like( + $io_method, $psql, + "multi block short read (1 block) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # shorten multi-block read to two blocks, should retry + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);)); + + psql_like( + $io_method, $psql, + "multi block short read (2 blocks) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # verify that page verification errors are detected even as part of a + # shortened multi-block read (tbl_corr, block 1 is corrupted) + $psql->query_safe( + qq( +SELECT invalidate_rel_block('tbl_corr', 0); +SELECT invalidate_rel_block('tbl_corr', 1); +SELECT invalidate_rel_block('tbl_corr', 2); +SELECT inj_io_short_read_attach(8192); + )); + + psql_like( + $io_method, + $psql, + "shortened multi-block read detects invalid page", + qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'), + qr/^$/, + qr/ERROR:.*invalid page in block 1 of relation base\/.*/); + + # trigger a hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EIO')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + + psql_like( + $io_method, + $psql, + "first hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + psql_like( + $io_method, + $psql, + "second hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + # now the IO should be ok.
+ psql_like( + $io_method, $psql, + "recovers after hard error", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # trigger a different hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EROFS')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + psql_like( + $io_method, + $psql, + "different hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Read-only file system/ + ); + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + $psql->quit(); +} + +# Tests using injection points, only for io_method=worker. +# +# io_method=worker has the special case of needing to reopen files. That can +# in theory fail, because the file could be gone. That's a hard path to test +# for real, so we use an injection point to trigger it. +sub test_inject_worker +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # trigger a failure to reopen, should error out, but should recover + $psql->query_safe( + qq( +SELECT inj_io_reopen_attach(); +SELECT invalidate_rel_block('tbl_ok', 1); + )); + + psql_like( + $io_method, + $psql, + "failure to open: detected", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/ + ); + + $psql->query_safe(qq(SELECT inj_io_reopen_detach();)); + + # check that we indeed recover + psql_like( + $io_method, $psql, + "failure to open: recovers", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + $psql->quit(); +} + +# Verify that we handle a relation getting removed (due to a rollback or a +# DROP TABLE) while IO is ongoing for that table. +sub test_invalidate +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal unlogged temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + my $tblname = $persistency . '_transactional'; + + my $create_sql = qq( +CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200); +); + + # Verify that outstanding read IO does not cause problems with + # AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. + $psql->query_safe("BEGIN; $create_sql;"); + $psql->query_safe( + qq( +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + psql_like( + $io_method, + $psql, + "rollback of newly created $persistency table with outstanding IO", + qq(ROLLBACK), + qr/^$/, + qr/^$/); + + # Verify that outstanding read IO does not cause problems with + # CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. 
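+		# (The read below is started with wait_complete=>false, so the IO
+		# can still be in flight when the relation's buffers are
+		# invalidated; that is exactly the window this test is after.)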
+ $psql->query_safe("BEGIN; $create_sql; COMMIT;"); + $psql->query_safe( + qq( +BEGIN; +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + + psql_like( + $io_method, $psql, + "drop $persistency table with outstanding IO", + qq(DROP TABLE $tblname), + qr/^$/, qr/^$/); + + psql_like($io_method, $psql, + "commit after drop $persistency table with outstanding IO", + qq(COMMIT), qr/^$/, qr/^$/); + } + + $psql->quit(); +} + +# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages +sub test_zero +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + + $psql_a->query_safe( + qq( +CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_zero SELECT generate_series(1, 10000); +)); + + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true); +)); + + # Check that page validity errors are detected + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 0 of relation base\/.*\/.*$/ + ); + + # Check that page validity errors are zeroed + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/ + ); + + # And that once the corruption is fixed, we can read again + $psql_a->query( + qq( +SELECT modify_rel_block('tbl_zero', 0, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: test re-read of block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^$/); + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 3", + qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/ + ); + + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true); +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + # First test error + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 2,3 in larger read", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via ZERO_ON_ERROR flag + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first 
zeroed page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via zero_damaged_pages + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false); +), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/ + ); + + $psql_a->query_safe(qq(COMMIT)); + + + # Verify that bufmgr.c IO detects page validity errors + $psql_a->query( + qq( +SELECT invalidate_rel_block('tbl_zero', g.i) +FROM generate_series(0, 15) g(i); +SELECT modify_rel_block('tbl_zero', 3, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: verify reading zero_damaged_pages=off", + qq( +SELECT count(*) FROM tbl_zero), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Verify that bufmgr.c IO zeroes out pages with page validity errors + psql_like( + $io_method, + $psql_a, + "$persistency: verify zero_damaged_pages=on", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT count(*) FROM tbl_zero; +COMMIT; +), + qr/^\d+$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Check that warnings/errors about page validity in an IO started by + # session A that session B might complete aren't logged visibly to + # session B. + # + # This will only ever trigger for io_methods like io_uring, which can + # complete IOs in a client backend. But it doesn't seem worth + # restricting to that. + # + # This requires cross-session access to the same relation, hence the + # restriction to non-temporary tables. + if ($sql_persistency ne 'temporary') + { + # Create a corruption and then read the block without waiting for + # completion. + $psql_a->query(qq( +SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true); +SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true) +)); + + psql_like( + $io_method, + $psql_b, + "$persistency: test completing read by other session doesn't generate warning", + qq(SELECT count(*) > 0 FROM tbl_zero;), + qr/^t$/, qr/^$/); + } + + # Clean up + $psql_a->query_safe( + qq( +DROP TABLE tbl_zero; +)); + } + + $psql_a->{stderr} = ''; + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we detect checksum failures and report them +sub test_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + + $psql_a->query_safe( + qq( +CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_normal SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true); + +CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_temp SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true); +)); + + # To be able to test checksum failures on shared rels we need a shared rel + # with invalid pages - which is a bit scary. pg_shseclabel seems like a + # good bet, as it's not accessed in a default configuration.
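+	# (Checksum failures of shared relations are reported in the
+	# pg_stat_database row whose datname IS NULL, which is why the
+	# checksum_failures() helper is called without a database name for the
+	# pg_shseclabel checks below.)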
$psql_a->query_safe( + qq( +SELECT grow_rel('pg_shseclabel', 4); +SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true); +SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true); +)); + + + # Check that page validity errors are detected, checksum stats increase, normal rel + my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "normal rel: test reading of invalid block 3", + qq( +SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation base\/\d+\/\d+$/ + ); + + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: normal rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: normal rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksum stats increase, temp rel + ($cs_count_before, $cs_ts_before) = checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "temp rel: test reading of invalid block 4, valid block 5", + qq( +SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: temp rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: temp rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksum stats increase, shared rel + ($cs_count_before, $cs_ts_before) = checksum_failures($psql_a); + psql_like( + $io_method, + $psql_a, + "shared rel: reading of invalid blocks 2+3", + qq( +SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: shared rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: shared rel: checksum timestamp is not null"); + + + # and restore sanity + $psql_a->query( + qq( +SELECT modify_rel_block('pg_shseclabel', 2, zero=>true); +SELECT modify_rel_block('pg_shseclabel', 3, zero=>true); +DROP TABLE tbl_normal; +)); + $psql_a->{stderr} = ''; + + $psql_a->quit(); +} + +# Verify checksum handling when creating database from an invalid database. +# This also serves as a minimal check that cross-database IO is handled +# reasonably. +sub test_checksum_createdb +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $node->safe_psql('postgres', + 'CREATE DATABASE regression_createdb_source'); + + $node->safe_psql( + 'regression_createdb_source', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000); +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +)); + + my $createdb_sql = qq( +CREATE DATABASE regression_createdb_target +TEMPLATE regression_createdb_source +STRATEGY wal_log; +); + + # Verify that CREATE DATABASE of an invalid database fails and is + # accounted for accurately.
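+	# (STRATEGY wal_log copies the source database block by block through
+	# shared buffers, so it is the copy's read of the corrupted block that
+	# is expected to fail here.)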
my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql, 'regression_createdb_source'); + psql_like( + $io_method, + $psql, + "create database w/ wal strategy, invalid source", + $createdb_sql, + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 1 of relation base\/\d+\/\d+$/ + ); + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql, 'regression_createdb_source'); + cmp_ok($cs_count_before + 1, '<=', $cs_count_after, + "$io_method: create database w/ wal strategy, invalid source: checksum count increased" + ); + + # Verify that CREATE DATABASE of the fixed database succeeds. + $node->safe_psql( + 'regression_createdb_source', qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +)); + psql_like($io_method, $psql, + "create database w/ wal strategy, valid source", + $createdb_sql, qr/^$/, qr/^$/); + + $psql->quit(); +} + +# Test the interaction of ignore_checksum_failure with checksum failures +# +# In several places we make sure that the server log actually contains +# individual information for each block involved in the IO. +sub test_ignore_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # Test setup + $psql->query_safe( + qq( +CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000); +)); + + my $count_sql = "SELECT count(*) FROM tbl_cs_fail"; + my $invalidate_sql = qq( +SELECT invalidate_rel_block('tbl_cs_fail', g.i) +FROM generate_series(0, 6) g(i); +); + + my $expect = $psql->query_safe($count_sql); + + + # Very basic tests for ignore_checksum_failure=off / on + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true); +)); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off fails", + $count_sql, qr/^$/, qr/ERROR: invalid page in block/); + + $psql->query_safe("SET ignore_checksum_failure=on"); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=on succeeds", + $count_sql, + qr/^$expect$/, + qr/WARNING: ignoring (checksum failure|\d checksum failures)/); + + + # Verify that ignore_checksum_failure=on works in multi-block reads + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +)); + + my $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of checksum failed block 3, with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: ignoring checksum failure in block 3/ + ); + + # Check that the log contains a LOG message about the failure + $log_location = + $node->wait_for_log(qr/LOG: ignoring checksum failure/, $log_location); + + # check that we error out + psql_like( + $io_method, + $psql, + "test reading of valid block 2, checksum failed 3, invalid 4, zero=false with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/\d+$/ + ); + + # Test multi-block read with different problems in
different blocks + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of valid block 1, checksum failed 2, 3, invalid 3-5, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..5 of relation/ + ); + + + # Unfortunately each of the following checks has to scan the whole log + # starting at the $log_location determined above, rather than advancing + # it per test, as wait_for_log() returns the size of the file. + + $node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 2"); + + $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 3"); + + $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 4"); + + $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 5"); + + + # Reading a page with both an invalid header and an invalid checksum + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + psql_like( + $io_method, + $psql, + "test reading of block with both invalid header and invalid checksum, zero=false", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/ + ); + + psql_like( + $io_method, + $psql, + "test reading of block 3 with both invalid header and invalid checksum, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*; zeroing out page/ + ); + + + $psql->quit(); +} + + +# Run all tests that are supported for all io_methods +sub test_generic +{ + my $io_method = shift; + my $node = shift; + + is($node->safe_psql('postgres', 'SHOW io_method'), + $io_method, "$io_method: io_method set correctly"); + + $node->safe_psql( + 'postgres', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false); +CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false); + +INSERT INTO tbl_corr SELECT generate_series(1, 10000); +INSERT INTO tbl_ok SELECT generate_series(1, 10000); +SELECT grow_rel('tbl_corr', 16); +SELECT grow_rel('tbl_ok', 16); + +SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true); +CHECKPOINT; +)); + + test_handle($io_method, $node); + test_io_error($io_method, $node); + test_batchmode($io_method, $node); + test_startwait_io($io_method, $node); + test_complete_foreign($io_method, $node); + test_close_fd($io_method, $node); + test_invalidate($io_method, $node); + test_zero($io_method, $node); +
test_checksum($io_method, $node); + test_ignore_checksum($io_method, $node); + test_checksum_createdb($io_method, $node); + + SKIP: + { + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject($io_method, $node); + } +} diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl new file mode 100644 index 000000000000..af5fae15ea78 --- /dev/null +++ b/src/test/modules/test_aio/t/002_io_workers.pl @@ -0,0 +1,125 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use List::Util qw(shuffle); + + +my $node = PostgreSQL::Test::Cluster->new('worker'); +$node->init(); +$node->append_conf( + 'postgresql.conf', qq( +io_method=worker +)); + +$node->start(); + +# Test changing the number of I/O worker processes while also evaluating the +# handling of their termination. +test_number_of_io_workers_dynamic($node); + +$node->stop(); + +done_testing(); + + +sub test_number_of_io_workers_dynamic +{ + my $node = shift; + + my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers'); + + # Verify that worker count can't be set to 0 + change_number_of_io_workers($node, 0, $prev_worker_count, 1); + + # Verify that worker count can't be set to 33 (above the max) + change_number_of_io_workers($node, 33, $prev_worker_count, 1); + + # Try changing IO workers to a random value and verify that the worker + # count ends up as expected. Always test the min/max of workers. + # + # Valid range for io_workers is [1, 32]. 8 tests in total seems + # reasonable. + my @io_workers_range = shuffle(1 ... 32); + foreach my $worker_count (1, 32, @io_workers_range[ 0 .. 5 ]) + { + $prev_worker_count = + change_number_of_io_workers($node, $worker_count, + $prev_worker_count, 0); + } +} + +sub change_number_of_io_workers +{ + my $node = shift; + my $worker_count = shift; + my $prev_worker_count = shift; + my $expect_failure = shift; + my ($result, $stdout, $stderr); + + ($result, $stdout, $stderr) = + $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count"); + $node->safe_psql('postgres', 'SELECT pg_reload_conf()'); + + if ($expect_failure) + { + ok( $stderr =~ + /$worker_count is outside the valid range for parameter "io_workers"/, + "updating number of io_workers to $worker_count failed, as expected" + ); + + return $prev_worker_count; + } + else + { + is( $node->safe_psql('postgres', 'SHOW io_workers'), + $worker_count, + "updating number of io_workers from $prev_worker_count to $worker_count" + ); + + check_io_worker_count($node, $worker_count); + terminate_io_worker($node, $worker_count); + check_io_worker_count($node, $worker_count); + + return $worker_count; + } +} + +sub terminate_io_worker +{ + my $node = shift; + my $worker_count = shift; + my ($pid, $ret); + + # Select a random io worker + $pid = $node->safe_psql( + 'postgres', + qq(SELECT pid FROM pg_stat_activity WHERE + backend_type = 'io worker' ORDER BY RANDOM() LIMIT 1)); + + # terminate IO worker with SIGINT + is(PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'INT', $pid), + 0, "random io worker process signalled with INT"); + + # Check that worker exits + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE pid = $pid), '0'), + "random io worker process exited after signal"); +} + +sub check_io_worker_count +{ + my $node = shift; + my $worker_count =
shift; + + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'io worker'), + $worker_count), + "io worker count is $worker_count"); +} diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql new file mode 100644 index 000000000000..e2a812351662 --- /dev/null +++ b/src/test/modules/test_aio/test_aio--1.0.sql @@ -0,0 +1,129 @@ +/* src/test/modules/test_aio/test_aio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_aio" to load this file. \quit + + +CREATE FUNCTION errno_from_string(sym text) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION grow_rel(rel regclass, nblocks int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION modify_rel_block(rel regclass, blockno int, + zero bool DEFAULT false, + corrupt_header bool DEFAULT false, + corrupt_checksum bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION read_rel_block_ll( + rel regclass, + blockno int, + nblocks int DEFAULT 1, + wait_complete bool DEFAULT true, + batchmode_enter bool DEFAULT false, + smgrreleaseall bool DEFAULT false, + batchmode_exit bool DEFAULT false, + zero_on_error bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_create_toy(rel regclass, blockno int4) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_start_io(buffer int, for_input bool, nowait bool) +RETURNS pg_catalog.bool STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed bool, io_error bool, release_aio bool) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Handle related functions + */ +CREATE FUNCTION handle_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +/* + * Batchmode related functions + */ +CREATE FUNCTION batch_start() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION batch_end() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION bb_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Injection point related functions + */ +CREATE FUNCTION inj_io_short_read_attach(result int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION 
inj_io_short_read_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_attach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c new file mode 100644 index 000000000000..2a7f4378ef37 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.c @@ -0,0 +1,859 @@ +/*------------------------------------------------------------------------- + * + * test_aio.c + * Helpers to write tests for AIO + * + * This module provides interface functions for C functionality to SQL, to + * make it possible to test AIO related behavior in a targeted way from SQL. + * It'd not generally be safe to export these functions to SQL, but for a test + * that's fine. + * + * Copyright (c) 2020-2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_aio/test_aio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "fmgr.h" +#include "storage/aio.h" +#include "storage/aio_internal.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/rel.h" + + +PG_MODULE_MAGIC; + + +typedef struct InjIoErrorState +{ + bool enabled_short_read; + bool enabled_reopen; + + bool short_read_result_set; + int short_read_result; +} InjIoErrorState; + +static InjIoErrorState * inj_io_error_state; + +/* Shared memory init callbacks */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + + +static PgAioHandle *last_handle; +static PgAioBounceBuffer *last_bb; + + + +static void +test_aio_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(sizeof(InjIoErrorState)); +} + +static void +test_aio_shmem_startup(void) +{ + bool found; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + inj_io_error_state = ShmemInitStruct("injection_points", + sizeof(InjIoErrorState), + &found); + + if (!found) + { + /* First time through, initialize */ + inj_io_error_state->enabled_short_read = false; + inj_io_error_state->enabled_reopen = false; + +#ifdef USE_INJECTION_POINTS + InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED", + "test_aio", + "inj_io_short_read", + NULL, + 0); + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + + InjectionPointAttach("AIO_WORKER_AFTER_REOPEN", + "test_aio", + "inj_io_reopen", + NULL, + 0); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + +#endif + } + else + { + /* + * Pre-load the injection points now, so we can call them in a + * critical section. 
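+		 * InjectionPointLoad() resolves the callback and caches it in backend-local memory, so that later invocations do not need to allocate, which would be unsafe inside a critical section.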
+ */ +#ifdef USE_INJECTION_POINTS + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + elog(LOG, "injection point loaded"); +#endif + } + + LWLockRelease(AddinShmemInitLock); +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = test_aio_shmem_request; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = test_aio_shmem_startup; +} + + +PG_FUNCTION_INFO_V1(errno_from_string); +Datum +errno_from_string(PG_FUNCTION_ARGS) +{ + const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (strcmp(sym, "EIO") == 0) + PG_RETURN_INT32(EIO); + else if (strcmp(sym, "EAGAIN") == 0) + PG_RETURN_INT32(EAGAIN); + else if (strcmp(sym, "EINTR") == 0) + PG_RETURN_INT32(EINTR); + else if (strcmp(sym, "ENOSPC") == 0) + PG_RETURN_INT32(ENOSPC); + else if (strcmp(sym, "EROFS") == 0) + PG_RETURN_INT32(EROFS); + + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("%s is not a supported errno value", sym)); + PG_RETURN_INT32(0); +} + +PG_FUNCTION_INFO_V1(grow_rel); +Datum +grow_rel(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint32 nblocks = PG_GETARG_UINT32(1); + Relation rel; +#define MAX_BUFFERS_TO_EXTEND_BY 64 + Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; + + rel = relation_open(relid, AccessExclusiveLock); + + while (nblocks > 0) + { + uint32 extend_by_pages; + + extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY); + + ExtendBufferedRelBy(BMR_REL(rel), + MAIN_FORKNUM, + NULL, + 0, + extend_by_pages, + victim_buffers, + &extend_by_pages); + + nblocks -= extend_by_pages; + + for (uint32 i = 0; i < extend_by_pages; i++) + { + ReleaseBuffer(victim_buffers[i]); + } + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(modify_rel_block); +Datum +modify_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + bool zero = PG_GETARG_BOOL(2); + bool corrupt_header = PG_GETARG_BOOL(3); + bool corrupt_checksum = PG_GETARG_BOOL(4); + Page page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + Relation rel; + Buffer buf; + PageHeader ph; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * copy the page to local memory, seems nicer than to directly modify in + * the buffer pool. + */ + memcpy(page, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + ReleaseBuffer(buf); + + /* + * Don't want to have a buffer in-memory that's marked valid where the + * on-disk contents are invalid. Particularly not if the in-memory buffer + * could be dirty... + * + * While we hold an AEL on the relation nobody else should be able to read + * the buffer in. + * + * NB: This is probably racy, better don't copy this to non-test code. + */ + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else + EvictUnpinnedBuffer(buf); + + /* + * Now modify the page as asked for by the caller. 
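+	 * Zeroing happens first; if the page is empty afterwards (or was to begin with) and corruption was requested, it is re-initialized so that there is a plausible header to corrupt.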
+	 */ + if (zero) + memset(page, 0, BufferGetPageSize(buf)); + + if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum)) + PageInit(page, BufferGetPageSize(buf), 0); + + ph = (PageHeader) page; + + if (corrupt_header) + ph->pd_special = BLCKSZ + 1; + + if (corrupt_checksum) + { + bool successfully_corrupted = false; + + /* + * Any single modification of the checksum could just end up being + * valid again. To be sure the page really fails verification, retry + * until the stored checksum differs from the computed one. + */ + for (int i = 0; i < 100; i++) + { + uint16 verify_checksum; + uint16 old_checksum; + + old_checksum = ph->pd_checksum; + ph->pd_checksum = old_checksum + 1; + + elog(LOG, "corrupting checksum of blk %u from %u to %u", + blkno, old_checksum, ph->pd_checksum); + + verify_checksum = pg_checksum_page(page, blkno); + if (verify_checksum != ph->pd_checksum) + { + successfully_corrupted = true; + break; + } + } + + if (!successfully_corrupted) + elog(ERROR, "could not corrupt checksum, what's going on?"); + } + else + { + PageSetChecksumInplace(page, blkno); + } + + smgrwrite(RelationGetSmgr(rel), + MAIN_FORKNUM, blkno, page, true); + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +/* + * Ensures a buffer for rel & blkno is in the buffer pool, without actually + * caring about the buffer contents. Used to set up test scenarios. + */ +static Buffer +create_toy_buffer(Relation rel, BlockNumber blkno) +{ + Buffer buf; + BufferDesc *buf_hdr; + uint32 buf_state; + bool was_pinned = false; + + /* place buffer in shared buffers without erroring out */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (RelationUsesLocalBuffers(rel)) + { + buf_hdr = GetLocalBufferDescriptor(-buf - 1); + buf_state = pg_atomic_read_u32(&buf_hdr->state); + } + else + { + buf_hdr = GetBufferDescriptor(buf - 1); + buf_state = LockBufHdr(buf_hdr); + } + + /* + * We should be the only backend accessing this buffer. This is just a + * small bit of belt-and-suspenders defense; none of this code should ever + * run in a cluster with real data. + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) > 1) + was_pinned = true; + else + buf_state &= ~(BM_VALID | BM_DIRTY); + + if (RelationUsesLocalBuffers(rel)) + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + else + UnlockBufHdr(buf_hdr, buf_state); + + if (was_pinned) + elog(ERROR, "toy buffer %d was already pinned", + buf); + + return buf; +} + +/* + * A "low level" read. This does similar things to what + * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and + * less sanity). + */ +PG_FUNCTION_INFO_V1(read_rel_block_ll); +Datum +read_rel_block_ll(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + int nblocks = PG_GETARG_INT32(2); + bool wait_complete = PG_GETARG_BOOL(3); + bool batchmode_enter = PG_GETARG_BOOL(4); + bool call_smgrreleaseall = PG_GETARG_BOOL(5); + bool batchmode_exit = PG_GETARG_BOOL(6); + bool zero_on_error = PG_GETARG_BOOL(7); + Relation rel; + Buffer bufs[PG_IOV_MAX]; + BufferDesc *buf_hdrs[PG_IOV_MAX]; + Page pages[PG_IOV_MAX]; + uint8 srb_flags = 0; + PgAioReturn ior; + PgAioHandle *ioh; + PgAioWaitRef iow; + SMgrRelation smgr; + + if (nblocks <= 0 || nblocks > PG_IOV_MAX) + elog(ERROR, "nblocks is out of range"); + + rel = relation_open(relid, AccessExclusiveLock); + + for (int i = 0; i < nblocks; i++) + { + bufs[i] = create_toy_buffer(rel, blkno + i); + pages[i] = BufferGetBlock(bufs[i]); + buf_hdrs[i] = BufferIsLocal(bufs[i]) ?
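+			/* local Buffer ids are negative, shared ones positive; both map onto 0-based descriptor arrays */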
+ GetLocalBufferDescriptor(-bufs[i] - 1) : + GetBufferDescriptor(bufs[i] - 1); + } + + smgr = RelationGetSmgr(rel); + + pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid); + + ioh = pgaio_io_acquire(CurrentResourceOwner, &ior); + pgaio_io_get_wref(ioh, &iow); + + if (RelationUsesLocalBuffers(rel)) + { + for (int i = 0; i < nblocks; i++) + StartLocalBufferIO(buf_hdrs[i], true, false); + pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL); + } + else + { + for (int i = 0; i < nblocks; i++) + StartBufferIO(buf_hdrs[i], true, false); + } + + pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks); + + if (zero_on_error | zero_damaged_pages) + srb_flags |= READ_BUFFERS_ZERO_ON_ERROR; + if (ignore_checksum_failure) + srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES; + + pgaio_io_register_callbacks(ioh, + RelationUsesLocalBuffers(rel) ? + PGAIO_HCB_LOCAL_BUFFER_READV : + PGAIO_HCB_SHARED_BUFFER_READV, + srb_flags); + + if (batchmode_enter) + pgaio_enter_batchmode(); + + smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno, + (void *) pages, nblocks); + + if (call_smgrreleaseall) + smgrreleaseall(); + + if (batchmode_exit) + pgaio_exit_batchmode(); + + for (int i = 0; i < nblocks; i++) + ReleaseBuffer(bufs[i]); + + if (wait_complete) + { + pgaio_wref_wait(&iow); + + if (ior.result.status != PGAIO_RS_OK) + pgaio_result_report(ior.result, + &ior.target_data, + ior.result.status == PGAIO_RS_ERROR ? + ERROR : WARNING); + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(invalidate_rel_block); +Datum +invalidate_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + PrefetchBufferResult pr; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + /* + * This is a gross hack, but there's no other API exposed that allows to + * get a buffer ID without actually reading the block in. + */ + pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno); + buf = pr.recent_buffer; + + if (BufferIsValid(buf)) + { + /* if the buffer contents aren't valid, this'll return false */ + if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf)) + { + BufferDesc *buf_hdr = BufferIsLocal(buf) ? 
+ GetLocalBufferDescriptor(-buf - 1) + : GetBufferDescriptor(buf - 1); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY) + { + if (BufferIsLocal(buf)) + FlushLocalBuffer(buf_hdr, NULL); + else + FlushOneBuffer(buf); + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else if (!EvictUnpinnedBuffer(buf)) + elog(ERROR, "couldn't evict"); + } + } + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(buffer_create_toy); +Datum +buffer_create_toy(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = create_toy_buffer(rel, blkno); + ReleaseBuffer(buf); + + relation_close(rel, NoLock); + + PG_RETURN_INT32(buf); +} + +PG_FUNCTION_INFO_V1(buffer_call_start_io); +Datum +buffer_call_start_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool nowait = PG_GETARG_BOOL(2); + bool can_start; + + if (BufferIsLocal(buf)) + can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + for_input, nowait); + else + can_start = StartBufferIO(GetBufferDescriptor(buf - 1), + for_input, nowait); + + /* + * For tests we don't want the resowner release preventing us from + * orchestrating odd scenarios. + */ + if (can_start && !BufferIsLocal(buf)) + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + buf); + + ereport(LOG, + errmsg("buffer %d after StartBufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_BOOL(can_start); +} + +PG_FUNCTION_INFO_V1(buffer_call_terminate_io); +Datum +buffer_call_terminate_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool succeed = PG_GETARG_BOOL(2); + bool io_error = PG_GETARG_BOOL(3); + bool release_aio = PG_GETARG_BOOL(4); + bool clear_dirty = false; + uint32 set_flag_bits = 0; + + if (io_error) + set_flag_bits |= BM_IO_ERROR; + + if (for_input) + { + clear_dirty = false; + + if (succeed) + set_flag_bits |= BM_VALID; + } + else + { + if (succeed) + clear_dirty = true; + } + + ereport(LOG, + errmsg("buffer %d before Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + if (BufferIsLocal(buf)) + TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + clear_dirty, set_flag_bits, release_aio); + else + TerminateBufferIO(GetBufferDescriptor(buf - 1), + clear_dirty, set_flag_bits, false, release_aio); + + ereport(LOG, + errmsg("buffer %d after Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get); +Datum +handle_get(PG_FUNCTION_ARGS) +{ + last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_release_last); +Datum +handle_release_last(PG_FUNCTION_ARGS) +{ + if (!last_handle) + elog(ERROR, "no handle"); + + pgaio_io_release(last_handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_and_error); +Datum +handle_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_io_acquire(CurrentResourceOwner, NULL); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_twice); +Datum +handle_get_twice(PG_FUNCTION_ARGS) +{ + 
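+	/* Acquiring a second handle while the first is still handed out is expected to be rejected by the AIO subsystem as an API violation. */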
pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_release); +Datum +handle_get_release(PG_FUNCTION_ARGS) +{ + PgAioHandle *handle; + + handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_release(handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_start); +Datum +batch_start(PG_FUNCTION_ARGS) +{ + pgaio_enter_batchmode(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_end); +Datum +batch_end(PG_FUNCTION_ARGS) +{ + pgaio_exit_batchmode(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get); +Datum +bb_get(PG_FUNCTION_ARGS) +{ + last_bb = pgaio_bounce_buffer_get(); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_release_last); +Datum +bb_release_last(PG_FUNCTION_ARGS) +{ + if (!last_bb) + elog(ERROR, "no bb"); + + pgaio_bounce_buffer_release(last_bb); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get_and_error); +Datum +bb_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_bounce_buffer_get(); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get_twice); +Datum +bb_get_twice(PG_FUNCTION_ARGS) +{ + pgaio_bounce_buffer_get(); + pgaio_bounce_buffer_get(); + + PG_RETURN_VOID(); +} + + +PG_FUNCTION_INFO_V1(bb_get_release); +Datum +bb_get_release(PG_FUNCTION_ARGS) +{ + PgAioBounceBuffer *bb; + + bb = pgaio_bounce_buffer_get(); + pgaio_bounce_buffer_release(bb); + + PG_RETURN_VOID(); +} + +#ifdef USE_INJECTION_POINTS +extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data); +extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data); + +void +inj_io_short_read(const char *name, const void *private_data) +{ + PgAioHandle *ioh; + + ereport(LOG, + errmsg("short read injection point called, is enabled: %d", + inj_io_error_state->enabled_short_read), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_short_read) + { + ioh = pgaio_inj_io_get(); + + /* + * Only shorten reads whose result is at least as long as the target + * size, otherwise we would turn the short read into an over-read. + */ + if (inj_io_error_state->short_read_result_set + && ioh->op == PGAIO_OP_READV + && inj_io_error_state->short_read_result <= ioh->result) + { + struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off]; + int32 old_result = ioh->result; + int32 new_result = inj_io_error_state->short_read_result; + int32 processed = 0; + + ereport(LOG, + errmsg("short read injection point, changing result from %d to %d", + old_result, new_result), + errhidestmt(true), errhidecontext(true)); + + /* + * The underlying IO actually completed OK, and thus the "invalid" + * portion of the IOV actually contains valid data. That can hide + * a lot of problems, e.g. if we were to wrongly mark as valid a + * buffer that, according to the shortened read, was never read, + * its contents would look valid and we might miss a bug. + * + * To avoid that, iterate through the IOV and zero out the + * "failed" portion of the IO.
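+			 * Note that only the tail beyond the shortened result is zeroed; iovec entries lying entirely within the new result are left untouched.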
+ */ + for (int i = 0; i < ioh->op_data.read.iov_length; i++) + { + if (processed + iov[i].iov_len <= new_result) + processed += iov[i].iov_len; + else if (processed <= new_result) + { + uint32 ok_part = new_result - processed; + + memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part); + processed += iov[i].iov_len; + } + else + { + memset((char *) iov[i].iov_base, 0, iov[i].iov_len); + } + } + + ioh->result = new_result; + } + } +} + +void +inj_io_reopen(const char *name, const void *private_data) +{ + ereport(LOG, + errmsg("reopen injection point called, is enabled: %d", + inj_io_error_state->enabled_reopen), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_reopen) + elog(ERROR, "injection point triggering failure to reopen "); +} +#endif + +PG_FUNCTION_INFO_V1(inj_io_short_read_attach); +Datum +inj_io_short_read_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = true; + inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0); + if (inj_io_error_state->short_read_result_set) + inj_io_error_state->short_read_result = PG_GETARG_INT32(0); +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_short_read_detach); +Datum +inj_io_short_read_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_attach); +Datum +inj_io_reopen_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = true; +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_detach); +Datum +inj_io_reopen_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_aio/test_aio.control b/src/test/modules/test_aio/test_aio.control new file mode 100644 index 000000000000..cd91c3ed16b9 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.control @@ -0,0 +1,3 @@ +comment = 'Test code for AIO' +default_version = '1.0' +module_pathname = '$libdir/test_aio' diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 954f549555e2..5588d83e1bfb 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3132,6 +3132,12 @@ DROP USER regress_locktable_user; -- switch to superuser \c - CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- @@ -3145,6 +3151,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT (1 row) GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes has_table_privilege --------------------- @@ -3159,6 +3171,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT -- run query to ensure 
that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; + ok +---- + t +(1 row) + SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; ok ---- diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 474789691357..d9533deb04e8 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1286,6 +1286,22 @@ drop table cchild; SELECT viewname, definition FROM pg_views WHERE schemaname = 'pg_catalog' ORDER BY viewname; +pg_aios| SELECT pid, + io_id, + io_generation, + state, + operation, + off, + length, + target, + handle_data_len, + raw_result, + result, + target_desc, + f_sync, + f_localmem, + f_buffered + FROM pg_get_aios() pg_get_aios(pid, io_id, io_generation, state, operation, off, length, target, handle_data_len, raw_result, result, target_desc, f_sync, f_localmem, f_buffered); pg_available_extension_versions| SELECT e.name, e.version, (x.extname IS NOT NULL) AS installed, diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index b81694c24f28..286b1d037569 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1919,16 +1919,19 @@ DROP USER regress_locktable_user; CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations; RESET ROLE; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b66cecd87991..e89c501fa8af 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -349,6 +349,7 @@ BufferManagerRelation BufferStrategyControl BufferTag BufferUsage +BuffersToWrite BuildAccumulator BuiltinScript BulkInsertState @@ -1196,6 +1197,7 @@ IOContext IOFuncSelector IOObject IOOp +IOQueue IO_STATUS_BLOCK IPCompareMethod ITEM @@ -2137,6 +2139,7 @@ PermutationStep PermutationStepBlocker PermutationStepBlockerType PgAioBackend +PgAioBounceBuffer PgAioCtl PgAioHandle PgAioHandleCallbackID @@ -3021,6 +3024,7 @@ TocEntry TokenAuxData TokenizedAuthLine TrackItem +TrackedIO TransApplyAction TransInvalidationInfo TransState
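A minimal usage sketch of the SQL interface added above, assuming a scratch cluster with the test_aio module available; the table name and parameter choices here are illustrative only:

CREATE EXTENSION test_aio;
CREATE TABLE aio_demo (i int);
SELECT grow_rel('aio_demo'::regclass, 4);           -- ensure blocks 0..3 exist
SELECT modify_rel_block('aio_demo'::regclass, 1,
                        corrupt_checksum => true);  -- corrupt block 1 on disk
SELECT read_rel_block_ll('aio_demo'::regclass, 1,
                         zero_on_error => true);    -- low-level AIO read of that block
SELECT pid, state, operation, result
FROM pg_aios;                                       -- usually empty once the read has completed

The injection-point based functions additionally require the module in shared_preload_libraries and a build with injection points enabled, which the TAP tests above check via the enable_injection_points environment variable.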