aio: Add pg_aios view
authorAndres Freund <[email protected]>
Tue, 1 Apr 2025 17:30:33 +0000 (13:30 -0400)
committerAndres Freund <[email protected]>
Tue, 1 Apr 2025 17:30:33 +0000 (13:30 -0400)
The new view lists all IO handles that are currently in use and is mainly
useful for PG developers, but may also be useful when tuning PG.

Bumps catversion.

Reviewed-by: Noah Misch <[email protected]>
Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt

doc/src/sgml/system-views.sgml
src/backend/catalog/system_views.sql
src/backend/storage/aio/Makefile
src/backend/storage/aio/aio_funcs.c [new file with mode: 0644]
src/backend/storage/aio/meson.build
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/test/regress/expected/privileges.out
src/test/regress/expected/rules.out
src/test/regress/sql/privileges.sql

index 3f5a306247e65c3061a4c7488c81ff311b9896d7..e9a59af8c34bb65c7b0b51b3a49d5ff565d3a096 100644 (file)
     </thead>
 
     <tbody>
+     <row>
+      <entry><link linkend="view-pg-aios"><structname>pg_aios</structname></link></entry>
+      <entry>in-use asynchronous I/O handles</entry>
+     </row>
+
      <row>
       <entry><link linkend="view-pg-available-extensions"><structname>pg_available_extensions</structname></link></entry>
       <entry>available extensions</entry>
   </table>
  </sect1>
 
+ <sect1 id="view-pg-aios">
+  <title><structname>pg_aios</structname></title>
+
+  <indexterm zone="view-pg-aios">
+   <primary>pg_aios</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_aios</structname> view lists all <xref
+   linkend="glossary-aio"/> handles that are currently in use.  An I/O handle
+   is used to reference an I/O operation that is being prepared, executed or
+   is in the process of completing.  <structname>pg_aios</structname> contains
+   one row for each I/O handle.
+  </para>
+
+  <para>
+   This view is mainly useful for developers of
+   <productname>PostgreSQL</productname>, but may also be useful when tuning
+   <productname>PostgreSQL</productname>.
+  </para>
+
+  <table>
+   <title><structname>pg_aios</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>pid</structfield> <type>int4</type>
+      </para>
+      <para>
+       Process ID of the server process that is issuing this I/O.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>io_id</structfield> <type>int4</type>
+      </para>
+      <para>
+       Identifier of the I/O handle. Handles are reused once the I/O has
+       completed (or if the handle is released before I/O is started). On reuse
+       <link linkend="view-pg-aios-io-generation">
+        <structname>pg_aios</structname>.<structfield>io_generation</structfield>
+       </link>
+       is incremented.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry" id="view-pg-aios-io-generation"><para role="column_definition">
+       <structfield>io_generation</structfield> <type>int8</type>
+      </para>
+      <para>
+       Generation of the I/O handle.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>state</structfield> <type>text</type>
+      </para>
+      <para>
+       State of the I/O handle:
+       <itemizedlist>
+        <listitem>
+         <para>
+          <literal>HANDED_OUT</literal>, referenced by code but not yet used
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>DEFINED</literal>, information necessary for execution is known
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>STAGED</literal>, ready for execution
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>SUBMITTED</literal>, submitted for execution
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>COMPLETED_IO</literal>, finished, but result has not yet been processed
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>COMPLETED_SHARED</literal>, shared completion processing completed
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>COMPLETED_LOCAL</literal>, backend local completion processing completed
+         </para>
+        </listitem>
+       </itemizedlist>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>operation</structfield> <type>text</type>
+      </para>
+      <para>
+       Operation performed using the I/O handle:
+       <itemizedlist>
+        <listitem>
+         <para>
+          <literal>invalid</literal>, not yet known
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>readv</literal>, a vectored read
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>writev</literal>, a vectored write
+         </para>
+        </listitem>
+       </itemizedlist>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>off</structfield> <type>int8</type>
+      </para>
+      <para>
+       Offset of the I/O operation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>length</structfield> <type>int8</type>
+      </para>
+      <para>
+       Length of the I/O operation.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>target</structfield> <type>text</type>
+      </para>
+      <para>
+       What kind of object the I/O is targeting:
+       <itemizedlist spacing="compact">
+        <listitem>
+         <para>
+          <literal>smgr</literal>, I/O on relations
+         </para>
+        </listitem>
+       </itemizedlist>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>handle_data_len</structfield> <type>int2</type>
+      </para>
+      <para>
+       Length of the data associated with the I/O operation. For I/O to/from
+       <xref linkend="guc-shared-buffers"/> and <xref
+       linkend="guc-temp-buffers"/>, this indicates the number of buffers the
+       I/O is operating on.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>raw_result</structfield> <type>int4</type>
+      </para>
+      <para>
+       Low-level result of the I/O operation, or NULL if the operation has not
+       yet completed.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>result</structfield> <type>text</type>
+      </para>
+      <para>
+       High-level result of the I/O operation:
+       <itemizedlist>
+        <listitem>
+         <para>
+          <literal>UNKNOWN</literal> means that the result of the
+          operation is not yet known.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>OK</literal> means the I/O completed successfully.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>PARTIAL</literal> means that the I/O completed without
+          error, but did not process all data. Commonly callers will need to
+          retry and perform the remainder of the work in a separate I/O.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>WARNING</literal> means that the I/O completed without
+          error, but that execution of the I/O triggered a warning, e.g. when
+          encountering a corrupted buffer with <xref
+          linkend="guc-zero-damaged-pages"/> enabled.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>ERROR</literal> means the I/O failed with an error.
+         </para>
+        </listitem>
+       </itemizedlist>
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>target_desc</structfield> <type>text</type>
+      </para>
+      <para>
+       Description of what the I/O operation is targeting.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>f_sync</structfield> <type>bool</type>
+      </para>
+      <para>
+       Flag indicating whether the I/O is executed synchronously.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>f_localmem</structfield> <type>bool</type>
+      </para>
+      <para>
+       Flag indicating whether the I/O references process local memory.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>f_buffered</structfield> <type>bool</type>
+      </para>
+      <para>
+       Flag indicating whether the I/O is buffered I/O.
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The <structname>pg_aios</structname> view is read-only.
+  </para>
+
+  <para>
+   By default, the <structname>pg_aios</structname> view can be read only by
+   superusers or roles with privileges of the
+   <literal>pg_read_all_stats</literal> role.
+  </para>
+ </sect1>
+
  <sect1 id="view-pg-available-extensions">
   <title><structname>pg_available_extensions</structname></title>
 
index 31d269b7ee0c42f35f28326cde9939a537f76339..64a7240aa772e0f68dada39f0d298a0467e2eb26 100644 (file)
@@ -1391,3 +1391,10 @@ CREATE VIEW pg_stat_subscription_stats AS
 
 CREATE VIEW pg_wait_events AS
     SELECT * FROM pg_get_wait_events();
+
+CREATE VIEW pg_aios AS
+    SELECT * FROM pg_get_aios();
+-- Both the view and the function it selects from are withdrawn from PUBLIC
+-- and granted to pg_read_all_stats instead.
+REVOKE ALL ON pg_aios FROM PUBLIC;
+GRANT SELECT ON pg_aios TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_aios() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_aios() TO pg_read_all_stats;
index c06c50771e023834a79300d0d65b857d9811972d..3f2469cc399450d52b7f27ad46270c7cf50a0cc5 100644 (file)
@@ -11,6 +11,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
    aio.o \
    aio_callback.o \
+   aio_funcs.o \
    aio_init.o \
    aio_io.o \
    aio_target.o \
diff --git a/src/backend/storage/aio/aio_funcs.c b/src/backend/storage/aio/aio_funcs.c
new file mode 100644 (file)
index 0000000..584e683
--- /dev/null
@@ -0,0 +1,230 @@
+/*-------------------------------------------------------------------------
+ *
+ * aio_funcs.c
+ *    AIO - SQL interface for AIO
+ *
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *    src/backend/storage/aio/aio_funcs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "nodes/execnodes.h"
+#include "port/atomics.h"
+#include "storage/aio_internal.h"
+#include "storage/lock.h"
+#include "storage/proc.h"
+#include "storage/procnumber.h"
+#include "utils/builtins.h"
+#include "utils/fmgrprotos.h"
+#include "utils/tuplestore.h"
+
+
+/*
+ * Total number of bytes described by the first cnt entries of iov.
+ */
+static size_t
+iov_byte_length(const struct iovec *iov, int cnt)
+{
+   size_t      total = 0;
+   int         idx = 0;
+
+   while (idx < cnt)
+       total += iov[idx++].iov_len;
+
+   return total;
+}
+
+/*
+ * SQL-callable set-returning function behind the pg_aios view: emits one row
+ * for every AIO handle that is not idle.
+ *
+ * Handles are inspected without taking a lock. Each handle is copied into
+ * local memory and is only displayed if its generation did not change while
+ * it was being copied; if only its state changed, the copy is retried.
+ */
+Datum
+pg_get_aios(PG_FUNCTION_ARGS)
+{
+   ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+
+   InitMaterializedSRF(fcinfo, 0);
+
+#define PG_GET_AIOS_COLS   15
+
+   for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++)
+   {
+       PgAioHandle *live_ioh = &pgaio_ctl->io_handles[i];
+       uint32      ioh_id = pgaio_io_get_id(live_ioh);
+       Datum       values[PG_GET_AIOS_COLS] = {0};
+       bool        nulls[PG_GET_AIOS_COLS] = {0};
+       ProcNumber  owner;
+       PGPROC     *owner_proc;
+       int32       owner_pid;
+       PgAioHandleState start_state;
+       uint64      start_generation;
+       PgAioHandle ioh_copy;
+       struct iovec iov_copy[PG_IOV_MAX];
+
+
+       /*
+        * There is no lock that could prevent the state of the IO to advance
+        * concurrently - and we don't want to introduce one, as that would
+        * introduce atomics into a very common path. Instead we
+        *
+        * 1) Determine the state + generation of the IO.
+        *
+        * 2) Copy the IO to local memory.
+        *
+        * 3) Check if state or generation of the IO changed. If the state
+        * changed, retry, if the generation changed don't display the IO.
+        */
+
+       /* 1) from above */
+       start_generation = live_ioh->generation;
+
+       /*
+        * Retry at this point, so we can accept changing states, but not
+        * changing generations.
+        */
+retry:
+       pg_read_barrier();
+       start_state = live_ioh->state;
+
+       if (start_state == PGAIO_HS_IDLE)
+           continue;
+
+       /* 2) from above */
+       memcpy(&ioh_copy, live_ioh, sizeof(PgAioHandle));
+
+       /*
+        * Safe to copy even if no iovec is used - we always reserve the
+        * required space.
+        */
+       memcpy(&iov_copy, &pgaio_ctl->iovecs[ioh_copy.iovec_off],
+              PG_IOV_MAX * sizeof(struct iovec));
+
+       /*
+        * Copy information about owner before 3) below, if the process exited
+        * it'd have to wait for the IO to finish first, which we would detect
+        * in 3).
+        */
+       owner = ioh_copy.owner_procno;
+       owner_proc = GetPGProcByNumber(owner);
+       owner_pid = owner_proc->pid;
+
+       /* 3) from above */
+       pg_read_barrier();
+
+       /*
+        * The IO completed and a new one was started with the same ID. Don't
+        * display it - it really started after this function was called.
+        * There would be a risk of a livelock if we just retried endlessly,
+        * if IOs complete very quickly.
+        */
+       if (live_ioh->generation != start_generation)
+           continue;
+
+       /*
+        * The IO's state changed while we were "rendering" it. Just start
+        * from scratch. There's no risk of a livelock here, as an IO has a
+        * limited set of states it can be in, and state changes go only in a
+        * single direction.
+        */
+       if (live_ioh->state != start_state)
+           goto retry;
+
+       /*
+        * Now that we have copied the IO into local memory and checked that
+        * it's still in the same state, we are not allowed to access "live"
+        * memory anymore. To make it slightly easier to catch such cases, set
+        * the "live" pointers to NULL.
+        */
+       live_ioh = NULL;
+       owner_proc = NULL;
+
+
+       /*
+        * column: owning pid. A pid of 0 is displayed as NULL rather than as
+        * a bogus process ID.
+        */
+       if (owner_pid != 0)
+           values[0] = Int32GetDatum(owner_pid);
+       else
+           nulls[0] = true;
+
+       /* column: IO's id */
+       values[1] = UInt32GetDatum(ioh_id);
+
+       /* column: IO's generation */
+       values[2] = Int64GetDatum(start_generation);
+
+       /* column: IO's state */
+       values[3] = CStringGetTextDatum(pgaio_io_get_state_name(&ioh_copy));
+
+       /*
+        * If the IO is in PGAIO_HS_HANDED_OUT state, none of the following
+        * fields are valid yet (or are in the process of being set).
+        * Therefore we don't want to display any other columns.
+        */
+       if (start_state == PGAIO_HS_HANDED_OUT)
+       {
+           memset(nulls + 4, 1, (lengthof(nulls) - 4) * sizeof(bool));
+           goto display;
+       }
+
+       /* column: IO's operation */
+       values[4] = CStringGetTextDatum(pgaio_io_get_op_name(&ioh_copy));
+
+       /* columns: details about the IO's operation (offset, length) */
+       switch (ioh_copy.op)
+       {
+           case PGAIO_OP_INVALID:
+               nulls[5] = true;
+               nulls[6] = true;
+               break;
+           case PGAIO_OP_READV:
+               values[5] = Int64GetDatum(ioh_copy.op_data.read.offset);
+               values[6] =
+                   Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.read.iov_length));
+               break;
+           case PGAIO_OP_WRITEV:
+               values[5] = Int64GetDatum(ioh_copy.op_data.write.offset);
+               values[6] =
+                   Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.write.iov_length));
+               break;
+       }
+
+       /* column: IO's target */
+       values[7] = CStringGetTextDatum(pgaio_io_get_target_name(&ioh_copy));
+
+       /* column: length of IO's data array */
+       values[8] = Int16GetDatum(ioh_copy.handle_data_len);
+
+       /* column: raw result (i.e. some form of syscall return value) */
+       if (start_state == PGAIO_HS_COMPLETED_IO
+           || start_state == PGAIO_HS_COMPLETED_SHARED
+           || start_state == PGAIO_HS_COMPLETED_LOCAL)
+           values[9] = Int32GetDatum(ioh_copy.result);
+       else
+           nulls[9] = true;
+
+       /*
+        * column: result in the higher level representation (unknown if not
+        * finished)
+        */
+       values[10] =
+           CStringGetTextDatum(pgaio_result_status_string(ioh_copy.distilled_result.status));
+
+       /* column: target description */
+       values[11] = CStringGetTextDatum(pgaio_io_get_target_description(&ioh_copy));
+
+       /* columns: one for each flag */
+       values[12] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_SYNCHRONOUS);
+       values[13] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_REFERENCES_LOCAL);
+       values[14] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_BUFFERED);
+
+display:
+       tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+   }
+
+   return (Datum) 0;
+}
index 2f0f03d807122b254a025fa5c841cce11b27ccea..da6df2d3654f97c057f4f7078f783d120fd420a6 100644 (file)
@@ -3,6 +3,7 @@
 backend_sources += files(
   'aio.c',
   'aio_callback.c',
+  'aio_funcs.c',
   'aio_init.c',
   'aio_io.c',
   'aio_target.c',
index 798a186e8939237d6f02c16f56a9c253c941ce92..8b96f3b8bf2eb31da753260139d800c67f1e2302 100644 (file)
@@ -57,6 +57,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202503262
+#define CATALOG_VERSION_NO 202504011
 
 #endif
index 8b68b16d79daee57062ac6db35c47300363ede2d..d9c41fa426b3aa58686eb87bee26b604b4c2f12d 100644 (file)
   proargtypes => 'int4',
   prosrc => 'gist_stratnum_common' },
 
+# AIO related functions
+{ oid => '9200', descr => 'information about in-progress asynchronous IOs',
+  proname => 'pg_get_aios', prorows => '100', proretset => 't',
+  provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int4,int4,int8,text,text,int8,int8,text,int2,int4,text,text,bool,bool,bool}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}',
+  prosrc => 'pg_get_aios' },
+
 ]
index 954f549555e2eeec4b4fa81f7d6fead2d8f88b0e..5588d83e1bfb5aff3b61e3881bb6df6db4c241d7 100644 (file)
@@ -3132,6 +3132,12 @@ DROP USER regress_locktable_user;
 -- switch to superuser
 \c -
 CREATE ROLE regress_readallstats;
+SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
+ has_table_privilege 
+---------------------
+ f
+(1 row)
+
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  has_table_privilege 
 ---------------------
@@ -3145,6 +3151,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
 (1 row)
 
 GRANT pg_read_all_stats TO regress_readallstats;
+SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
+ has_table_privilege 
+---------------------
+ t
+(1 row)
+
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
  has_table_privilege 
 ---------------------
@@ -3159,6 +3171,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT
 
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
+SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
+ ok 
+----
+ t
+(1 row)
+
 SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts;
  ok 
 ----
index 474789691357bf1f0df2348e9c82b462907b9c2f..d9533deb04e893c40714e14d4cd55f774b8115ff 100644 (file)
@@ -1286,6 +1286,22 @@ drop table cchild;
 SELECT viewname, definition FROM pg_views
 WHERE schemaname = 'pg_catalog'
 ORDER BY viewname;
+pg_aios| SELECT pid,
+    io_id,
+    io_generation,
+    state,
+    operation,
+    off,
+    length,
+    target,
+    handle_data_len,
+    raw_result,
+    result,
+    target_desc,
+    f_sync,
+    f_localmem,
+    f_buffered
+   FROM pg_get_aios() pg_get_aios(pid, io_id, io_generation, state, operation, off, length, target, handle_data_len, raw_result, result, target_desc, f_sync, f_localmem, f_buffered);
 pg_available_extension_versions| SELECT e.name,
     e.version,
     (x.extname IS NOT NULL) AS installed,
index b81694c24f28a3d64da470794095eebeb1ac94a5..286b1d037569250762108ea63619c5379cededc9 100644 (file)
@@ -1919,16 +1919,19 @@ DROP USER regress_locktable_user;
 
 CREATE ROLE regress_readallstats;
 
+SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
 
 GRANT pg_read_all_stats TO regress_readallstats;
 
+SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes
 
 -- run query to ensure that functions within views can be executed
 SET ROLE regress_readallstats;
+SELECT COUNT(*) >= 0 AS ok FROM pg_aios;
 SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts;
 SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations;
 RESET ROLE;