Re-add GUC track_wal_io_timing
authorMichael Paquier <[email protected]>
Wed, 26 Feb 2025 00:49:59 +0000 (09:49 +0900)
committerMichael Paquier <[email protected]>
Wed, 26 Feb 2025 00:49:59 +0000 (09:49 +0900)
This commit is a rework of 2421e9a51d20, about which Andres Freund has
raised some concerns as it is valuable to have both track_io_timing and
track_wal_io_timing in some cases, as the WAL write and fsync paths can
be a major bottleneck for some workloads.  Hence, it can be relevant to
not calculate the WAL timings in environments where pg_test_timing
performs poorly while capturing some IO data under track_io_timing for
the non-WAL IO paths.  The opposite can be also true: it should be
possible to disable the non-WAL timings and enable the WAL timings (the
previous GUC setups allowed this possibility).

track_wal_io_timing is added back in this commit, controlling if WAL
timings should be calculated in pg_stat_io for the read, fsync and write
paths, as done previously with pg_stat_wal.  pg_stat_wal previously
tracked only the sync and write parts (now removed), read stats is new
data tracked in pg_stat_io, all three are aggregated if
track_wal_io_timing is enabled.  The read part matters during recovery
or if a XLogReader is used.

Extra note: more control over if the types of timings calculated in
pg_stat_io could be done with a GUC that lists pairs of (IOObject,IOOp).

Reported-by: Andres Freund <[email protected]>
Author: Bertrand Drouvot <[email protected]>
Co-authored-by: Michael Paquier <[email protected]>
Discussion: https://postgr.es/m/3opf2wh2oljco6ldyqf7ukabw3jijnnhno6fjb4mlu6civ5h24@fcwmhsgmlmzu

15 files changed:
doc/src/sgml/config.sgml
doc/src/sgml/monitoring.sgml
doc/src/sgml/wal.sgml
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogreader.c
src/backend/access/transam/xlogrecovery.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/smgr/md.c
src/backend/utils/activity/pgstat_backend.c
src/backend/utils/activity/pgstat_io.c
src/backend/utils/misc/guc_tables.c
src/backend/utils/misc/postgresql.conf.sample
src/include/access/xlog.h
src/include/pgstat.h

index a835457610854ca8c592df7273d02bbee53e6254..e55700f35b8973df6687ede9a19a176d33d1f557 100644 (file)
@@ -8378,9 +8378,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
         displayed in <link linkend="monitoring-pg-stat-database-view">
         <structname>pg_stat_database</structname></link>,
         <link linkend="monitoring-pg-stat-io-view">
-        <structname>pg_stat_io</structname></link>, in the output of the
+        <structname>pg_stat_io</structname></link> (if <varname>object</varname>
+        is not <literal>wal</literal>), in the output of the
         <link linkend="pg-stat-get-backend-io">
-        <function>pg_stat_get_backend_io()</function></link> function, in the
+        <function>pg_stat_get_backend_io()</function></link> function (if
+        <varname>object</varname> is not <literal>wal</literal>), in the
         output of <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal>
         option is used, in the output of <xref linkend="sql-vacuum"/> when
         the <literal>VERBOSE</literal> option is used, by autovacuum
@@ -8393,6 +8395,32 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-track-wal-io-timing" xreflabel="track_wal_io_timing">
+      <term><varname>track_wal_io_timing</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>track_wal_io_timing</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables timing of WAL I/O calls. This parameter is off by default,
+        as it will repeatedly query the operating system for the current time,
+        which may cause significant overhead on some platforms.
+        You can use the <application>pg_test_timing</application> tool to
+        measure the overhead of timing on your system.
+        I/O timing information is displayed in
+        <link linkend="monitoring-pg-stat-io-view">
+        <structname>pg_stat_io</structname></link> for the
+        <varname>object</varname> <literal>wal</literal> and in the output of
+        the <link linkend="pg-stat-get-backend-io">
+        <function>pg_stat_get_backend_io()</function></link> function for the
+        <varname>object</varname> <literal>wal</literal>.
+        Only superusers and users with the appropriate <literal>SET</literal>
+        privilege can change this setting.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-track-functions" xreflabel="track_functions">
       <term><varname>track_functions</varname> (<type>enum</type>)
       <indexterm>
index 3dfd059b7ee8bf75188ea2fe2c8b4e4dee1e4cc6..9178f1d34efdccaa91c28e26825af2a1f9edb12c 100644 (file)
@@ -183,6 +183,11 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
    of block read, write, extend, and fsync times.
   </para>
 
+  <para>
+   The parameter <xref linkend="guc-track-wal-io-timing"/> enables monitoring
+   of WAL read, write and fsync times.
+  </para>
+
   <para>
    Normally these parameters are set in <filename>postgresql.conf</filename> so
    that they apply to all server processes, but it is possible to turn
@@ -2723,7 +2728,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
        </para>
        <para>
         Time spent in read operations in milliseconds (if
-        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+        <xref linkend="guc-track-io-timing"/> is enabled and
+        <varname>object</varname> is not <literal>wal</literal>,
+        or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+        and <varname>object</varname> is <literal>wal</literal>,
+        otherwise zero)
        </para>
       </entry>
      </row>
@@ -2757,7 +2766,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
        </para>
        <para>
         Time spent in write operations in milliseconds (if
-        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+        <xref linkend="guc-track-io-timing"/> is enabled and
+        <varname>object</varname> is not <literal>wal</literal>,
+        or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+        and <varname>object</varname> is <literal>wal</literal>,
+        otherwise zero)
        </para>
       </entry>
      </row>
@@ -2816,8 +2829,12 @@ description | Waiting for a newly initialized WAL file to reach durable storage
         <structfield>extend_time</structfield> <type>double precision</type>
        </para>
        <para>
-        Time spent in extend operations in milliseconds (if
-        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+        Time spent in extend operations in milliseconds. (if
+        <xref linkend="guc-track-io-timing"/> is enabled and
+        <varname>object</varname> is not <literal>wal</literal>,
+        or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+        and <varname>object</varname> is <literal>wal</literal>,
+        otherwise zero)
        </para>
       </entry>
      </row>
@@ -2887,7 +2904,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
        </para>
        <para>
         Time spent in fsync operations in milliseconds (if
-        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+        <xref linkend="guc-track-io-timing"/> is enabled and
+        <varname>object</varname> is not <literal>wal</literal>,
+        or if <xref linkend="guc-track-wal-io-timing"/> is enabled
+        and <varname>object</varname> is <literal>wal</literal>,
+        otherwise zero)
        </para>
       </entry>
      </row>
index 4fc09761115e9234df5c9fd741a07d8c8dcc9143..f3b86b26be90571f96466dd61debeec26f55fa31 100644 (file)
   <para>
    There are two internal functions to write WAL data to disk:
    <function>XLogWrite</function> and <function>issue_xlog_fsync</function>.
-   When <xref linkend="guc-track-io-timing"/> is enabled, the total
+   When <xref linkend="guc-track-wal-io-timing"/> is enabled, the total
    amounts of time <function>XLogWrite</function> writes and
    <function>issue_xlog_fsync</function> syncs WAL data to disk are counted as
    <varname>write_time</varname> and <varname>fsync_time</varname> in
    <literal>fsync</literal>, or <literal>fsync_writethrough</literal>,
    the write operation moves WAL buffers to kernel cache and
    <function>issue_xlog_fsync</function> syncs them to disk. Regardless
-   of the setting of <varname>track_io_timing</varname>, the number
+   of the setting of <varname>track_wal_io_timing</varname>, the number
    of times <function>XLogWrite</function> writes and
    <function>issue_xlog_fsync</function> syncs WAL data to disk are also
    counted as <varname>writes</varname> and <varname>fsyncs</varname>
index 919314f82587ec2f39e88395978485fefc7085af..799fc739e18cd8a7d122dd54adbe21968933e565 100644 (file)
@@ -134,6 +134,7 @@ int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 int            wal_retrieve_retry_interval = 5000;
 int            max_slot_wal_keep_size_mb = -1;
 int            wal_decode_buffer_size = 512 * 1024;
+bool       track_wal_io_timing = false;
 
 #ifdef WAL_DEBUG
 bool       XLOG_DEBUG = false;
@@ -2437,7 +2438,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
                /*
                 * Measure I/O timing to write WAL data, for pg_stat_io.
                 */
-               start = pgstat_prepare_io_time();
+               start = pgstat_prepare_io_time(track_wal_io_timing);
 
                pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
                written = pg_pwrite(openLogFile, from, nleft, startoffset);
@@ -3248,7 +3249,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
                 errmsg("could not create file \"%s\": %m", tmppath)));
 
    /* Measure I/O timing when initializing segment */
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_wal_io_timing);
 
    pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
    save_errno = 0;
@@ -3310,7 +3311,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
    }
 
    /* Measure I/O timing when flushing segment */
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_wal_io_timing);
 
    pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
    if (pg_fsync(fd) != 0)
@@ -8744,7 +8745,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
    /*
     * Measure I/O timing to sync the WAL file for pg_stat_io.
     */
-   start = pgstat_prepare_io_time();
+   start = pgstat_prepare_io_time(track_wal_io_timing);
 
    pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
    switch (wal_sync_method)
index 995e4f7c26fd5a3536cdd6ad9ec383c567e36769..5483d4f0dd2b64d40a455fa85dbfd7ff3e41aecc 100644 (file)
@@ -1557,7 +1557,7 @@ WALRead(XLogReaderState *state,
 
 #ifndef FRONTEND
        /* Measure I/O timing when reading segment */
-       io_start = pgstat_prepare_io_time();
+       io_start = pgstat_prepare_io_time(track_wal_io_timing);
 
        pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
 #endif
index 915cb330295cc1baf55f192f346b2c68969a184f..c5475aaa97a15f22f3e0a2f2e45da42092f36868 100644 (file)
@@ -3401,7 +3401,7 @@ retry:
    readOff = targetPageOff;
 
    /* Measure I/O timing when reading segment */
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_wal_io_timing);
 
    pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
    r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
index 40e9efec3128ae0bfe5f2f7f282d1e310dfba673..7915ed624c1214dfd7ba7eb01047a8cb76c52b3d 100644 (file)
@@ -1509,7 +1509,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
            io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
        }
 
-       io_start = pgstat_prepare_io_time();
+       io_start = pgstat_prepare_io_time(track_io_timing);
        smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
        pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
                                1, io_buffers_len * BLCKSZ);
@@ -2402,7 +2402,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
        }
    }
 
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_io_timing);
 
    /*
     * Note: if smgrzeroextend fails, we will end up with buffers that are
@@ -3857,7 +3857,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
     */
    bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_io_timing);
 
    /*
     * bufToWrite is either the shared buffer or a copy, as appropriate.
@@ -4459,7 +4459,7 @@ FlushRelationBuffers(Relation rel)
 
                PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-               io_start = pgstat_prepare_io_time();
+               io_start = pgstat_prepare_io_time(track_io_timing);
 
                smgrwrite(srel,
                          BufTagGetForkNum(&bufHdr->tag),
@@ -5905,7 +5905,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
    sort_pending_writebacks(wb_context->pending_writebacks,
                            wb_context->nr_pending);
 
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_io_timing);
 
    /*
     * Coalesce neighbouring writes, but nothing else. For that we iterate
index d2f8158d69798b16b81e994595c528016a3d3653..80b83444eb2d8e913a14f3ec29bae851a8547a24 100644 (file)
@@ -244,7 +244,7 @@ GetLocalVictimBuffer(void)
 
        PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
 
-       io_start = pgstat_prepare_io_time();
+       io_start = pgstat_prepare_io_time(track_io_timing);
 
        /* And write... */
        smgrwrite(oreln,
@@ -414,7 +414,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
        }
    }
 
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_io_timing);
 
    /* actually extend relation */
    smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
index 4994c97795c47a77ee392d959a94947407cf03e4..f3220f98dc4295db8ca4932d8c1a745b3a690253 100644 (file)
@@ -1391,7 +1391,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
        ereport(DEBUG1,
                (errmsg_internal("could not forward fsync request because request queue is full")));
 
-       io_start = pgstat_prepare_io_time();
+       io_start = pgstat_prepare_io_time(track_io_timing);
 
        if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
            ereport(data_sync_elevel(ERROR),
@@ -1790,7 +1790,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
        need_to_close = true;
    }
 
-   io_start = pgstat_prepare_io_time();
+   io_start = pgstat_prepare_io_time(track_io_timing);
 
    /* Sync the file. */
    result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
index 4a667e7019cc23638c08870f84b89a7d442c62b1..338da73a9a91bf55138101a8685e1bbbc6eec8b8 100644 (file)
@@ -24,6 +24,7 @@
 
 #include "postgres.h"
 
+#include "access/xlog.h"
 #include "storage/bufmgr.h"
 #include "utils/memutils.h"
 #include "utils/pgstat_internal.h"
@@ -43,7 +44,7 @@ void
 pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context,
                                IOOp io_op, instr_time io_time)
 {
-   Assert(track_io_timing);
+   Assert(track_io_timing || track_wal_io_timing);
 
    if (!pgstat_tracks_backend_bktype(MyBackendType))
        return;
index e7362b52a37beaf8f0a003a65023fa9f59240b45..ba11545a17f3deffcf882d0198f16eb203ca70b4 100644 (file)
@@ -83,20 +83,22 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
 }
 
 /*
- * Initialize the internal timing for an IO operation.
+ * Initialize the internal timing for an IO operation, depending on an
+ * IO timing GUC.
  */
 instr_time
-pgstat_prepare_io_time(void)
+pgstat_prepare_io_time(bool track_io_guc)
 {
    instr_time  io_start;
 
-   if (track_io_timing)
+   if (track_io_guc)
        INSTR_TIME_SET_CURRENT(io_start);
    else
    {
        /*
-        * There is no need to set io_start when an IO timing GUC is disabled,
-        * still initialize it to zero to avoid compiler warnings.
+        * There is no need to set io_start when an IO timing GUC is disabled.
+        * Initialize it to zero to avoid compiler warnings and to let
+        * pgstat_count_io_op_time() know that timings should be ignored.
         */
        INSTR_TIME_SET_ZERO(io_start);
    }
@@ -119,7 +121,7 @@ void
 pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
                        instr_time start_time, uint32 cnt, uint64 bytes)
 {
-   if (track_io_timing)
+   if (!INSTR_TIME_IS_ZERO(start_time))
    {
        instr_time  io_time;
 
index 690bf96ef030b4daba8e87b139324fdf45cfd569..ad25cbb39c555ec567f2049d03ff1a100b24c28c 100644 (file)
@@ -1501,6 +1501,15 @@ struct config_bool ConfigureNamesBool[] =
        false,
        NULL, NULL, NULL
    },
+   {
+       {"track_wal_io_timing", PGC_SUSET, STATS_CUMULATIVE,
+           gettext_noop("Collects timing statistics for WAL I/O activity."),
+           NULL
+       },
+       &track_wal_io_timing,
+       false,
+       NULL, NULL, NULL
+   },
 
    {
        {"update_process_title", PGC_SUSET, PROCESS_TITLE,
index e771d87da1fb33d9110b520d7a3cc698a50b4bfb..5362ff805195f9a059c412dbbe6027824088f446 100644 (file)
 #track_counts = on
 #track_cost_delay_timing = off
 #track_io_timing = off
+#track_wal_io_timing = off
 #track_functions = none            # none, pl, all
 #stats_fetch_consistency = cache   # cache, none, snapshot
 
index 0a50d160bba8f2c412e090ff5f22bb64e0de30b8..d313099c027f0af3e76e2a4eb97b706e80702700 100644 (file)
@@ -54,6 +54,7 @@ extern PGDLLIMPORT char *wal_consistency_checking_string;
 extern PGDLLIMPORT bool log_checkpoints;
 extern PGDLLIMPORT int CommitDelay;
 extern PGDLLIMPORT int CommitSiblings;
+extern PGDLLIMPORT bool track_wal_io_timing;
 extern PGDLLIMPORT int wal_decode_buffer_size;
 
 extern PGDLLIMPORT int CheckPointSegments;
index fc651d03cf971dd044d8d4290929567155be3461..e64c527265b9f7884782ab4d1e424f4129217467 100644 (file)
@@ -564,7 +564,7 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
                                         BackendType bktype);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context,
                               IOOp io_op, uint32 cnt, uint64 bytes);
-extern instr_time pgstat_prepare_io_time(void);
+extern instr_time pgstat_prepare_io_time(bool track_io_guc);
 extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
                                    IOOp io_op, instr_time start_time,
                                    uint32 cnt, uint64 bytes);