Change shutdown sequence to terminate checkpointer last
author Andres Freund <[email protected]>
Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
committer Andres Freund <[email protected]>
Sat, 25 Jan 2025 16:37:13 +0000 (11:37 -0500)
The main motivation for this change is to have a process that can serialize
stats after all other processes have terminated. Serializing stats already
happens in checkpointer, even though walsenders can be active longer.

The only reason the current shutdown sequence does not actively cause problems
is that walsender currently does not generate any stats. However, there is an
upcoming patch changing that.

Another need for this change originates in the AIO patchset, where IO
workers (which, in some edge cases, can emit stats of their own) need to run
while the shutdown checkpoint is being written.

This commit changes the shutdown sequence so checkpointer is signalled (via
SIGINT) to trigger writing the shutdown checkpoint without also causing
checkpointer to exit.  Once checkpointer has written the shutdown checkpoint,
it notifies postmaster via PMSIGNAL_XLOG_IS_SHUTDOWN and waits for the
termination signal (SIGUSR2, as before).  Checkpointer is now terminated after
all children, other than dead-end children and logger, have been terminated;
this is tracked using the new PM_WAIT_CHECKPOINTER PMState.

Reviewed-by: Heikki Linnakangas <[email protected]>
Reviewed-by: Bertrand Drouvot <[email protected]>
Reviewed-by: Nazir Bilal Yavuz <[email protected]>
Discussion: https://postgr.es/m/kgng5nrvnlv335evmsuvpnh354rw7qyazl73kdysev2cr2v5zu@m3cfzxicm5kp

src/backend/postmaster/checkpointer.c
src/backend/postmaster/postmaster.c
src/backend/utils/activity/wait_event_names.txt
src/include/storage/pmsignal.h

index dd2c8376c6efee037d5095908110e4a325b63aae..b94f9cdff21c4ff0eba12a1af10a4736eec7f29d 100644 (file)
  * fill WAL segments; the checkpointer itself doesn't watch for the
  * condition.)
  *
- * Normal termination is by SIGUSR2, which instructs the checkpointer to
- * execute a shutdown checkpoint and then exit(0).  (All backends must be
- * stopped before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT;
- * like any backend, the checkpointer will simply abort and exit on SIGQUIT.
+ * The normal termination sequence is that checkpointer is instructed to
+ * execute the shutdown checkpoint by SIGINT.  After that checkpointer waits
+ * to be terminated via SIGUSR2, which instructs the checkpointer to exit(0).
+ * All backends must be stopped before SIGINT or SIGUSR2 is issued!
+ *
+ * Emergency termination is by SIGQUIT; like any backend, the checkpointer
+ * will simply abort and exit on SIGQUIT.
  *
  * If the checkpointer exits unexpectedly, the postmaster treats that the same
  * as a backend crash: shared memory may be corrupted, so remaining backends
@@ -51,6 +54,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
+#include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/shmem.h"
@@ -141,6 +145,7 @@ double      CheckPointCompletionTarget = 0.9;
  * Private state
  */
 static bool ckpt_active = false;
+static volatile sig_atomic_t ShutdownXLOGPending = false;
 
 /* these values are valid when ckpt_active is true: */
 static pg_time_t ckpt_start_time;
@@ -159,6 +164,9 @@ static bool ImmediateCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
 
+/* Signal handlers */
+static void ReqShutdownXLOG(SIGNAL_ARGS);
+
 
 /*
  * Main entry point for checkpointer process
@@ -188,7 +196,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
     * tell us it's okay to shut down (via SIGUSR2).
     */
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
-   pqsignal(SIGINT, SIG_IGN);
+   pqsignal(SIGINT, ReqShutdownXLOG);
    pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
    /* SIGQUIT handler was already set up by InitPostmasterChild */
    pqsignal(SIGALRM, SIG_IGN);
@@ -211,8 +219,11 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
     * process during a normal shutdown, and since checkpointer is shut down
     * very late...
     *
-    * Walsenders are shut down after the checkpointer, but currently don't
-    * report stats. If that changes, we need a more complicated solution.
+    * While e.g. walsenders are active after the shutdown checkpoint has been
+    * written (and thus could produce more stats), checkpointer stays around
+    * after the shutdown checkpoint has been written. postmaster will only
+    * signal checkpointer to exit after all processes that could emit stats
+    * have been shut down.
     */
    before_shmem_exit(pgstat_before_server_shutdown, 0);
 
@@ -327,7 +338,8 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
    ProcGlobal->checkpointerProc = MyProcNumber;
 
    /*
-    * Loop forever
+    * Loop until we've been asked to write the shutdown checkpoint or
+    * terminate.
     */
    for (;;)
    {
@@ -346,7 +358,10 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
         * Process any requests or signals received recently.
         */
        AbsorbSyncRequests();
+
        HandleCheckpointerInterrupts();
+       if (ShutdownXLOGPending || ShutdownRequestPending)
+           break;
 
        /*
         * Detect a pending checkpoint request by checking whether the flags
@@ -517,8 +532,13 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 
            ckpt_active = false;
 
-           /* We may have received an interrupt during the checkpoint. */
+           /*
+            * We may have received an interrupt during the checkpoint and the
+            * latch might have been reset (e.g. in CheckpointWriteDelay).
+            */
            HandleCheckpointerInterrupts();
+           if (ShutdownXLOGPending || ShutdownRequestPending)
+               break;
        }
 
        /* Check for archive_timeout and switch xlog files if necessary. */
@@ -557,6 +577,57 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
                         cur_timeout * 1000L /* convert to ms */ ,
                         WAIT_EVENT_CHECKPOINTER_MAIN);
    }
+
+   /*
+    * From here on, elog(ERROR) should end with exit(1), not send control
+    * back to the sigsetjmp block above.
+    */
+   ExitOnAnyError = true;
+
+   if (ShutdownXLOGPending)
+   {
+       /*
+        * Close down the database.
+        *
+        * Since ShutdownXLOG() creates restartpoint or checkpoint, and
+        * updates the statistics, increment the checkpoint request and flush
+        * out pending statistic.
+        */
+       PendingCheckpointerStats.num_requested++;
+       ShutdownXLOG(0, 0);
+       pgstat_report_checkpointer();
+       pgstat_report_wal(true);
+
+       /*
+        * Tell postmaster that we're done.
+        */
+       SendPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN);
+       ShutdownXLOGPending = false;
+   }
+
+   /*
+    * Wait until we're asked to shut down. By separating the writing of the
+    * shutdown checkpoint from checkpointer exiting, checkpointer can perform
+    * some should-be-as-late-as-possible work like writing out stats.
+    */
+   for (;;)
+   {
+       /* Clear any already-pending wakeups */
+       ResetLatch(MyLatch);
+
+       HandleCheckpointerInterrupts();
+
+       if (ShutdownRequestPending)
+           break;
+
+       (void) WaitLatch(MyLatch,
+                        WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+                        0,
+                        WAIT_EVENT_CHECKPOINTER_SHUTDOWN);
+   }
+
+   /* Normal exit from the checkpointer is here */
+   proc_exit(0);               /* done */
 }
 
 /*
@@ -586,29 +657,6 @@ HandleCheckpointerInterrupts(void)
         */
        UpdateSharedMemoryConfig();
    }
-   if (ShutdownRequestPending)
-   {
-       /*
-        * From here on, elog(ERROR) should end with exit(1), not send control
-        * back to the sigsetjmp block above
-        */
-       ExitOnAnyError = true;
-
-       /*
-        * Close down the database.
-        *
-        * Since ShutdownXLOG() creates restartpoint or checkpoint, and
-        * updates the statistics, increment the checkpoint request and flush
-        * out pending statistic.
-        */
-       PendingCheckpointerStats.num_requested++;
-       ShutdownXLOG(0, 0);
-       pgstat_report_checkpointer();
-       pgstat_report_wal(true);
-
-       /* Normal exit from the checkpointer is here */
-       proc_exit(0);           /* done */
-   }
 
    /* Perform logging of memory contexts of this process */
    if (LogMemoryContextPending)
@@ -729,6 +777,7 @@ CheckpointWriteDelay(int flags, double progress)
     * in which case we just try to catch up as quickly as possible.
     */
    if (!(flags & CHECKPOINT_IMMEDIATE) &&
+       !ShutdownXLOGPending &&
        !ShutdownRequestPending &&
        !ImmediateCheckpointRequested() &&
        IsCheckpointOnSchedule(progress))
@@ -857,6 +906,20 @@ IsCheckpointOnSchedule(double progress)
 }
 
 
+/* --------------------------------
+ *     signal handler routines
+ * --------------------------------
+ */
+
+/* SIGINT: set flag to trigger writing of shutdown checkpoint */
+static void
+ReqShutdownXLOG(SIGNAL_ARGS)
+{
+   ShutdownXLOGPending = true;
+   SetLatch(MyLatch);
+}
+
+
 /* --------------------------------
  *     communication with backends
  * --------------------------------
index f410600f7a4ed385055fa5a3bd2d18e25a12d047..bb22b13adef8760f6a08b94a3bee82105b3e62f0 100644 (file)
@@ -341,6 +341,7 @@ typedef enum
                                 * ckpt */
    PM_WAIT_XLOG_ARCHIVAL,      /* waiting for archiver and walsenders to
                                 * finish */
+   PM_WAIT_CHECKPOINTER,       /* waiting for checkpointer to shut down */
    PM_WAIT_DEAD_END,           /* waiting for dead-end children to exit */
    PM_NO_CHILDREN,             /* all important children have exited */
 } PMState;
@@ -2363,35 +2364,20 @@ process_pm_child_exit(void)
        {
            ReleasePostmasterChildSlot(CheckpointerPMChild);
            CheckpointerPMChild = NULL;
-           if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_XLOG_SHUTDOWN)
+           if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_CHECKPOINTER)
            {
                /*
                 * OK, we saw normal exit of the checkpointer after it's been
-                * told to shut down.  We expect that it wrote a shutdown
-                * checkpoint.  (If for some reason it didn't, recovery will
-                * occur on next postmaster start.)
+                * told to shut down.  We know checkpointer wrote a shutdown
+                * checkpoint, otherwise we'd still be in
+                * PM_WAIT_XLOG_SHUTDOWN state.
                 *
-                * At this point we should have no normal backend children
-                * left (else we'd not be in PM_WAIT_XLOG_SHUTDOWN state) but
-                * we might have dead-end children to wait for.
-                *
-                * If we have an archiver subprocess, tell it to do a last
-                * archive cycle and quit. Likewise, if we have walsender
-                * processes, tell them to send any remaining WAL and quit.
+                * At this point only dead-end children and logger should be
+                * left.
                 */
-               Assert(Shutdown > NoShutdown);
-
-               /* Waken archiver for the last time */
-               if (PgArchPMChild != NULL)
-                   signal_child(PgArchPMChild, SIGUSR2);
-
-               /*
-                * Waken walsenders for the last time. No regular backends
-                * should be around anymore.
-                */
-               SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
-
-               UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+               UpdatePMState(PM_WAIT_DEAD_END);
+               ConfigurePostmasterWaitSet(false);
+               SignalChildren(SIGTERM, btmask_all_except(B_LOGGER));
            }
            else
            {
@@ -2737,6 +2723,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
 
        case PM_WAIT_XLOG_SHUTDOWN:
        case PM_WAIT_XLOG_ARCHIVAL:
+       case PM_WAIT_CHECKPOINTER:
 
            /*
             * NB: Similar code exists in PostmasterStateMachine()'s handling
@@ -3012,10 +2999,10 @@ PostmasterStateMachine(void)
                /* Start the checkpointer if not running */
                if (CheckpointerPMChild == NULL)
                    CheckpointerPMChild = StartChildProcess(B_CHECKPOINTER);
-               /* And tell it to shut down */
+               /* And tell it to write the shutdown checkpoint */
                if (CheckpointerPMChild != NULL)
                {
-                   signal_child(CheckpointerPMChild, SIGUSR2);
+                   signal_child(CheckpointerPMChild, SIGINT);
                    UpdatePMState(PM_WAIT_XLOG_SHUTDOWN);
                }
                else
@@ -3043,22 +3030,40 @@ PostmasterStateMachine(void)
        }
    }
 
+   /*
+    * The state transition from PM_WAIT_XLOG_SHUTDOWN to
+    * PM_WAIT_XLOG_ARCHIVAL is in process_pm_pmsignal(), in response to
+    * PMSIGNAL_XLOG_IS_SHUTDOWN.
+    */
+
    if (pmState == PM_WAIT_XLOG_ARCHIVAL)
    {
        /*
-        * PM_WAIT_XLOG_ARCHIVAL state ends when there's no other children
-        * than dead-end children left. There shouldn't be any regular
-        * backends left by now anyway; what we're really waiting for is
-        * walsenders and archiver.
+        * PM_WAIT_XLOG_ARCHIVAL state ends when there are no children other
+        * than checkpointer, dead-end children and logger left. There
+        * shouldn't be any regular backends left by now anyway; what we're
+        * really waiting for is for walsenders and archiver to exit.
         */
-       if (CountChildren(btmask_all_except(B_LOGGER, B_DEAD_END_BACKEND)) == 0)
+       if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_LOGGER, B_DEAD_END_BACKEND)) == 0)
        {
-           UpdatePMState(PM_WAIT_DEAD_END);
-           ConfigurePostmasterWaitSet(false);
-           SignalChildren(SIGTERM, btmask_all_except(B_LOGGER));
+           UpdatePMState(PM_WAIT_CHECKPOINTER);
+
+           /*
+            * Now that the processes mentioned above are gone, tell
+            * checkpointer to shut down too. That allows checkpointer to
+            * perform some last bits of cleanup without other processes
+            * interfering.
+            */
+           if (CheckpointerPMChild != NULL)
+               signal_child(CheckpointerPMChild, SIGUSR2);
        }
    }
 
+   /*
+    * The state transition from PM_WAIT_CHECKPOINTER to PM_WAIT_DEAD_END is
+    * in process_pm_child_exit().
+    */
+
    if (pmState == PM_WAIT_DEAD_END)
    {
        /*
@@ -3195,6 +3200,7 @@ pmstate_name(PMState state)
            PM_TOSTR_CASE(PM_WAIT_XLOG_SHUTDOWN);
            PM_TOSTR_CASE(PM_WAIT_XLOG_ARCHIVAL);
            PM_TOSTR_CASE(PM_WAIT_DEAD_END);
+           PM_TOSTR_CASE(PM_WAIT_CHECKPOINTER);
            PM_TOSTR_CASE(PM_NO_CHILDREN);
    }
 #undef PM_TOSTR_CASE
@@ -3613,6 +3619,8 @@ ExitPostmaster(int status)
 static void
 process_pm_pmsignal(void)
 {
+   bool        request_state_update = false;
+
    pending_pm_pmsignal = false;
 
    ereport(DEBUG2,
@@ -3724,9 +3732,67 @@ process_pm_pmsignal(void)
        WalReceiverRequested = true;
    }
 
+   if (CheckPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN))
+   {
+       /* Checkpointer completed the shutdown checkpoint */
+       if (pmState == PM_WAIT_XLOG_SHUTDOWN)
+       {
+           /*
+            * If we have an archiver subprocess, tell it to do a last archive
+            * cycle and quit. Likewise, if we have walsender processes, tell
+            * them to send any remaining WAL and quit.
+            */
+           Assert(Shutdown > NoShutdown);
+
+           /* Waken archiver for the last time */
+           if (PgArchPMChild != NULL)
+               signal_child(PgArchPMChild, SIGUSR2);
+
+           /*
+            * Waken walsenders for the last time. No regular backends should
+            * be around anymore.
+            */
+           SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
+
+           UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+       }
+       else if (!FatalError && Shutdown != ImmediateShutdown)
+       {
+           /*
+            * Checkpointer only ought to perform the shutdown checkpoint
+            * during shutdown.  If somehow checkpointer did so in another
+            * situation, we have no choice but to crash-restart.
+            *
+            * It's possible however that we get PMSIGNAL_XLOG_IS_SHUTDOWN
+            * outside of PM_WAIT_XLOG_SHUTDOWN if an orderly shutdown was
+            * "interrupted" by a crash or an immediate shutdown.
+            */
+           ereport(LOG,
+                   (errmsg("WAL was shut down unexpectedly")));
+
+           /*
+            * Doesn't seem likely to help to take send_abort_for_crash into
+            * account here.
+            */
+           HandleFatalError(PMQUIT_FOR_CRASH, false);
+       }
+
+       /*
+        * Need to run PostmasterStateMachine() to check if we already can go
+        * to the next state.
+        */
+       request_state_update = true;
+   }
+
    /*
     * Try to advance postmaster's state machine, if a child requests it.
-    *
+    */
+   if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+   {
+       request_state_update = true;
+   }
+
+   /*
     * Be careful about the order of this action relative to this function's
     * other actions.  Generally, this should be after other actions, in case
     * they have effects PostmasterStateMachine would need to know about.
@@ -3734,7 +3800,7 @@ process_pm_pmsignal(void)
     * cannot have any (immediate) effect on the state machine, but does
     * depend on what state we're in now.
     */
-   if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+   if (request_state_update)
    {
        PostmasterStateMachine();
    }
@@ -4045,6 +4111,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
    switch (pmState)
    {
        case PM_NO_CHILDREN:
+       case PM_WAIT_CHECKPOINTER:
        case PM_WAIT_DEAD_END:
        case PM_WAIT_XLOG_ARCHIVAL:
        case PM_WAIT_XLOG_SHUTDOWN:
index 0b53cba807d4be841b549a9a402b41a25475d8b2..e199f071628987ec0626847d8c18632293dd3e89 100644 (file)
@@ -56,6 +56,7 @@ AUTOVACUUM_MAIN   "Waiting in main loop of autovacuum launcher process."
 BGWRITER_HIBERNATE "Waiting in background writer process, hibernating."
 BGWRITER_MAIN  "Waiting in main loop of background writer process."
 CHECKPOINTER_MAIN  "Waiting in main loop of checkpointer process."
+CHECKPOINTER_SHUTDOWN  "Waiting for checkpointer process to be terminated."
 LOGICAL_APPLY_MAIN "Waiting in main loop of logical replication apply process."
 LOGICAL_LAUNCHER_MAIN  "Waiting in main loop of logical replication launcher process."
 LOGICAL_PARALLEL_APPLY_MAIN    "Waiting in main loop of logical replication parallel apply process."
index 3fbe5bf11367036c34c907ce3f1622f49777b8ee..d84a383047e02d8b21665ef7fdfb2d0318522461 100644 (file)
@@ -40,9 +40,10 @@ typedef enum
    PMSIGNAL_BACKGROUND_WORKER_CHANGE,  /* background worker state change */
    PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
    PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
+   PMSIGNAL_XLOG_IS_SHUTDOWN,  /* ShutdownXLOG() completed */
 } PMSignalReason;
 
-#define NUM_PMSIGNALS (PMSIGNAL_ADVANCE_STATE_MACHINE+1)
+#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1)
 
 /*
  * Reasons why the postmaster would send SIGQUIT to its children.