Refactor code in charge of running shell-based recovery commands
author: Michael Paquier <[email protected]>
Mon, 16 Jan 2023 07:31:43 +0000 (16:31 +0900)
committer: Michael Paquier <[email protected]>
Mon, 16 Jan 2023 07:31:43 +0000 (16:31 +0900)
The code specific to the execution of archive_cleanup_command,
recovery_end_command and restore_command is moved to a new file named
shell_restore.c.  The code is split into three functions:
- shell_restore(), that attempts the execution of a shell-based
restore_command.
- shell_archive_cleanup(), for archive_cleanup_command.
- shell_recovery_end(), for recovery_end_command.

This introduces no functional changes, with failure patterns and logs
generated in consequence being the same as before (one case actually
generates one less DEBUG2 message "could not restore" when a restore
command succeeds but the follow-up stat() to check the size fails, but
that only matters with an elevel high enough).

This is preparatory work for allowing recovery modules, a facility
similar to archive modules, with callbacks shaped similarly to the
functions introduced here.

Author: Nathan Bossart
Reviewed-by: Andres Freund, Michael Paquier
Discussion: https://postgr.es/m/20221227192449.GA3672473@nathanxps13

src/backend/access/transam/Makefile
src/backend/access/transam/meson.build
src/backend/access/transam/shell_restore.c [new file with mode: 0644]
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogarchive.c
src/include/access/xlogarchive.h

index 661c55a9db789760b8ec95e22b1d02922d3a2671..099c315d0327b8a539fb273f940a9c86c16aa7ec 100644 (file)
@@ -19,6 +19,7 @@ OBJS = \
    multixact.o \
    parallel.o \
    rmgr.o \
+   shell_restore.o \
    slru.o \
    subtrans.o \
    timeline.o \
index 8920c1bfce21fbeaa09ab3b70b00cc9219a53070..3031c2f6cfbbaefdf08f068d523cb8d78d011759 100644 (file)
@@ -7,6 +7,7 @@ backend_sources += files(
   'multixact.c',
   'parallel.c',
   'rmgr.c',
+  'shell_restore.c',
   'slru.c',
   'subtrans.c',
   'timeline.c',
diff --git a/src/backend/access/transam/shell_restore.c b/src/backend/access/transam/shell_restore.c
new file mode 100644 (file)
index 0000000..7753a7d
--- /dev/null
@@ -0,0 +1,175 @@
+/*-------------------------------------------------------------------------
+ *
+ * shell_restore.c
+ *     Recovery functions for a user-specified shell command.
+ *
+ * These recovery functions use a user-specified shell command (e.g. based
+ * on the GUC restore_command).
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/shell_restore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "access/xlogarchive.h"
+#include "access/xlogrecovery.h"
+#include "common/archive.h"
+#include "common/percentrepl.h"
+#include "storage/ipc.h"
+#include "utils/wait_event.h"
+
+static void ExecuteRecoveryCommand(const char *command,
+                                  const char *commandName,
+                                  bool failOnSignal,
+                                  uint32 wait_event_info,
+                                  const char *lastRestartPointFileName);
+
+/*
+ * Attempt to execute a shell-based restore command built from 'path', 'file'
+ * and 'lastRestartPointFileName'.  Returns true if system() returned 0, false
+ * otherwise; on signal-related failures it exits or reports FATAL instead.
+ */
+bool
+shell_restore(const char *file, const char *path,
+             const char *lastRestartPointFileName)
+{
+   char       *cmd;
+   int         rc;
+
+   /* Build the restore command to execute */
+   cmd = BuildRestoreCommand(recoveryRestoreCommand, path, file,
+                             lastRestartPointFileName);
+
+   ereport(DEBUG3,
+           (errmsg_internal("executing restore command \"%s\"", cmd)));
+
+   /*
+    * Copy xlog from archival storage to XLOGDIR
+    */
+   fflush(NULL);
+   pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND);
+   rc = system(cmd);
+   pgstat_report_wait_end();
+
+   pfree(cmd);
+
+   /*
+    * Remember, we rollforward UNTIL the restore fails so failure here is
+    * just part of the process... that makes it difficult to determine
+    * whether the restore failed because there isn't an archive to restore,
+    * or because the administrator has specified the restore program
+    * incorrectly.  We have to assume the former.
+    *
+    * However, if the failure was due to any sort of signal, it's best to
+    * punt and abort recovery.  (If we "return false" here, upper levels will
+    * assume that recovery is complete and start up the database!) It's
+    * essential to abort on child SIGINT and SIGQUIT, because per spec
+    * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
+    * those it's a good bet we should have gotten it too.
+    *
+    * On SIGTERM, assume we have received a fast shutdown request, and exit
+    * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+    * child process. If we receive it first, the signal handler will call
+    * proc_exit, otherwise we do it here. If we or the child process received
+    * SIGTERM for any other reason than a fast shutdown request, postmaster
+    * will perform an immediate shutdown when it sees us exiting
+    * unexpectedly.
+    *
+    * We treat hard shell errors such as "command not found" as fatal, too.
+    */
+   if (rc != 0)
+   {
+       if (wait_result_is_signal(rc, SIGTERM))
+           proc_exit(1);
+
+       ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2,
+               (errmsg("could not restore file \"%s\" from archive: %s",
+                       file, wait_result_to_str(rc))));
+   }
+
+   return (rc == 0);
+}
+
+/*
+ * Execute archive_cleanup_command; any failure is reported as a WARNING only.
+ */
+void
+shell_archive_cleanup(const char *lastRestartPointFileName)
+{
+   ExecuteRecoveryCommand(archiveCleanupCommand, "archive_cleanup_command",
+                          false, WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND,
+                          lastRestartPointFileName);
+}
+
+/*
+ * Execute recovery_end_command; FATAL on signal-related failure, else WARNING.
+ */
+void
+shell_recovery_end(const char *lastRestartPointFileName)
+{
+   ExecuteRecoveryCommand(recoveryEndCommand, "recovery_end_command", true,
+                          WAIT_EVENT_RECOVERY_END_COMMAND,
+                          lastRestartPointFileName);
+}
+
+/*
+ * Attempt to execute an external shell command during recovery.
+ *
+ * 'command' is the shell command before placeholder replacement (and is
+ * what gets logged), 'commandName' its name in the logs, 'wait_event_info'
+ * the wait event reported while it runs, 'lastRestartPointFileName' the
+ * value given for the "r" placeholder.  A failure emits a WARNING, or a
+ * FATAL error if it was due to a signal and 'failOnSignal' is true.
+ * Used for recovery_end_command and archive_cleanup_command.
+ */
+static void
+ExecuteRecoveryCommand(const char *command, const char *commandName,
+                      bool failOnSignal, uint32 wait_event_info,
+                      const char *lastRestartPointFileName)
+{
+   char       *xlogRecoveryCmd;
+   int         rc;
+
+   Assert(command && commandName);
+
+   /*
+    * construct the command to be executed
+    */
+   xlogRecoveryCmd = replace_percent_placeholders(command, commandName, "r",
+                                                  lastRestartPointFileName);
+
+   ereport(DEBUG3,
+           (errmsg_internal("executing %s \"%s\"", commandName, command)));
+
+   /*
+    * execute the constructed command
+    */
+   fflush(NULL);
+   pgstat_report_wait_start(wait_event_info);
+   rc = system(xlogRecoveryCmd);
+   pgstat_report_wait_end();
+
+   pfree(xlogRecoveryCmd);
+
+   if (rc != 0)
+   {
+       /*
+        * If the failure was due to any sort of signal, it's best to punt and
+        * abort recovery.  See comments in shell_restore().
+        */
+       ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING,
+       /*------
+          translator: First %s represents a postgresql.conf parameter name like
+         "recovery_end_command", the 2nd is the value of that parameter, the
+         third an already translated error message. */
+               (errmsg("%s \"%s\": %s", commandName,
+                       command, wait_result_to_str(rc))));
+   }
+}
index 0070d56b0b0429c0ae35bc538f278cbd2d363dd2..8f47fb7570099208817c9a66d3b0eb43ac509dd9 100644 (file)
@@ -692,6 +692,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
+static void GetOldestRestartPointFileName(char *fname);
 
 static void WALInsertLockAcquire(void);
 static void WALInsertLockAcquireExclusive(void);
@@ -4887,10 +4888,12 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
     * Execute the recovery_end_command, if any.
     */
    if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
-       ExecuteRecoveryCommand(recoveryEndCommand,
-                              "recovery_end_command",
-                              true,
-                              WAIT_EVENT_RECOVERY_END_COMMAND);
+   {
+       char        lastRestartPointFname[MAXFNAMELEN];
+
+       GetOldestRestartPointFileName(lastRestartPointFname);
+       shell_recovery_end(lastRestartPointFname);
+   }
 
    /*
     * We switched to a new timeline. Clean up segments on the old timeline.
@@ -7307,10 +7310,12 @@ CreateRestartPoint(int flags)
     * Finally, execute archive_cleanup_command, if any.
     */
    if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
-       ExecuteRecoveryCommand(archiveCleanupCommand,
-                              "archive_cleanup_command",
-                              false,
-                              WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);
+   {
+       char        lastRestartPointFname[MAXFNAMELEN];
+
+       GetOldestRestartPointFileName(lastRestartPointFname);
+       shell_archive_cleanup(lastRestartPointFname);
+   }
 
    return true;
 }
@@ -8884,6 +8889,22 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
    LWLockRelease(ControlFileLock);
 }
 
+/*
+ * Fills 'fname' (MAXFNAMELEN bytes in callers) with the WAL file name for the
+ * last checkpoint or restartpoint: the oldest file needed to restart recovery.
+ */
+static void
+GetOldestRestartPointFileName(char *fname)
+{
+   XLogRecPtr  restartRedoPtr;
+   TimeLineID  restartTli;
+   XLogSegNo   restartSegNo;
+
+   GetOldestRestartPoint(&restartRedoPtr, &restartTli);
+   XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
+   XLogFileName(fname, restartTli, restartSegNo, wal_segment_size);
+}
+
 /* Thin wrapper around ShutdownWalRcv(). */
 void
 XLogShutdownWalRcv(void)
index fcc87ff44fd26f02a43fea0d98d48ca3c084aa07..b5cb060d55635609a4ffb098baaf190e22b4f3ed 100644 (file)
@@ -23,7 +23,6 @@
 #include "access/xlog_internal.h"
 #include "access/xlogarchive.h"
 #include "common/archive.h"
-#include "common/percentrepl.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/startup.h"
@@ -57,9 +56,8 @@ RestoreArchivedFile(char *path, const char *xlogfname,
                    bool cleanupEnabled)
 {
    char        xlogpath[MAXPGPATH];
-   char       *xlogRestoreCmd;
    char        lastRestartPointFname[MAXPGPATH];
-   int         rc;
+   bool        ret;
    struct stat stat_buf;
    XLogSegNo   restartSegNo;
    XLogRecPtr  restartRedoPtr;
@@ -150,15 +148,6 @@ RestoreArchivedFile(char *path, const char *xlogfname,
    else
        XLogFileName(lastRestartPointFname, 0, 0L, wal_segment_size);
 
-   /* Build the restore command to execute */
-   xlogRestoreCmd = BuildRestoreCommand(recoveryRestoreCommand,
-                                        xlogpath, xlogfname,
-                                        lastRestartPointFname);
-
-   ereport(DEBUG3,
-           (errmsg_internal("executing restore command \"%s\"",
-                            xlogRestoreCmd)));
-
    /*
     * Check signals before restore command and reset afterwards.
     */
@@ -167,15 +156,11 @@ RestoreArchivedFile(char *path, const char *xlogfname,
    /*
     * Copy xlog from archival storage to XLOGDIR
     */
-   fflush(NULL);
-   pgstat_report_wait_start(WAIT_EVENT_RESTORE_COMMAND);
-   rc = system(xlogRestoreCmd);
-   pgstat_report_wait_end();
+   ret = shell_restore(xlogfname, xlogpath, lastRestartPointFname);
 
    PostRestoreCommand();
-   pfree(xlogRestoreCmd);
 
-   if (rc == 0)
+   if (ret)
    {
        /*
         * command apparently succeeded, but let's make sure the file is
@@ -231,37 +216,6 @@ RestoreArchivedFile(char *path, const char *xlogfname,
        }
    }
 
-   /*
-    * Remember, we rollforward UNTIL the restore fails so failure here is
-    * just part of the process... that makes it difficult to determine
-    * whether the restore failed because there isn't an archive to restore,
-    * or because the administrator has specified the restore program
-    * incorrectly.  We have to assume the former.
-    *
-    * However, if the failure was due to any sort of signal, it's best to
-    * punt and abort recovery.  (If we "return false" here, upper levels will
-    * assume that recovery is complete and start up the database!) It's
-    * essential to abort on child SIGINT and SIGQUIT, because per spec
-    * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
-    * those it's a good bet we should have gotten it too.
-    *
-    * On SIGTERM, assume we have received a fast shutdown request, and exit
-    * cleanly. It's pure chance whether we receive the SIGTERM first, or the
-    * child process. If we receive it first, the signal handler will call
-    * proc_exit, otherwise we do it here. If we or the child process received
-    * SIGTERM for any other reason than a fast shutdown request, postmaster
-    * will perform an immediate shutdown when it sees us exiting
-    * unexpectedly.
-    *
-    * We treat hard shell errors such as "command not found" as fatal, too.
-    */
-   if (wait_result_is_signal(rc, SIGTERM))
-       proc_exit(1);
-
-   ereport(wait_result_is_any_signal(rc, true) ? FATAL : DEBUG2,
-           (errmsg("could not restore file \"%s\" from archive: %s",
-                   xlogfname, wait_result_to_str(rc))));
-
 not_available:
 
    /*
@@ -275,74 +229,6 @@ not_available:
    return false;
 }
 
-/*
- * Attempt to execute an external shell command during recovery.
- *
- * 'command' is the shell command to be executed, 'commandName' is a
- * human-readable name describing the command emitted in the logs. If
- * 'failOnSignal' is true and the command is killed by a signal, a FATAL
- * error is thrown. Otherwise a WARNING is emitted.
- *
- * This is currently used for recovery_end_command and archive_cleanup_command.
- */
-void
-ExecuteRecoveryCommand(const char *command, const char *commandName,
-                      bool failOnSignal, uint32 wait_event_info)
-{
-   char       *xlogRecoveryCmd;
-   char        lastRestartPointFname[MAXPGPATH];
-   int         rc;
-   XLogSegNo   restartSegNo;
-   XLogRecPtr  restartRedoPtr;
-   TimeLineID  restartTli;
-
-   Assert(command && commandName);
-
-   /*
-    * Calculate the archive file cutoff point for use during log shipping
-    * replication. All files earlier than this point can be deleted from the
-    * archive, though there is no requirement to do so.
-    */
-   GetOldestRestartPoint(&restartRedoPtr, &restartTli);
-   XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
-   XLogFileName(lastRestartPointFname, restartTli, restartSegNo,
-                wal_segment_size);
-
-   /*
-    * construct the command to be executed
-    */
-   xlogRecoveryCmd = replace_percent_placeholders(command, commandName, "r", lastRestartPointFname);
-
-   ereport(DEBUG3,
-           (errmsg_internal("executing %s \"%s\"", commandName, command)));
-
-   /*
-    * execute the constructed command
-    */
-   fflush(NULL);
-   pgstat_report_wait_start(wait_event_info);
-   rc = system(xlogRecoveryCmd);
-   pgstat_report_wait_end();
-
-   pfree(xlogRecoveryCmd);
-
-   if (rc != 0)
-   {
-       /*
-        * If the failure was due to any sort of signal, it's best to punt and
-        * abort recovery.  See comments in RestoreArchivedFile().
-        */
-       ereport((failOnSignal && wait_result_is_any_signal(rc, true)) ? FATAL : WARNING,
-       /*------
-          translator: First %s represents a postgresql.conf parameter name like
-         "recovery_end_command", the 2nd is the value of that parameter, the
-         third an already translated error message. */
-               (errmsg("%s \"%s\": %s", commandName,
-                       command, wait_result_to_str(rc))));
-   }
-}
-
-
 /*
  * A file was restored from the archive under a temporary filename (path),
  * and now we want to keep it. Rename it under the permanent filename in
index 31ff20603406b1de5ee2420ef6e27e0ca8ec55f5..299304703e16ce52c0ad58c65550c89ef568274e 100644 (file)
@@ -20,8 +20,6 @@
 extern bool RestoreArchivedFile(char *path, const char *xlogfname,
                                const char *recovername, off_t expectedSize,
                                bool cleanupEnabled);
-extern void ExecuteRecoveryCommand(const char *command, const char *commandName,
-                                  bool failOnSignal, uint32 wait_event_info);
 extern void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
 extern void XLogArchiveNotify(const char *xlog);
 extern void XLogArchiveNotifySeg(XLogSegNo segno, TimeLineID tli);
@@ -32,4 +30,9 @@ extern bool XLogArchiveIsReady(const char *xlog);
 extern bool XLogArchiveIsReadyOrDone(const char *xlog);
 extern void XLogArchiveCleanup(const char *xlog);
 
+extern bool shell_restore(const char *file, const char *path,
+                         const char *lastRestartPointFileName);
+extern void shell_archive_cleanup(const char *lastRestartPointFileName);
+extern void shell_recovery_end(const char *lastRestartPointFileName);
+
 #endif                         /* XLOG_ARCHIVE_H */