static bool lastFullPageWrites;
/*
- * Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state".
+ * Local copy of the state tracked by SharedRecoveryState in shared memory.
+ * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
+ * means "not known, need to check the shared state".
*/
static bool LocalRecoveryInProgress = true;
char archiveCleanupCommand[MAXPGPATH];
/*
- * SharedRecoveryInProgress indicates if we're still in crash or archive
+ * SharedRecoveryState indicates if we're still in crash or archive
* recovery. Protected by info_lck.
*/
- bool SharedRecoveryInProgress;
+ RecoveryState SharedRecoveryState;
/*
 * SharedHotStandbyActive indicates if we allow hot standby queries to be
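For reference, the RecoveryState type introduced above could be declared in xlog.h along these lines (a sketch: the three value names come straight from this diff, while the exact comments and the explicit zero value are assumptions):

	/* Sketch of the assumed xlog.h addition; states are kept consistent
	 * with the control file, see SharedRecoveryState. */
	typedef enum RecoveryState
	{
		RECOVERY_STATE_CRASH = 0,	/* the system is in crash recovery */
		RECOVERY_STATE_ARCHIVE,		/* the system is in archive recovery */
		RECOVERY_STATE_DONE			/* recovery done, normal operation */
	} RecoveryState;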
updateMinRecoveryPoint = true;
UpdateControlFile();
+
+ /*
+	 * We update SharedRecoveryState while holding ControlFileLock so
+	 * that the control file and the state in shared memory stay
+	 * consistent.
+ */
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+ SpinLockRelease(&XLogCtl->info_lck);
+
LWLockRelease(ControlFileLock);
CheckRecoveryConsistency();
* in additional info.)
*/
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
- XLogCtl->SharedRecoveryInProgress = true;
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
XLogCtl->SharedHotStandbyActive = false;
XLogCtl->WalWriterSleeping = false;
*/
dbstate_at_startup = ControlFile->state;
if (InArchiveRecovery)
+ {
ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
else
{
ereport(LOG,
ControlFile->checkPointCopy.ThisTimeLineID,
recoveryTargetTLI)));
ControlFile->state = DB_IN_CRASH_RECOVERY;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
+ SpinLockRelease(&XLogCtl->info_lck);
}
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = checkPointLoc;
* updates to shared memory.)
*/
SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->SharedRecoveryInProgress = false;
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
SpinLockRelease(&XLogCtl->info_lck);
/*
*/
volatile XLogCtlData *xlogctl = XLogCtl;
- LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+ LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
/*
* Initialize TimeLineID and RedoRecPtr when we discover that recovery
{
/*
* If we just exited recovery, make sure we read TimeLineID and
- * RedoRecPtr after SharedRecoveryInProgress (for machines with
- * weak memory ordering).
+ * RedoRecPtr after SharedRecoveryState (for machines with weak
+ * memory ordering).
*/
pg_memory_barrier();
InitXLOGAccess();
}
}
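To make the pairing that this comment relies on explicit, here is a simplified sketch of the two sides involved (an illustration only, not part of the diff; the writer-side ordering comes from the spinlock in the exit-from-recovery hunk shown earlier):

	/* Writer, when recovery ends (simplified): publish the shared
	 * timeline first, then flip the state under the spinlock, whose
	 * release provides the write-side ordering. */
	XLogCtl->ThisTimeLineID = ThisTimeLineID;
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
	SpinLockRelease(&XLogCtl->info_lck);

	/* Reader, in RecoveryInProgress() (simplified): once the state is
	 * seen as RECOVERY_STATE_DONE, pg_memory_barrier() guarantees that
	 * the reads done by InitXLOGAccess() see the values published
	 * before the flip. */
	if (XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE)
	{
		pg_memory_barrier();
		InitXLOGAccess();
	}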
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * The returned state is kept consistent with the contents of the control
+ * file.  See xlog.h for details about the possible values of RecoveryState.
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+ RecoveryState retval;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ retval = XLogCtl->SharedRecoveryState;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return retval;
+}
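As a usage illustration, archiver-side code could consult the new function like this (a hypothetical sketch, not part of this diff; ShouldCreateReadyFile is an invented name, while XLogArchivingActive() and XLogArchivingAlways() are the existing xlog.h macros):

	/*
	 * Hypothetical sketch: decide whether a finished WAL segment should
	 * be marked with a .ready file.  With archive_mode = always a
	 * standby archives its own segments too; with archive_mode = on,
	 * archiving should only happen once recovery is over.
	 */
	static bool
	ShouldCreateReadyFile(void)
	{
		if (!XLogArchivingActive())
			return false;		/* archiving disabled */
		if (XLogArchivingAlways())
			return true;		/* archive even while in recovery */
		return GetRecoveryState() == RECOVERY_STATE_DONE;
	}

This mirrors what the TAP test below exercises: .ready files kept or created with archive_mode = always on a standby, and only .done files with archive_mode = on.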
+
/*
* Is HotStandby active yet? This is only important in special backends
* since normal backends won't ever be able to connect until this returns
--- /dev/null
+#
+# Tests related to WAL archiving and recovery.
+#
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 16;
+use Config;
+
+my $primary = get_new_node('master');
+$primary->init(
+ has_archiving => 1,
+ allows_streaming => 1);
+$primary->append_conf('postgresql.conf', 'autovacuum = off');
+$primary->start;
+my $primary_data = $primary->data_dir;
+
+# Temporarily use an archive_command value that makes the archiver fail,
+# while archiving itself remains enabled.  Note that we cannot use a
+# command that does not exist, as in this case the archiver process would
+# just exit without reporting the failure to pg_stat_archiver.  This also
+# cannot use a plain "false" as that's unportable on Windows.  So, instead,
+# as a portable solution, use an archive command based on a command that is
+# known to exist but is bound to fail: a copy with an incorrect source path.
+my $incorrect_command =
+ $TestLib::windows_os
+ ? qq{copy "%p_does_not_exist" "%f_does_not_exist"}
+ : qq{cp "%p_does_not_exist" "%f_does_not_exist"};
+$primary->safe_psql(
+ 'postgres', qq{
+ ALTER SYSTEM SET archive_command TO '$incorrect_command';
+ SELECT pg_reload_conf();
+});
+
+# Save the WAL segment currently in use and switch to a new segment.
+# This will be used to track the activity of the archiver.
+my $segment_name_1 = $primary->safe_psql('postgres',
+ q{SELECT pg_xlogfile_name(pg_current_xlog_location())});
+my $segment_path_1 = "pg_xlog/archive_status/$segment_name_1";
+my $segment_path_1_ready = "$segment_path_1.ready";
+my $segment_path_1_done = "$segment_path_1.done";
+$primary->safe_psql(
+ 'postgres', q{
+ CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
+ SELECT pg_switch_xlog();
+ CHECKPOINT;
+});
+
+# Wait for an archive failure.
+$primary->poll_query_until('postgres',
+ q{SELECT failed_count > 0 FROM pg_stat_archiver}, 't')
+ or die "Timed out while waiting for archiving to fail";
+ok( -f "$primary_data/$segment_path_1_ready",
+ ".ready file exists for WAL segment $segment_name_1 waiting to be archived"
+);
+ok( !-f "$primary_data/$segment_path_1_done",
+ ".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
+);
+
+is( $primary->safe_psql(
+ 'postgres', q{
+ SELECT archived_count, last_failed_wal
+ FROM pg_stat_archiver
+ }),
+ "0|$segment_name_1",
+ "pg_stat_archiver failed to archive $segment_name_1");
+
+# Crash the cluster for the next test, which checks that non-archived
+# WAL segments are not removed.
+$primary->stop('immediate');
+
+# The recovery tests for archiving with a standby partially check the
+# recovery behavior when restoring a backup taken using a snapshot
+# with no pg_start/stop_backup.  In this situation, the recovered
+# standby should first enter crash recovery, then switch to regular
+# archive recovery.  Note that the base backup is taken here while
+# archive_command is still failing; the tests done with the standbys
+# below rely on that.
+$primary->backup_fs_cold('backup');
+
+$primary->start;
+ok( -f "$primary_data/$segment_path_1_ready",
+ ".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
+);
+
+# Allow WAL archiving again and wait for a success.
+$primary->safe_psql(
+ 'postgres', q{
+ ALTER SYSTEM RESET archive_command;
+ SELECT pg_reload_conf();
+});
+
+$primary->poll_query_until('postgres',
+ q{SELECT archived_count = 1 FROM pg_stat_archiver})
+ or die "Timed out while waiting for archiving to finish";
+
+ok(!-f "$primary_data/$segment_path_1_ready",
+ ".ready file for archived WAL segment $segment_name_1 removed");
+
+ok(-f "$primary_data/$segment_path_1_done",
+ ".done file for archived WAL segment $segment_name_1 exists");
+
+is( $primary->safe_psql(
+ 'postgres', q{ SELECT last_archived_wal FROM pg_stat_archiver }),
+ $segment_name_1,
+ "archive success reported in pg_stat_archiver for WAL segment $segment_name_1"
+);
+
+# Create some WAL activity and a new checkpoint so that the next standby
+# can create a restartpoint.  As this standby starts in crash recovery
+# because of the cold backup taken previously, it needs a clean
+# restartpoint to deal with the existing status files.
+$primary->safe_psql(
+ 'postgres', q{
+ INSERT INTO mine SELECT generate_series(10,20) AS x;
+ SELECT pg_switch_xlog();
+ CHECKPOINT;
+});
+my $segment_name_2 = $primary->safe_psql('postgres',
+ q{SELECT pg_xlogfile_name(pg_current_xlog_location())});
+my $segment_path_2 = "pg_xlog/archive_status/$segment_name_2";
+my $segment_path_2_ready = "$segment_path_2.ready";
+my $segment_path_2_done = "$segment_path_2.done";
+
+# Test standby with archive_mode = on.
+my $standby1 = get_new_node('standby');
+$standby1->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby1->append_conf('postgresql.conf', q{
+archive_mode = on
+wal_keep_segments = 0
+});
+my $standby1_data = $standby1->data_dir;
+$standby1->start;
+# First restartpoint
+$standby1->safe_psql('postgres', q{CHECKPOINT});
+
+# Segments are cleaned up after the second restartpoint, so create a
+# second one.
+$primary->safe_psql(
+ 'postgres', q{
+ INSERT INTO mine SELECT generate_series(21,30) AS x;
+ CHECKPOINT;
+ SELECT pg_switch_xlog();
+});
+
+# Make sure that the standby has caught up here.
+my $primary_lsn = $primary->safe_psql('postgres',
+ q{SELECT pg_current_xlog_location()});
+$standby1->poll_query_until('postgres',
+ qq{ SELECT pg_xlog_location_diff(pg_last_xlog_replay_location(), '$primary_lsn') >= 0 })
+ or die "Timed out while waiting for xlog replay";
+
+# Second restartpoint.
+$standby1->safe_psql('postgres', q{CHECKPOINT});
+
+# Recovery with archive_mode=on should have removed the .ready signal
+# files inherited from the backup after the two restartpoints above.
+# Note that this WAL segment existed in the backup.
+ok( !-f "$standby1_data/$segment_path_1_ready",
+ ".ready file for WAL segment $segment_name_1 present in backup removed with archive_mode=on on standby"
+);
+
+# Recovery with archive_mode=on should not create .ready files.
+# Note that this segment did not exist in the backup.
+ok( !-f "$standby1_data/$segment_path_2_ready",
+ ".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
+);
+
+# Recovery with archive_mode = on creates .done files.
+ok( -f "$standby1_data/$segment_path_2_done",
+ ".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
+);
+
+# Test recovery with archive_mode = always, which should keep .ready
+# files if archiving is enabled.  Here we want the archive command to
+# fail so that the .ready files persist.  Note that this node has
+# inherited the failing archive command set before the cold backup was
+# taken, so archiving failures are expected.
+my $standby2 = get_new_node('standby2');
+$standby2->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby2->append_conf('postgresql.conf', 'archive_mode = always');
+my $standby2_data = $standby2->data_dir;
+$standby2->start;
+
+$standby2->safe_psql('postgres', q{CHECKPOINT});
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+ ".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_2_ready",
+ ".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
+);
+
+# Reset statistics of the archiver for the next checks.
+$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});
+
+# Now crash the cluster to check that the recovery step does not remove
+# non-archived WAL segments on a standby where archiving is enabled.
+$standby2->stop('immediate');
+$standby2->start;
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+ "WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
+);
+
+# Allow WAL archiving again, and wait for the segments to be archived.
+$standby2->safe_psql(
+ 'postgres', q{
+ ALTER SYSTEM RESET archive_command;
+ SELECT pg_reload_conf();
+});
+$standby2->poll_query_until('postgres',
+ qq{SELECT last_archived_wal = '$segment_name_2' FROM pg_stat_archiver})
+ or die "Timed out while waiting for archiving to finish";
+
+is( $standby2->safe_psql(
+ 'postgres', q{SELECT archived_count FROM pg_stat_archiver}),
+ '3',
+ 'correct number of WAL segments archived from standby');
+
+ok( !-f "$standby2_data/$segment_path_1_ready"
+ && !-f "$standby2_data/$segment_path_2_ready",
+ ".ready files removed after archive success with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_1_done"
+ && -f "$standby2_data/$segment_path_2_done",
+ ".done files created after archive success with archive_mode=always on standby"
+);