Fix pg_rewind bugs when rewinding a standby server.

author Heikki Linnakangas <[email protected]>

Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)

committer Heikki Linnakangas <[email protected]>

Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
author Heikki Linnakangas <[email protected]>
Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
committer Heikki Linnakangas <[email protected]>
Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c

index eae1797f948f6445cc43b2a3f2d9dd74f30093e6..9275cba51bd65afc22009204b22c31b994a8f3bb 100644 (file)
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -55,6 +55,9 @@ static int    SimpleXLogPageRead(XLogReaderState *xlogreader,
   * Read WAL from the datadir/pg_wal, starting from 'startpoint' on timeline
   * index 'tliIndex' in target timeline history, until 'endpoint'. Make note of
   * the data blocks touched by the WAL records, and return them in a page map.
+ *
+ * 'endpoint' is the end of the last record to read. The record starting at
+ * 'endpoint' is the first one that is not read.
   */
  void
  extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
@@ -93,7 +96,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
  
         extractPageInfo(xlogreader);
  
-   } while (xlogreader->ReadRecPtr != endpoint);
+   } while (xlogreader->EndRecPtr < endpoint);
+
+   /*
+    * If 'endpoint' didn't point exactly at a record boundary, the caller
+    * messed up.
+    */
+   Assert(xlogreader->EndRecPtr == endpoint);
  
     XLogReaderFree(xlogreader);
     if (xlogreadfd != -1)
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c

index 2bbf8e74385fb7f672d8c7148e006f2767e9b3fb..d89c08f81da08216d96be1f95a44c3e7e5ea94a1 100644 (file)
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -129,6 +129,7 @@ main(int argc, char **argv)
     XLogRecPtr  chkptrec;
     TimeLineID  chkpttli;
     XLogRecPtr  chkptredo;
+   XLogRecPtr  target_wal_endrec;
     size_t      size;
     char       *buffer;
     bool        no_ensure_shutdown = false;
@@ -335,43 +336,56 @@ main(int argc, char **argv)
     {
         pg_log_info("source and target cluster are on the same timeline");
         rewind_needed = false;
+       target_wal_endrec = 0;
     }
     else
     {
+       XLogRecPtr  chkptendrec;
+
         findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
         pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
                     (uint32) (divergerec >> 32), (uint32) divergerec,
                     targetHistory[lastcommontliIndex].tli);
  
+       /*
+        * Determine the end-of-WAL on the target.
+        *
+        * The WAL ends at the last shutdown checkpoint, or at
+        * minRecoveryPoint if it was a standby. (If we supported rewinding a
+        * server that was not shut down cleanly, we would need to replay
+        * until we reach the first invalid record, like crash recovery does.)
+        */
+
+       /* read the checkpoint record on the target to see where it ends. */
+       chkptendrec = readOneRecord(datadir_target,
+                                   ControlFile_target.checkPoint,
+                                   targetNentries - 1,
+                                   restore_command);
+
+       if (ControlFile_target.minRecoveryPoint > chkptendrec)
+       {
+           target_wal_endrec = ControlFile_target.minRecoveryPoint;
+       }
+       else
+       {
+           target_wal_endrec = chkptendrec;
+       }
+
         /*
          * Check for the possibility that the target is in fact a direct
          * ancestor of the source. In that case, there is no divergent history
          * in the target that needs rewinding.
          */
-       if (ControlFile_target.checkPoint >= divergerec)
+       if (target_wal_endrec > divergerec)
         {
             rewind_needed = true;
         }
         else
         {
-           XLogRecPtr  chkptendrec;
+           /* the last common checkpoint record must be part of target WAL */
+           Assert(target_wal_endrec == divergerec);
  
-           /* Read the checkpoint record on the target to see where it ends. */
-           chkptendrec = readOneRecord(datadir_target,
-                                       ControlFile_target.checkPoint,
-                                       targetNentries - 1,
-                                       restore_command);
-
-           /*
-            * If the histories diverged exactly at the end of the shutdown
-            * checkpoint record on the target, there are no WAL records in
-            * the target that don't belong in the source's history, and no
-            * rewind is needed.
-            */
-           if (chkptendrec == divergerec)
-               rewind_needed = false;
-           else
-               rewind_needed = true;
+           rewind_needed = false;
         }
     }
  
@@ -407,14 +421,12 @@ main(int argc, char **argv)
     /*
      * Read the target WAL from last checkpoint before the point of fork, to
      * extract all the pages that were modified on the target cluster after
-    * the fork. We can stop reading after reaching the final shutdown record.
-    * XXX: If we supported rewinding a server that was not shut down cleanly,
-    * we would need to replay until the end of WAL here.
+    * the fork.
      */
     if (showprogress)
         pg_log_info("reading WAL in target");
     extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
-                  ControlFile_target.checkPoint, restore_command);
+                  target_wal_endrec, restore_command);
  
     /*
      * We have collected all information we need from both systems. Decide
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl

new file mode 100644 (file)

index 0000000..8f42016
--- /dev/null
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -0,0 +1,156 @@
+#
+# Test situation where a target data directory contains
+# WAL records beyond both the last checkpoint and the divergence
+# point:
+#
+# Target WAL (TLI 2):
+#
+# backup ... Checkpoint A ... INSERT 'rewind this'
+#            (TLI 1 -> 2)
+#
+#            ^ last common                        ^ minRecoveryPoint
+#              checkpoint
+#
+# Source WAL (TLI 3):
+#
+# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
+#            (TLI 1 -> 2)     (TLI 2 -> 3)
+#
+#
+# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
+# after the last common checkpoint that needs to be rewound. We used to
+# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
+# that the target doesn't need rewinding in this scenario, because the
+# last checkpoint on the target TLI was an ancestor of the source TLI.
+#
+#
+# This test does not make use of RewindTest as it requires three
+# nodes.
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+use File::Copy;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_1 = get_new_node('node_1');
+$node_1->init(allows_streaming => 1);
+$node_1->append_conf('postgresql.conf', qq(
+wal_keep_size='100 MB'
+));
+
+$node_1->start;
+
+# Create a couple of test tables
+$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
+$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
+$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");
+
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_1->backup($backup_name);
+
+# Create streaming standby from backup
+my $node_2 = get_new_node('node_2');
+$node_2->init_from_backup($node_1, $backup_name,
+   has_streaming => 1);
+$node_2->start;
+
+# Create streaming standby from backup
+my $node_3 = get_new_node('node_3');
+$node_3->init_from_backup($node_1, $backup_name,
+   has_streaming => 1);
+$node_3->start;
+
+# Stop node_1
+
+$node_1->stop('fast');
+
+# Promote node_3
+$node_3->promote;
+
+# node_1 rejoins node_3
+
+my $node_3_connstr = $node_3->connstr;
+
+$node_1->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_1->set_standby_mode();
+$node_1->start();
+
+# node_2 follows node_3
+
+$node_2->append_conf('postgresql.conf', qq(
+primary_conninfo='$node_3_connstr'
+));
+$node_2->restart();
+
+# Promote node_1
+
+$node_1->promote;
+
+# We now have a split-brain with two primaries. Insert a row on both to
+# demonstratively create a split brain. After the rewind, we should only
+# see the insert on 1, as the insert on node 3 is rewound away.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
+
+# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
+# rewind doesn't correctly rewind the changes made on the other node,
+# we might fail to notice if the inserts are invisible because the XIDs
+# are not marked as committed.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
+
+# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
+# overwritten by replaying the WAL from node 1.
+$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
+
+# Wait for node 2 to catch up
+$node_2->poll_query_until('postgres',
+   q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
+
+# At this point node_2 will shut down without a shutdown checkpoint,
+# but with WAL entries beyond the preceding shutdown checkpoint.
+$node_2->stop('fast');
+$node_3->stop('fast');
+
+my $node_2_pgdata = $node_2->data_dir;
+my $node_1_connstr = $node_1->connstr;
+
+# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
+copy(
+   "$node_2_pgdata/postgresql.conf",
+   "$tmp_folder/node_2-postgresql.conf.tmp");
+
+command_ok(
+    [
+        'pg_rewind',
+        "--source-server=$node_1_connstr",
+        "--target-pgdata=$node_2_pgdata"
+    ],
+   'pg_rewind detects rewind needed');
+
+# Now move back postgresql.conf with old settings
+move(
+   "$tmp_folder/node_2-postgresql.conf.tmp",
+   "$node_2_pgdata/postgresql.conf");
+
+$node_2->start;
+
+# Check contents of the test tables after rewind. The rows inserted in node 3
+# before rewind should've been overwritten with the data from node 1.
+my $result;
+$result = $node_2->safe_psql('postgres', 'checkpoint');
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
+is($result, qq(keep this
+and this
+and this too), 'table foo after rewind');
+
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
+is($result, qq(in both), 'table bar after rewind');
author	Heikki Linnakangas <[email protected]>
	Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
committer	Heikki Linnakangas <[email protected]>
	Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
src/bin/pg_rewind/parsexlog.c		patch \| blob \| blame \| history
src/bin/pg_rewind/pg_rewind.c		patch \| blob \| blame \| history
src/bin/pg_rewind/t/008_min_recovery_point.pl	[new file with mode: 0644]	patch \| blob