Fix pg_rewind bugs when rewinding a standby server.

author Heikki Linnakangas <[email protected]>

Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)

committer Heikki Linnakangas <[email protected]>

Thu, 3 Dec 2020 13:58:06 +0000 (15:58 +0200)
author Heikki Linnakangas <[email protected]>
Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
committer Heikki Linnakangas <[email protected]>
Thu, 3 Dec 2020 13:58:06 +0000 (15:58 +0200)
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c

index a5d1a5dddde5a4e83fe732ba108455c1eec4aea6..ac14b3dd9ca086f40a8a6f9a801e928176c1b29b 100644 (file)
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -57,6 +57,9 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader,
   * Read WAL from the datadir/pg_xlog, starting from 'startpoint' on timeline
   * 'tli', until 'endpoint'. Make note of the data blocks touched by the WAL
   * records, and return them in a page map.
+ *
+ * 'endpoint' is the end of the last record to read. The record starting at
+ * 'endpoint' is the first one that is not read.
   */
  void
  extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
@@ -96,7 +99,13 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, TimeLineID tli,
  
         startpoint = InvalidXLogRecPtr; /* continue reading at next record */
  
-   } while (xlogreader->ReadRecPtr != endpoint);
+   } while (xlogreader->EndRecPtr < endpoint);
+
+   /*
+    * If 'endpoint' didn't point exactly at a record boundary, the caller
+    * messed up.
+    */
+   Assert(xlogreader->EndRecPtr == endpoint);
  
     XLogReaderFree(xlogreader);
     if (xlogreadfd != -1)
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c

index cc0568064f98a8b4adc94a403661c5f25f3c7180..7256976af3e780c624dbb7ccf7e3c430426fb956 100644 (file)
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -93,6 +93,7 @@ main(int argc, char **argv)
     XLogRecPtr  chkptrec;
     TimeLineID  chkpttli;
     XLogRecPtr  chkptredo;
+   XLogRecPtr  target_wal_endrec;
     size_t      size;
     char       *buffer;
     bool        rewind_needed;
@@ -223,41 +224,54 @@ main(int argc, char **argv)
     {
         printf(_("source and target cluster are on the same timeline\n"));
         rewind_needed = false;
+       target_wal_endrec = 0;
     }
     else
     {
+       XLogRecPtr  chkptendrec;
+
         findCommonAncestorTimeline(&divergerec, &lastcommontli);
         printf(_("servers diverged at WAL position %X/%X on timeline %u\n"),
                (uint32) (divergerec >> 32), (uint32) divergerec, lastcommontli);
  
         /*
-        * Check for the possibility that the target is in fact a direct ancestor
-        * of the source. In that case, there is no divergent history in the
-        * target that needs rewinding.
+        * Determine the end-of-WAL on the target.
+        *
+        * The WAL ends at the last shutdown checkpoint, or at
+        * minRecoveryPoint if it was a standby. (If we supported rewinding a
+        * server that was not shut down cleanly, we would need to replay
+        * until we reach the first invalid record, like crash recovery does.)
          */
-       if (ControlFile_target.checkPoint >= divergerec)
+
+       /* read the checkpoint record on the target to see where it ends. */
+       chkptendrec = readOneRecord(datadir_target,
+                                   ControlFile_target.checkPoint,
+                          ControlFile_target.checkPointCopy.ThisTimeLineID);
+
+       if (ControlFile_target.minRecoveryPoint > chkptendrec)
         {
-           rewind_needed = true;
+           target_wal_endrec = ControlFile_target.minRecoveryPoint;
         }
         else
         {
-           XLogRecPtr  chkptendrec;
+           target_wal_endrec = chkptendrec;
+       }
  
-           /* Read the checkpoint record on the target to see where it ends. */
-           chkptendrec = readOneRecord(datadir_target,
-                                       ControlFile_target.checkPoint,
-                          ControlFile_target.checkPointCopy.ThisTimeLineID);
+       /*
+        * Check for the possibility that the target is in fact a direct
+        * ancestor of the source. In that case, there is no divergent history
+        * in the target that needs rewinding.
+        */
+       if (target_wal_endrec > divergerec)
+       {
+           rewind_needed = true;
+       }
+       else
+       {
+           /* the last common checkpoint record must be part of target WAL */
+           Assert(target_wal_endrec == divergerec);
  
-           /*
-            * If the histories diverged exactly at the end of the shutdown
-            * checkpoint record on the target, there are no WAL records in the
-            * target that don't belong in the source's history, and no rewind is
-            * needed.
-            */
-           if (chkptendrec == divergerec)
-               rewind_needed = false;
-           else
-               rewind_needed = true;
+           rewind_needed = false;
         }
     }
  
@@ -285,13 +299,11 @@ main(int argc, char **argv)
     /*
      * Read the target WAL from last checkpoint before the point of fork, to
      * extract all the pages that were modified on the target cluster after
-    * the fork. We can stop reading after reaching the final shutdown record.
-    * XXX: If we supported rewinding a server that was not shut down cleanly,
-    * we would need to replay until the end of WAL here.
+    * the fork.
      */
     pg_log(PG_PROGRESS, "reading WAL in target\n");
     extractPageMap(datadir_target, chkptrec, lastcommontli,
-                  ControlFile_target.checkPoint);
+                  target_wal_endrec);
     filemap_finalize();
  
     if (showprogress)
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl

new file mode 100644 (file)

index 0000000..60114e6
--- /dev/null
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -0,0 +1,165 @@
+#
+# Test situation where a target data directory contains
+# WAL records beyond both the last checkpoint and the divergence
+# point:
+#
+# Target WAL (TLI 2):
+#
+# backup ... Checkpoint A ... INSERT 'rewind this'
+#            (TLI 1 -> 2)
+#
+#            ^ last common                        ^ minRecoveryPoint
+#              checkpoint
+#
+# Source WAL (TLI 3):
+#
+# backup ... Checkpoint A ... Checkpoint B ... INSERT 'keep this'
+#            (TLI 1 -> 2)     (TLI 2 -> 3)
+#
+#
+# The last common checkpoint is Checkpoint A. But there is WAL on TLI 2
+# after the last common checkpoint that needs to be rewound. We used to
+# have a bug where minRecoveryPoint was ignored, and pg_rewind concluded
+# that the target doesn't need rewinding in this scenario, because the
+# last checkpoint on the target TLI was an ancestor of the source TLI.
+#
+#
+# This test does not make use of RewindTest as it requires three
+# nodes.
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 3;
+
+use File::Copy;
+
+my $tmp_folder = TestLib::tempdir;
+
+my $node_1 = get_new_node('node_1');
+$node_1->init(allows_streaming => 1);
+$node_1->append_conf('postgresql.conf', qq(
+wal_keep_segments='5'
+));
+
+$node_1->start;
+
+# Create a couple of test tables
+$node_1->safe_psql('postgres', 'CREATE TABLE public.foo (t TEXT)');
+$node_1->safe_psql('postgres', 'CREATE TABLE public.bar (t TEXT)');
+$node_1->safe_psql('postgres', "INSERT INTO public.bar VALUES ('in both')");
+
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_1->backup($backup_name);
+
+# Create streaming standby from backup
+my $node_2 = get_new_node('node_2');
+$node_2->init_from_backup($node_1, $backup_name,
+   has_streaming => 1);
+$node_2->start;
+
+# Create streaming standby from backup
+my $node_3 = get_new_node('node_3');
+$node_3->init_from_backup($node_1, $backup_name,
+   has_streaming => 1);
+$node_3->start;
+
+# Stop node_1
+
+$node_1->stop('fast');
+
+# Promote node_3
+$node_3->promote;
+
+# node_1 rejoins node_3
+
+my $node_3_connstr = $node_3->connstr;
+
+unlink($node_2->data_dir . '/recovery.conf');
+$node_1->append_conf('recovery.conf', qq(
+standby_mode=on
+primary_conninfo='$node_3_connstr'
+recovery_target_timeline='latest'
+));
+$node_1->start();
+
+# node_2 follows node_3
+
+unlink($node_2->data_dir . '/recovery.conf');
+$node_2->append_conf('recovery.conf', qq(
+standby_mode=on
+primary_conninfo='$node_3_connstr'
+recovery_target_timeline='latest'
+));
+$node_2->restart();
+
+# Promote node_1
+
+$node_1->promote;
+
+# Wait until nodes 1 and 3 have been fully promoted.
+$node_1->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> true");
+$node_3->poll_query_until('postgres', "SELECT pg_is_in_recovery() <> true");
+
+# We now have a split-brain with two primaries. Insert a row on both to
+# demonstratively create a split brain. After the rewind, we should only
+# see the insert on 1, as the insert on node 3 is rewound away.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('keep this')");
+
+# Insert more rows in node 1, to bump up the XID counter. Otherwise, if
+# rewind doesn't correctly rewind the changes made on the other node,
+# we might fail to notice if the inserts are invisible because the XIDs
+# are not marked as committed.
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this')");
+$node_1->safe_psql('postgres', "INSERT INTO public.foo (t) VALUES ('and this too')");
+
+# Also insert a row in 'bar' on node 3. It is unmodified in node 1, so it won't get
+# overwritten by replaying the WAL from node 1.
+$node_3->safe_psql('postgres', "INSERT INTO public.bar (t) VALUES ('rewind this')");
+
+# Wait for node 2 to catch up
+$node_2->poll_query_until('postgres',
+   q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
+
+# At this point node_2 will shut down without a shutdown checkpoint,
+# but with WAL entries beyond the preceding shutdown checkpoint.
+$node_2->stop('fast');
+$node_3->stop('fast');
+
+my $node_2_pgdata = $node_2->data_dir;
+my $node_1_connstr = $node_1->connstr;
+
+# Keep a temporary postgresql.conf or it would be overwritten during the rewind.
+copy(
+   "$node_2_pgdata/postgresql.conf",
+   "$tmp_folder/node_2-postgresql.conf.tmp");
+
+command_ok(
+    [
+        'pg_rewind',
+        "--source-server=$node_1_connstr",
+        "--target-pgdata=$node_2_pgdata"
+    ],
+   'pg_rewind detects rewind needed');
+
+# Now move back postgresql.conf with old settings
+move(
+   "$tmp_folder/node_2-postgresql.conf.tmp",
+   "$node_2_pgdata/postgresql.conf");
+
+$node_2->start;
+
+# Check contents of the test tables after rewind. The rows inserted in node 3
+# before rewind should've been overwritten with the data from node 1.
+my $result;
+$result = $node_2->safe_psql('postgres', 'checkpoint');
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.foo');
+is($result, qq(keep this
+and this
+and this too), 'table foo after rewind');
+
+$result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
+is($result, qq(in both), 'table bar after rewind');
author	Heikki Linnakangas <[email protected]>
	Thu, 3 Dec 2020 13:57:48 +0000 (15:57 +0200)
committer	Heikki Linnakangas <[email protected]>
	Thu, 3 Dec 2020 13:58:06 +0000 (15:58 +0200)
src/bin/pg_rewind/parsexlog.c		patch \| blob \| blame \| history
src/bin/pg_rewind/pg_rewind.c		patch \| blob \| blame \| history
src/bin/pg_rewind/t/008_min_recovery_point.pl	[new file with mode: 0644]	patch \| blob