From: Robert Haas <rhaas@postgresql.org>
Date: Mon, 18 Apr 2022 19:40:12 +0000 (-0400)
Subject: Remove the end-of-recovery checkpoint in all cases.
X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=59b804658ff9fc97964a9296b9612d5039d5a3f3;p=users%2Frhaas%2Fpostgres.git

Remove the end-of-recovery checkpoint in all cases.

For many years, we've been issuing a special end-of-recovery WAL
record rather than performing an end-of-recovery checkpoint when a
standby is promoted.  This has the advantage of speeding up promotion.
However, it's advantageous to do this at the end of crash recovery as
well, and for the same reasons.  Plus, it's actually simpler this way,
because having only one way to do something rather than two means we
need less code.

We now always request a checkpoint at the end of recovery. If we
crash again before that checkpoint completes, recovery will begin
from the previous checkpoint that we performed - or replayed. This
behavior is not new as far as archive recovery is concerned, but now
it can also happen for crash recovery.

At initdb time, we now initialize the nextXid counter to 4 rather
than to 3. The code for logging standby shapshots and handling them
on th standby can't handle 3, because we set latestCompleteXid =
nextXid - 1, and 2 is not a normal XID. This is arguably a preexisting
bug, but because we didn't log a standby snapshot when starting from
an end-of-recovery checkpoint, we happened not to ever hit the
problematic case.
---

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 5eabd32cf6..420e93bed3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -665,7 +665,6 @@ static void UpdateLastRemovedPtr(char *filename);
 static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
-static bool PerformRecoveryXLogAction(void);
 static void InitControlFile(uint64 sysidentifier);
 static void WriteControlFile(void);
 static void ReadControlFile(void);
@@ -2516,8 +2515,7 @@ XLogFlush(XLogRecPtr record)
 	 * During REDO, we are reading not writing WAL.  Therefore, instead of
 	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
 	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
-	 * to act this way too, and because when it tries to write the
-	 * end-of-recovery checkpoint, it should indeed flush.
+	 * to act this way too.
 	 */
 	if (!XLogInsertAllowed())
 	{
@@ -2654,10 +2652,7 @@ XLogFlush(XLogRecPtr record)
 	 * the bad page is encountered again during recovery then we would be
 	 * unable to restart the database at all!  (This scenario actually
 	 * happened in the field several times with 7.1 releases.)	As of 8.4, bad
-	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
-	 * the only time we can reach here during recovery is while flushing the
-	 * end-of-recovery checkpoint record, and we don't expect that to have a
-	 * bad LSN.
+	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem.
 	 *
 	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
 	 * since xact.c calls this routine inside a critical section.  However,
@@ -4495,6 +4490,7 @@ BootStrapXLOG(void)
 	uint64		sysidentifier;
 	struct timeval tv;
 	pg_crc32c	crc;
+	FullTransactionId	starting_fxid;
 
 	/* allow ordinary WAL segment creation, like StartupXLOG() would */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
@@ -4523,6 +4519,17 @@ BootStrapXLOG(void)
 	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
 	memset(page, 0, XLOG_BLCKSZ);
 
+	/*
+	 * We set nextXid to FirstNormalTransactionId + 1, because when restarting
+	 * we'll set latestCompletedXid to nextXid - 1, and we want that to be a
+	 * normal XID in all cases. If it isn't, a number of functions, including
+	 * GetRunningTransactionData() and ProcArrayApplyRecoveryInfo(), will
+	 * fail assertions.
+	 */
+	starting_fxid =
+		FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
+	FullTransactionIdAdvance(&starting_fxid);
+
 	/*
 	 * Set up information for the initial checkpoint record
 	 *
@@ -4534,8 +4541,7 @@ BootStrapXLOG(void)
 	checkPoint.ThisTimeLineID = BootstrapTimeLineID;
 	checkPoint.PrevTimeLineID = BootstrapTimeLineID;
 	checkPoint.fullPageWrites = fullPageWrites;
-	checkPoint.nextXid =
-		FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
+	checkPoint.nextXid = starting_fxid;
 	checkPoint.nextOid = FirstGenbkiObjectId;
 	checkPoint.nextMulti = FirstMultiXactId;
 	checkPoint.nextMultiOffset = 0;
@@ -4890,7 +4896,6 @@ StartupXLOG(void)
 	XLogRecPtr	abortedRecPtr;
 	XLogRecPtr	missingContrecPtr;
 	TransactionId oldestActiveXID;
-	bool		promoted = false;
 
 	/*
 	 * We should have an aux process resource owner to use, and we should not
@@ -5546,10 +5551,10 @@ StartupXLOG(void)
 	UpdateFullPageWrites();
 
 	/*
-	 * Emit checkpoint or end-of-recovery record in XLOG, if required.
+	 * Emit end-of-recovery record in XLOG, if required.
 	 */
 	if (performedWalRecovery)
-		promoted = PerformRecoveryXLogAction();
+		CreateEndOfRecoveryRecord();
 
 	/*
 	 * If any of the critical GUCs have changed, log them before we allow
@@ -5611,12 +5616,12 @@ StartupXLOG(void)
 	WalSndWakeup();
 
 	/*
-	 * If this was a promotion, request an (online) checkpoint now. This isn't
-	 * required for consistency, but the last restartpoint might be far back,
-	 * and in case of a crash, recovering from it might take a longer than is
-	 * appropriate now that we're not in standby mode anymore.
+	 * If we performed recovery, request an (online) checkpoint now. This
+	 * isn't required for consistency, but the last restartpoint might be far
+	 * back, and in case of a crash, recovering from it might take a longer
+	 * than is appropriate now that we're not in standby mode anymore.
 	 */
-	if (promoted)
+	if (performedWalRecovery)
 		RequestCheckpoint(CHECKPOINT_FORCE);
 }
 
@@ -5689,60 +5694,6 @@ ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
 	LWLockRelease(ControlFileLock);
 }
 
-/*
- * Perform whatever XLOG actions are necessary at end of REDO.
- *
- * The goal here is to make sure that we'll be able to recover properly if
- * we crash again. If we choose to write a checkpoint, we'll write a shutdown
- * checkpoint rather than an on-line one. This is not particularly critical,
- * but since we may be assigning a new TLI, using a shutdown checkpoint allows
- * us to have the rule that TLI only changes in shutdown checkpoints, which
- * allows some extra error checking in xlog_redo.
- */
-static bool
-PerformRecoveryXLogAction(void)
-{
-	bool		promoted = false;
-
-	/*
-	 * Perform a checkpoint to update all our recovery activity to disk.
-	 *
-	 * Note that we write a shutdown checkpoint rather than an on-line one.
-	 * This is not particularly critical, but since we may be assigning a new
-	 * TLI, using a shutdown checkpoint allows us to have the rule that TLI
-	 * only changes in shutdown checkpoints, which allows some extra error
-	 * checking in xlog_redo.
-	 *
-	 * In promotion, only create a lightweight end-of-recovery record instead
-	 * of a full checkpoint. A checkpoint is requested later, after we're
-	 * fully out of recovery mode and already accepting queries.
-	 */
-	if (ArchiveRecoveryRequested && IsUnderPostmaster &&
-		PromoteIsTriggered())
-	{
-		promoted = true;
-
-		/*
-		 * Insert a special WAL record to mark the end of recovery, since we
-		 * aren't doing a checkpoint. That means that the checkpointer process
-		 * may likely be in the middle of a time-smoothed restartpoint and
-		 * could continue to be for minutes after this.  That sounds strange,
-		 * but the effect is roughly the same and it would be stranger to try
-		 * to come out of the restartpoint and then checkpoint. We request a
-		 * checkpoint later anyway, just for safety.
-		 */
-		CreateEndOfRecoveryRecord();
-	}
-	else
-	{
-		RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-						  CHECKPOINT_IMMEDIATE |
-						  CHECKPOINT_WAIT);
-	}
-
-	return promoted;
-}
-
 /*
  * Is the system still in recovery?
  *
@@ -6053,9 +6004,8 @@ LogCheckpointStart(int flags, bool restartpoint)
 	if (restartpoint)
 		ereport(LOG,
 		/* translator: the placeholders show checkpoint options */
-				(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
+				(errmsg("restartpoint starting:%s%s%s%s%s%s%s",
 						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
 						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
 						(flags & CHECKPOINT_FORCE) ? " force" : "",
 						(flags & CHECKPOINT_WAIT) ? " wait" : "",
@@ -6065,9 +6015,8 @@ LogCheckpointStart(int flags, bool restartpoint)
 	else
 		ereport(LOG,
 		/* translator: the placeholders show checkpoint options */
-				(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
+				(errmsg("checkpoint starting:%s%s%s%s%s%s%s",
 						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
 						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
 						(flags & CHECKPOINT_FORCE) ? " force" : "",
 						(flags & CHECKPOINT_WAIT) ? " wait" : "",
@@ -6219,7 +6168,7 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
 	 * pg_stat_activity to see the status of the checkpointer or the startup
 	 * process.
 	 */
-	if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
+	if ((flags & CHECKPOINT_IS_SHUTDOWN) == 0)
 		return;
 
 	if (reset)
@@ -6228,8 +6177,7 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
 	{
 		char		activitymsg[128];
 
-		snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
-				 (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
+		snprintf(activitymsg, sizeof(activitymsg), "performing %s%s",
 				 (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
 				 restartpoint ? "restartpoint" : "checkpoint");
 		set_ps_display(activitymsg);
@@ -6242,12 +6190,10 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
- *		CHECKPOINT_END_OF_RECOVERY).
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
  *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
  *
  * Note: flags contains other bits, of interest here only for logging purposes.
@@ -6269,7 +6215,7 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset)
 void
 CreateCheckPoint(int flags)
 {
-	bool		shutdown;
+	bool		shutdown = (flags & CHECKPOINT_IS_SHUTDOWN) != 0;
 	CheckPoint	checkPoint;
 	XLogRecPtr	recptr;
 	XLogSegNo	_logSegNo;
@@ -6280,19 +6226,9 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	last_important_lsn;
 	VirtualTransactionId *vxids;
 	int			nvxids;
-	int			oldXLogAllowed = 0;
-
-	/*
-	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
-	 * issued at a different time.
-	 */
-	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
-		shutdown = true;
-	else
-		shutdown = false;
 
 	/* sanity check */
-	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
+	if (RecoveryInProgress())
 		elog(ERROR, "can't create a checkpoint during recovery");
 
 	/*
@@ -6358,8 +6294,7 @@ CreateCheckPoint(int flags)
 	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
 	 * avoid inserting duplicate checkpoints when the system is idle.
 	 */
-	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
-				  CHECKPOINT_FORCE)) == 0)
+	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
 	{
 		if (last_important_lsn == ControlFile->checkPoint)
 		{
@@ -6371,19 +6306,8 @@ CreateCheckPoint(int flags)
 		}
 	}
 
-	/*
-	 * An end-of-recovery checkpoint is created before anyone is allowed to
-	 * write WAL. To allow us to write the checkpoint record, temporarily
-	 * enable XLogInsertAllowed.
-	 */
-	if (flags & CHECKPOINT_END_OF_RECOVERY)
-		oldXLogAllowed = LocalSetXLogInsertAllowed();
-
 	checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
-	if (flags & CHECKPOINT_END_OF_RECOVERY)
-		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
-	else
-		checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
+	checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
 
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
@@ -6540,8 +6464,7 @@ CreateCheckPoint(int flags)
 	 * allows us to reconstruct the state of running transactions during
 	 * archive recovery, if required. Skip, if this info disabled.
 	 *
-	 * If we are shutting down, or Startup process is completing crash
-	 * recovery we don't need to write running xact data.
+	 * If we are shutting down, we don't need to write running xact data.
 	 */
 	if (!shutdown && XLogStandbyInfoActive())
 		LogStandbySnapshot();
@@ -6562,17 +6485,10 @@ CreateCheckPoint(int flags)
 	/*
 	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
 	 * overwritten at next startup.  No-one should even try, this just allows
-	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
-	 * to just temporarily disable writing until the system has exited
-	 * recovery.
+	 * sanity-checking.
 	 */
 	if (shutdown)
-	{
-		if (flags & CHECKPOINT_END_OF_RECOVERY)
-			LocalXLogInsertAllowed = oldXLogAllowed;
-		else
-			LocalXLogInsertAllowed = 0; /* never again write WAL */
-	}
+		LocalXLogInsertAllowed = 0; /* never again write WAL */
 
 	/*
 	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
@@ -7017,7 +6933,7 @@ CreateRestartPoint(int flags)
 	 * Update pg_control, using current time.  Check that it still shows
 	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
 	 * this is a quick hack to make sure nothing really bad happens if somehow
-	 * we get here after the end-of-recovery checkpoint.
+	 * we get here after end of recovery.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index c937c39f50..624c728d95 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -411,13 +411,6 @@ CheckpointerMain(void)
 
 			ConditionVariableBroadcast(&CheckpointerShmem->start_cv);
 
-			/*
-			 * The end-of-recovery checkpoint is a real checkpoint that's
-			 * performed while we're still in recovery.
-			 */
-			if (flags & CHECKPOINT_END_OF_RECOVERY)
-				do_restartpoint = false;
-
 			/*
 			 * We will warn if (a) too soon since last checkpoint (whatever
 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
@@ -916,12 +909,10 @@ CheckpointerShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
- *		CHECKPOINT_END_OF_RECOVERY).
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN).
  *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
  *		just signal checkpointer to do it, and return).
  *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 6303647fe0..de6849d083 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -349,10 +349,10 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 				 * Abort all transactions that we keep track of, that are
 				 * older than the record's oldestRunningXid. This is the most
 				 * convenient spot for doing so since, in contrast to shutdown
-				 * or end-of-recovery checkpoints, we have information about
-				 * all running transactions which includes prepared ones,
-				 * while shutdown checkpoints just know that no non-prepared
-				 * transactions are in progress.
+				 * recovery checkpoints, we have information about all running
+				 * transactions which includes prepared ones, while shutdown
+				 * checkpoints just know that no non-prepared transactions are
+				 * in progress.
 				 */
 				ReorderBufferAbortOld(ctx->reorder, running->oldestRunningXid);
 			}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e02ea3a977..9ac42a9fd4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1933,10 +1933,9 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
  *
  * This is called at checkpoint time to write out all dirty shared buffers.
  * The checkpoint request flags should be passed in.  If CHECKPOINT_IMMEDIATE
- * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
- * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
- * unlogged buffers, which are otherwise skipped.  The remaining flags
- * currently have no effect here.
+ * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN
+ * or CHECKPOINT_FLUSH_ALL is set, we write even unlogged buffers, which are
+ * otherwise skipped.  The remaining flags currently have no effect here.
  */
 static void
 BufferSync(int flags)
@@ -1962,8 +1961,7 @@ BufferSync(int flags)
 	 * we write only permanent, dirty buffers.  But at shutdown or end of
 	 * recovery, we write all dirty buffers.
 	 */
-	if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
-					CHECKPOINT_FLUSH_ALL))))
+	if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FLUSH_ALL))))
 		mask |= BM_PERMANENT;
 
 	/*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index d9f2487a96..04091d8576 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -132,8 +132,7 @@ extern PGDLLIMPORT bool XLOG_DEBUG;
 
 /* These directly affect the behavior of CreateCheckPoint and subsidiaries */
 #define CHECKPOINT_IS_SHUTDOWN	0x0001	/* Checkpoint is for shutdown */
-#define CHECKPOINT_END_OF_RECOVERY	0x0002	/* Like shutdown checkpoint, but
-											 * issued at end of WAL recovery */
+/* 0x0002 was CHECKPOINT_END_OF_RECOVERY, now unused */
 #define CHECKPOINT_IMMEDIATE	0x0004	/* Do it without delays */
 #define CHECKPOINT_FORCE		0x0008	/* Force even if no activity */
 #define CHECKPOINT_FLUSH_ALL	0x0010	/* Flush all pages, including those
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index fae0bef8f5..b71f1eed90 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -258,7 +258,7 @@ typedef struct xl_overwrite_contrecord
 	TimestampTz overwrite_time;
 } xl_overwrite_contrecord;
 
-/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
+/* End of recovery mark, and possible TLI change */
 typedef struct xl_end_of_recovery
 {
 	TimestampTz end_time;