* xlog.c
* PostgreSQL write-ahead log manager
*
+ * The Write-Ahead Log (WAL) functionality is split into several source
+ * files, in addition to this one:
+ *
+ * xloginsert.c - Functions for constructing WAL records
+ * xlogrecovery.c - WAL recovery and standby code
+ * xlogreader.c - Facility for reading WAL files and parsing WAL records
+ * xlogutils.c - Helper functions for WAL redo routines
+ *
+ * This file contains functions for coordinating database startup and
+ * checkpointing, and managing the write-ahead log buffers when the
+ * system is running.
+ *
+ * StartupXLOG() is the main entry point of the startup process. It
+ * coordinates database startup, performing WAL recovery, and the
+ * transition from WAL recovery into normal operations.
+ *
+ * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
+ * callers should not call this directly, but use the functions in
+ * xloginsert.c to construct the WAL record. XLogFlush() can be used
+ * to force the WAL to disk.
+ *
+ * In addition to those, there are many other functions for interrogating
+ * the current system state, and for starting/stopping backups.
+ *
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
#include "access/xlogarchive.h"
#include "access/xloginsert.h"
#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
#include "access/xlogutils.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
-#include "commands/progress.h"
-#include "commands/tablespace.h"
#include "common/controldata_utils.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "storage/sync.h"
-#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
extern uint32 bootstrap_data_checksum_version;
-/* Unsupported old recovery command file names (relative to $PGDATA) */
-#define RECOVERY_COMMAND_FILE "recovery.conf"
-#define RECOVERY_COMMAND_DONE "recovery.done"
-
/* timeline ID to be used when bootstrapping */
#define BootstrapTimeLineID 1
{NULL, 0, false}
};
-const struct config_enum_entry recovery_target_action_options[] = {
- {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
- {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
- {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
- {NULL, 0, false}
-};
-
/*
* Statistics for current checkpoint are collected in this global struct.
* Because only the checkpointer or a stand-alone backend can perform
*/
CheckpointStatsData CheckpointStats;
-/* Local copy of WalRcv->flushedUpto */
-static XLogRecPtr flushedUpto = 0;
-static TimeLineID receiveTLI = 0;
-
-/*
- * abortedRecPtr is the start pointer of a broken record at end of WAL when
- * recovery completes; missingContrecPtr is the location of the first
- * contrecord that went missing. See CreateOverwriteContrecordRecord for
- * details.
- */
-static XLogRecPtr abortedRecPtr;
-static XLogRecPtr missingContrecPtr;
-
/*
* During recovery, lastFullPageWrites keeps track of full_page_writes that
* the replayed WAL records indicate. It's initialized with full_page_writes
*/
static bool LocalRecoveryInProgress = true;
-/*
- * Local copy of SharedHotStandbyActive variable. False actually means "not
- * known, need to check the shared state".
- */
-static bool LocalHotStandbyActive = false;
-
-/*
- * Local copy of SharedPromoteIsTriggered variable. False actually means "not
- * known, need to check the shared state".
- */
-static bool LocalPromoteIsTriggered = false;
-
/*
* Local state for XLogInsertAllowed():
* 1: unconditionally allowed to insert XLOG
*/
static int LocalXLogInsertAllowed = -1;
-/*
- * When ArchiveRecoveryRequested is set, archive recovery was requested,
- * ie. signal files were present. When InArchiveRecovery is set, we are
- * currently recovering using offline XLOG archives. These variables are only
- * valid in the startup process.
- *
- * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
- * currently performing crash recovery using only XLOG files in pg_wal, but
- * will switch to using offline XLOG archives as soon as we reach the end of
- * WAL in pg_wal.
-*/
-bool ArchiveRecoveryRequested = false;
-bool InArchiveRecovery = false;
-
-static bool standby_signal_file_found = false;
-static bool recovery_signal_file_found = false;
-
-/* Buffers dedicated to consistency checks of size BLCKSZ */
-static char *replay_image_masked = NULL;
-static char *primary_image_masked = NULL;
-
-/* options formerly taken from recovery.conf for archive recovery */
-char *recoveryRestoreCommand = NULL;
-char *recoveryEndCommand = NULL;
-char *archiveCleanupCommand = NULL;
-RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
-bool recoveryTargetInclusive = true;
-int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
-TransactionId recoveryTargetXid;
-char *recovery_target_time_string;
-static TimestampTz recoveryTargetTime;
-const char *recoveryTargetName;
-XLogRecPtr recoveryTargetLSN;
-int recovery_min_apply_delay = 0;
-
-/* options formerly taken from recovery.conf for XLOG streaming */
-bool StandbyModeRequested = false;
-char *PrimaryConnInfo = NULL;
-char *PrimarySlotName = NULL;
-char *PromoteTriggerFile = NULL;
-bool wal_receiver_create_temp_slot = false;
-
-/* are we currently in standby mode? */
-bool StandbyMode = false;
-
-/*
- * if recoveryStopsBefore/After returns true, it saves information of the stop
- * point here
- */
-static TransactionId recoveryStopXid;
-static TimestampTz recoveryStopTime;
-static XLogRecPtr recoveryStopLSN;
-static char recoveryStopName[MAXFNAMELEN];
-static bool recoveryStopAfter;
-
-/*
- * recoveryTargetTimeLineGoal: what the user requested, if any
- *
- * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
- *
- * recoveryTargetTLI: the currently understood target timeline; changes
- *
- * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
- * its known parents, newest first (so recoveryTargetTLI is always the
- * first list member). Only these TLIs are expected to be seen in the WAL
- * segments we read, and indeed only these TLIs will be considered as
- * candidate WAL files to open at all.
- *
- * curFileTLI: the TLI appearing in the name of the current input WAL file.
- * (This is not necessarily the same as the timeline from which we are
- * replaying WAL, which StartupXLOG calls replayTLI, because we could be
- * scanning data that was copied from an ancestor timeline when the current
- * file was created.) During a sequential scan we do not allow this value
- * to decrease.
- */
-RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
-TimeLineID recoveryTargetTLIRequested = 0;
-TimeLineID recoveryTargetTLI = 0;
-static List *expectedTLEs;
-static TimeLineID curFileTLI;
-
/*
* ProcLastRecPtr points to the start of the last XLOG record inserted by the
* current backend. It is updated for all inserts. XactLastRecEnd points to
*/
static bool doPageWrites;
-/* Has the recovery code requested a walreceiver wakeup? */
-static bool doRequestWalReceiverReply;
-
-/*
- * RedoStartLSN points to the checkpoint's REDO location which is specified
- * in a backup label file, backup history file or control file. In standby
- * mode, XLOG streaming usually starts from the position where an invalid
- * record was found. But if we fail to read even the initial checkpoint
- * record, we use the REDO location instead of the checkpoint location as
- * the start position of XLOG streaming. Otherwise we would have to jump
- * backwards to the REDO location after reading the checkpoint record,
- * because the REDO record can precede the checkpoint record.
- */
-static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
-
/*----------
* Shared-memory data structures for XLOG control
*
*/
RecoveryState SharedRecoveryState;
- /*
- * SharedHotStandbyActive indicates if we allow hot standby queries to be
- * run. Protected by info_lck.
- */
- bool SharedHotStandbyActive;
-
/*
* InstallXLogFileSegmentActive indicates whether the checkpointer should
* arrange for future segments by recycling and/or PreallocXlogFiles().
*/
bool InstallXLogFileSegmentActive;
- /*
- * SharedPromoteIsTriggered indicates if a standby promotion has been
- * triggered. Protected by info_lck.
- */
- bool SharedPromoteIsTriggered;
-
/*
* WalWriterSleeping indicates whether the WAL writer is currently in
* low-power mode (and hence should be nudged if an async commit occurs).
*/
bool WalWriterSleeping;
- /*
- * recoveryWakeupLatch is used to wake up the startup process to continue
- * WAL replay, if it is waiting for WAL to arrive or failover trigger file
- * to appear.
- *
- * Note that the startup process also uses another latch, its procLatch,
- * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
- * signaling the startup process in favor of using its procLatch, which
- * comports better with possible generic signal handlers using that latch.
- * But we should not do that because the startup process doesn't assume
- * that it's waken up by walreceiver process or SIGHUP signal handler
- * while it's waiting for recovery conflict. The separate latches,
- * recoveryWakeupLatch and procLatch, should be used for inter-process
- * communication for WAL replay and recovery conflict, respectively.
- */
- Latch recoveryWakeupLatch;
-
/*
* During recovery, we keep a copy of the latest checkpoint record here.
* lastCheckPointRecPtr points to start of checkpoint record and
XLogRecPtr lastCheckPointEndPtr;
CheckPoint lastCheckPoint;
- /*
- * lastReplayedEndRecPtr points to end+1 of the last record successfully
- * replayed. When we're currently replaying a record, ie. in a redo
- * function, replayEndRecPtr points to the end+1 of the record being
- * replayed, otherwise it's equal to lastReplayedEndRecPtr.
- */
- XLogRecPtr lastReplayedEndRecPtr;
- TimeLineID lastReplayedTLI;
- XLogRecPtr replayEndRecPtr;
- TimeLineID replayEndTLI;
- /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
- TimestampTz recoveryLastXTime;
-
- /*
- * timestamp of when we started replaying the current chunk of WAL data,
- * only relevant for replication or archive recovery
- */
- TimestampTz currentChunkStartTime;
- /* Recovery pause state */
- RecoveryPauseState recoveryPauseState;
- ConditionVariable recoveryNotPausedCV;
-
/*
* lastFpwDisableRecPtr points to the start of the last replayed
* XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
*/
static XLogwrtResult LogwrtResult = {0, 0};
-/*
- * Codes indicating where we got a WAL file from during recovery, or where
- * to attempt to get one.
- */
-typedef enum
-{
- XLOG_FROM_ANY = 0, /* request to read WAL from any source */
- XLOG_FROM_ARCHIVE, /* restored using restore_command */
- XLOG_FROM_PG_WAL, /* existing file in pg_wal */
- XLOG_FROM_STREAM /* streamed from primary */
-} XLogSource;
-
-/* human-readable names for XLogSources, for debugging output */
-static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
-
/*
* openLogFile is -1 or a kernel FD for an open log file segment.
* openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
static XLogSegNo openLogSegNo = 0;
static TimeLineID openLogTLI = 0;
-/*
- * These variables are used similarly to the ones above, but for reading
- * the XLOG. readOff is the offset of the page just read, readLen
- * indicates how much of it has been read into readBuf, and readSource
- * indicates where we got the currently open file from.
- * Note: we could use Reserve/ReleaseExternalFD to track consumption of
- * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
- * not read by general-purpose sessions.
- */
-static int readFile = -1;
-static XLogSegNo readSegNo = 0;
-static uint32 readOff = 0;
-static uint32 readLen = 0;
-static XLogSource readSource = XLOG_FROM_ANY;
-
-/*
- * Keeps track of which source we're currently reading from. This is
- * different from readSource in that this is always set, even when we don't
- * currently have a WAL file open. If lastSourceFailed is set, our last
- * attempt to read from currentSource failed, and we should try another source
- * next.
- *
- * pendingWalRcvRestart is set when a config change occurs that requires a
- * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
- */
-static XLogSource currentSource = XLOG_FROM_ANY;
-static bool lastSourceFailed = false;
-static bool pendingWalRcvRestart = false;
-
-typedef struct XLogPageReadPrivate
-{
- int emode;
- bool fetching_ckpt; /* are we fetching a checkpoint record? */
- bool randAccess;
- TimeLineID replayTLI;
-} XLogPageReadPrivate;
-
-/*
- * These variables track when we last obtained some WAL data to process,
- * and where we got it from. (XLogReceiptSource is initially the same as
- * readSource, but readSource gets reset to zero when we don't have data
- * to process right now. It is also different from currentSource, which
- * also changes when we try to read from a source and fail, while
- * XLogReceiptSource tracks where we last successfully read some WAL.)
- */
-static TimestampTz XLogReceiptTime = 0;
-static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
-
/*
* Local copies of equivalent fields in the control file. When running
- * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
+ * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
* expect to replay all the WAL available, and updateMinRecoveryPoint is
* switched to false to prevent any updates while replaying records.
* Those values are kept consistent as long as crash recovery runs.
*/
-static XLogRecPtr minRecoveryPoint;
-static TimeLineID minRecoveryPointTLI;
+static XLogRecPtr LocalMinRecoveryPoint;
+static TimeLineID LocalMinRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;
-/*
- * Have we reached a consistent database state? In crash recovery, we have
- * to replay all the WAL, so reachedConsistency is never set. During archive
- * recovery, the database is consistent once minRecoveryPoint is reached.
- */
-bool reachedConsistency = false;
-
-static bool InRedo = false;
-
/* For WALInsertLockAcquire/Release functions */
static int MyLockNo = 0;
static bool holdingAllLocks = false;
static MemoryContext walDebugCxt = NULL;
#endif
-static void readRecoverySignalFile(void);
-static void validateRecoveryParameters(void);
-static void XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog,
- TimeLineID newTLI);
static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
XLogRecPtr EndOfLog,
TimeLineID newTLI);
-static bool recoveryStopsBefore(XLogReaderState *record);
-static bool recoveryStopsAfter(XLogReaderState *record);
-static char *getRecoveryStopReason(void);
-static void ConfirmRecoveryPaused(void);
-static void recoveryPausesHere(bool endOfRecovery);
-static bool recoveryApplyDelay(XLogReaderState *record);
-static void SetLatestXTime(TimestampTz xtime);
-static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
-static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
- TimeLineID prevTLI, TimeLineID replayTLI);
-static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
- XLogReaderState *state);
static int LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
bool opportunistic);
-static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
bool find_free, XLogSegNo max_segno,
TimeLineID tli);
-static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
- XLogSource source, bool notfoundOk);
-static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
-static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
- int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
-static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
- bool fetching_ckpt, XLogRecPtr tliRecPtr,
- TimeLineID replayTLI,
- XLogRecPtr replayLSN);
-static void XLogShutdownWalRcv(void);
-static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
static void RemoveTempXlogFiles(void);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
-static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
- int emode, bool fetching_ckpt,
- TimeLineID replayTLI);
-static void CheckRecoveryConsistency(void);
static bool PerformRecoveryXLogAction(void);
-static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
- XLogRecPtr RecPtr, int whichChkpt, bool report,
- TimeLineID replayTLI);
-static bool rescanLatestTimeLine(TimeLineID replayTLI,
- XLogRecPtr replayLSN);
static void InitControlFile(uint64 sysidentifier);
static void WriteControlFile(void);
static void ReadControlFile(void);
+static void UpdateControlFile(void);
static char *str_time(pg_time_t tnow);
-static void SetPromoteIsTriggered(void);
-static bool CheckForStandbyTrigger(void);
-#ifdef WAL_DEBUG
-static void xlog_outrec(StringInfo buf, XLogReaderState *record);
-#endif
-static void xlog_block_info(StringInfo buf, XLogReaderState *record);
-static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
-static bool read_backup_label(XLogRecPtr *checkPointLoc,
- TimeLineID *backupLabelTLI,
- bool *backupEndRequired, bool *backupFromStandby);
-static bool read_tablespace_map(List **tablespaces);
-static void rm_redo_error_callback(void *arg);
static int get_sync_bit(int method);
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
-static void checkXLogConsistency(XLogReaderState *record);
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
return true;
}
-/*
- * Checks whether the current buffer page and backup page stored in the
- * WAL record are consistent or not. Before comparing the two pages, a
- * masking can be applied to the pages to ignore certain areas like hint bits,
- * unused space between pd_lower and pd_upper among other things. This
- * function should be called once WAL replay has been completed for a
- * given record.
- */
-static void
-checkXLogConsistency(XLogReaderState *record)
-{
- RmgrId rmid = XLogRecGetRmid(record);
- RelFileNode rnode;
- ForkNumber forknum;
- BlockNumber blkno;
- int block_id;
-
- /* Records with no backup blocks have no need for consistency checks. */
- if (!XLogRecHasAnyBlockRefs(record))
- return;
-
- Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
-
- for (block_id = 0; block_id <= record->max_block_id; block_id++)
- {
- Buffer buf;
- Page page;
-
- if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
- {
- /*
- * WAL record doesn't contain a block reference with the given id.
- * Do nothing.
- */
- continue;
- }
-
- Assert(XLogRecHasBlockImage(record, block_id));
-
- if (XLogRecBlockImageApply(record, block_id))
- {
- /*
- * WAL record has already applied the page, so bypass the
- * consistency check as that would result in comparing the full
- * page stored in the record with itself.
- */
- continue;
- }
-
- /*
- * Read the contents from the current buffer and store it in a
- * temporary page.
- */
- buf = XLogReadBufferExtended(rnode, forknum, blkno,
- RBM_NORMAL_NO_LOG);
- if (!BufferIsValid(buf))
- continue;
-
- LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- page = BufferGetPage(buf);
-
- /*
- * Take a copy of the local page where WAL has been applied to have a
- * comparison base before masking it...
- */
- memcpy(replay_image_masked, page, BLCKSZ);
-
- /* No need for this page anymore now that a copy is in. */
- UnlockReleaseBuffer(buf);
-
- /*
- * If the block LSN is already ahead of this WAL record, we can't
- * expect contents to match. This can happen if recovery is
- * restarted.
- */
- if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
- continue;
-
- /*
- * Read the contents from the backup copy, stored in WAL record and
- * store it in a temporary page. There is no need to allocate a new
- * page here, a local buffer is fine to hold its contents and a mask
- * can be directly applied on it.
- */
- if (!RestoreBlockImage(record, block_id, primary_image_masked))
- elog(ERROR, "failed to restore block image");
-
- /*
- * If masking function is defined, mask both the primary and replay
- * images
- */
- if (RmgrTable[rmid].rm_mask != NULL)
- {
- RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
- RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
- }
-
- /* Time to compare the primary and replay images. */
- if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
- {
- elog(FATAL,
- "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
- rnode.spcNode, rnode.dbNode, rnode.relNode,
- forknum, blkno);
- }
- }
-}
-
/*
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
* area in the WAL.
*
* Note: it is caller's responsibility that RedoRecPtr is up-to-date.
*/
-static bool
+bool
XLogCheckpointNeeded(XLogSegNo new_segno)
{
XLogSegNo old_segno;
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
/* Quick check using our local copy of the variable */
- if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
+ if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
return;
/*
* available is replayed in this case. This also saves from extra locks
* taken on the control file from the startup process.
*/
- if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
{
updateMinRecoveryPoint = false;
return;
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
/* update local copy */
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
- if (XLogRecPtrIsInvalid(minRecoveryPoint))
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
updateMinRecoveryPoint = false;
- else if (force || minRecoveryPoint < lsn)
+ else if (force || LocalMinRecoveryPoint < lsn)
{
XLogRecPtr newMinRecoveryPoint;
TimeLineID newMinRecoveryPointTLI;
* all. Instead, we just log a warning and continue with recovery.
* (See also the comments about corrupt LSNs in XLogFlush.)
*/
- SpinLockAcquire(&XLogCtl->info_lck);
- newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
- newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
- SpinLockRelease(&XLogCtl->info_lck);
-
+ newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
if (!force && newMinRecoveryPoint < lsn)
elog(WARNING,
"xlog min recovery request %X/%X is past current point %X/%X",
ControlFile->minRecoveryPoint = newMinRecoveryPoint;
ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
UpdateControlFile();
- minRecoveryPoint = newMinRecoveryPoint;
- minRecoveryPointTLI = newMinRecoveryPointTLI;
+ LocalMinRecoveryPoint = newMinRecoveryPoint;
+ LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
ereport(DEBUG2,
(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
- LSN_FORMAT_ARGS(minRecoveryPoint),
+ LSN_FORMAT_ARGS(newMinRecoveryPoint),
newMinRecoveryPointTLI)));
}
}
* which cannot update its local copy of minRecoveryPoint as long as
* it has not replayed all WAL available when doing crash recovery.
*/
- if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
updateMinRecoveryPoint = false;
/* Quick exit if already known to be updated or cannot be updated */
- if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
+ if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
return false;
/*
*/
if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
return true;
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
LWLockRelease(ControlFileLock);
/*
* process doing crash recovery, which should not update the control
* file value if crash recovery is still running.
*/
- if (XLogRecPtrIsInvalid(minRecoveryPoint))
+ if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
updateMinRecoveryPoint = false;
/* check again */
- if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
+ if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
return false;
else
return true;
return fd;
}
-/*
- * Open a logfile segment for reading (during recovery).
- *
- * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
- * Otherwise, it's assumed to be already available in pg_wal.
- */
-static int
-XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
- XLogSource source, bool notfoundOk)
-{
- char xlogfname[MAXFNAMELEN];
- char activitymsg[MAXFNAMELEN + 16];
- char path[MAXPGPATH];
- int fd;
-
- XLogFileName(xlogfname, tli, segno, wal_segment_size);
-
- switch (source)
- {
- case XLOG_FROM_ARCHIVE:
- /* Report recovery progress in PS display */
- snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
- xlogfname);
- set_ps_display(activitymsg);
-
- if (!RestoreArchivedFile(path, xlogfname,
- "RECOVERYXLOG",
- wal_segment_size,
- InRedo))
- return -1;
- break;
-
- case XLOG_FROM_PG_WAL:
- case XLOG_FROM_STREAM:
- XLogFilePath(path, tli, segno, wal_segment_size);
- break;
-
- default:
- elog(ERROR, "invalid XLogFileRead source %d", source);
- }
-
- /*
- * If the segment was fetched from archival storage, replace the existing
- * xlog segment (if any) with the archival version.
- */
- if (source == XLOG_FROM_ARCHIVE)
- {
- Assert(!XLogCtl->InstallXLogFileSegmentActive);
- KeepFileRestoredFromArchive(path, xlogfname);
-
- /*
- * Set path to point at the new file in pg_wal.
- */
- snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
- }
-
- fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
- if (fd >= 0)
- {
- /* Success! */
- curFileTLI = tli;
-
- /* Report recovery progress in PS display */
- snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
- xlogfname);
- set_ps_display(activitymsg);
-
- /* Track source of data in assorted state variables */
- readSource = source;
- XLogReceiptSource = source;
- /* In FROM_STREAM case, caller tracks receipt time, not me */
- if (source != XLOG_FROM_STREAM)
- XLogReceiptTime = GetCurrentTimestamp();
-
- return fd;
- }
- if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
- ereport(PANIC,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", path)));
- return -1;
-}
-
-/*
- * Open a logfile segment for reading (during recovery).
- *
- * This version searches for the segment with any TLI listed in expectedTLEs.
- */
-static int
-XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
-{
- char path[MAXPGPATH];
- ListCell *cell;
- int fd;
- List *tles;
-
- /*
- * Loop looking for a suitable timeline ID: we might need to read any of
- * the timelines listed in expectedTLEs.
- *
- * We expect curFileTLI on entry to be the TLI of the preceding file in
- * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
- * to go backwards; this prevents us from picking up the wrong file when a
- * parent timeline extends to higher segment numbers than the child we
- * want to read.
- *
- * If we haven't read the timeline history file yet, read it now, so that
- * we know which TLIs to scan. We don't save the list in expectedTLEs,
- * however, unless we actually find a valid segment. That way if there is
- * neither a timeline history file nor a WAL segment in the archive, and
- * streaming replication is set up, we'll read the timeline history file
- * streamed from the primary when we start streaming, instead of
- * recovering with a dummy history generated here.
- */
- if (expectedTLEs)
- tles = expectedTLEs;
- else
- tles = readTimeLineHistory(recoveryTargetTLI);
-
- foreach(cell, tles)
- {
- TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
- TimeLineID tli = hent->tli;
-
- if (tli < curFileTLI)
- break; /* don't bother looking at too-old TLIs */
-
- /*
- * Skip scanning the timeline ID that the logfile segment to read
- * doesn't belong to
- */
- if (hent->begin != InvalidXLogRecPtr)
- {
- XLogSegNo beginseg = 0;
-
- XLByteToSeg(hent->begin, beginseg, wal_segment_size);
-
- /*
- * The logfile segment that doesn't belong to the timeline is
- * older or newer than the segment that the timeline started or
- * ended at, respectively. It's sufficient to check only the
- * starting segment of the timeline here. Since the timelines are
- * scanned in descending order in this loop, any segments newer
- * than the ending segment should belong to newer timeline and
- * have already been read before. So it's not necessary to check
- * the ending segment of the timeline here.
- */
- if (segno < beginseg)
- continue;
- }
-
- if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
- {
- fd = XLogFileRead(segno, emode, tli,
- XLOG_FROM_ARCHIVE, true);
- if (fd != -1)
- {
- elog(DEBUG1, "got WAL segment from archive");
- if (!expectedTLEs)
- expectedTLEs = tles;
- return fd;
- }
- }
-
- if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
- {
- fd = XLogFileRead(segno, emode, tli,
- XLOG_FROM_PG_WAL, true);
- if (fd != -1)
- {
- if (!expectedTLEs)
- expectedTLEs = tles;
- return fd;
- }
- }
- }
-
- /* Couldn't find it. For simplicity, complain about front timeline */
- XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
- errno = ENOENT;
- ereport(emode,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", path)));
- return -1;
-}
-
/*
* Close the current logfile segment for writing.
*/
* 'switchpoint' is the current point in WAL where we switch to new timeline,
* and 'newTLI' is the new timeline we switch to.
*/
-static void
+void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
DIR *xldir;
}
/*
- * Attempt to read the next XLOG record.
+ * I/O routines for pg_control
*
- * Before first call, the reader needs to be positioned to the first record
- * by calling XLogBeginRead().
+ * *ControlFile is a buffer in shared memory that holds an image of the
+ * contents of pg_control. WriteControlFile() initializes pg_control
+ * given a preloaded buffer, ReadControlFile() loads the buffer from
+ * the pg_control file (during postmaster or standalone-backend startup),
+ * and UpdateControlFile() rewrites pg_control after we modify xlog state.
+ * InitControlFile() fills the buffer with initial values.
*
- * If no valid record is available, returns NULL, or fails if emode is PANIC.
- * (emode must be either PANIC, LOG). In standby mode, retries until a valid
- * record is available.
+ * For simplicity, WriteControlFile() initializes the fields of pg_control
+ * that are related to checking backend/database compatibility, and
+ * ReadControlFile() verifies they are correct. We could split out the
+ * I/O and compatibility-check functions, but there seems no need currently.
*/
-static XLogRecord *
-ReadRecord(XLogReaderState *xlogreader, int emode,
- bool fetching_ckpt, TimeLineID replayTLI)
-{
- XLogRecord *record;
- XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
- /* Pass through parameters to XLogPageRead */
- private->fetching_ckpt = fetching_ckpt;
- private->emode = emode;
- private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
- private->replayTLI = replayTLI;
+static void
+InitControlFile(uint64 sysidentifier)
+{
+ char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
- /* This is the first attempt to read this page. */
- lastSourceFailed = false;
+ /*
+ * Generate a random nonce. This is used for authentication requests that
+ * will fail because the user does not exist. The nonce is used to create
+ * a genuine-looking password challenge for the non-existent user, in lieu
+ * of an actual stored password.
+ */
+ if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
+ ereport(PANIC,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not generate secret authorization token")));
- for (;;)
- {
- char *errormsg;
-
- record = XLogReadRecord(xlogreader, &errormsg);
- if (record == NULL)
- {
- /*
- * When not in standby mode we find that WAL ends in an incomplete
- * record, keep track of that record. After recovery is done,
- * we'll write a record to indicate downstream WAL readers that
- * that portion is to be ignored.
- */
- if (!StandbyMode &&
- !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
- {
- abortedRecPtr = xlogreader->abortedRecPtr;
- missingContrecPtr = xlogreader->missingContrecPtr;
- }
-
- if (readFile >= 0)
- {
- close(readFile);
- readFile = -1;
- }
-
- /*
- * We only end up here without a message when XLogPageRead()
- * failed - in that case we already logged something. In
- * StandbyMode that only happens if we have been triggered, so we
- * shouldn't loop anymore in that case.
- */
- if (errormsg)
- ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
- (errmsg_internal("%s", errormsg) /* already translated */ ));
- }
-
- /*
- * Check page TLI is one of the expected values.
- */
- else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
- {
- char fname[MAXFNAMELEN];
- XLogSegNo segno;
- int32 offset;
-
- XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
- offset = XLogSegmentOffset(xlogreader->latestPagePtr,
- wal_segment_size);
- XLogFileName(fname, xlogreader->seg.ws_tli, segno,
- wal_segment_size);
- ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
- (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
- xlogreader->latestPageTLI,
- fname,
- offset)));
- record = NULL;
- }
-
- if (record)
- {
- /* Great, got a record */
- return record;
- }
- else
- {
- /* No valid record available from this source */
- lastSourceFailed = true;
-
- /*
- * If archive recovery was requested, but we were still doing
- * crash recovery, switch to archive recovery and retry using the
- * offline archive. We have now replayed all the valid WAL in
- * pg_wal, so we are presumably now consistent.
- *
- * We require that there's at least some valid WAL present in
- * pg_wal, however (!fetching_ckpt). We could recover using the
- * WAL from the archive, even if pg_wal is completely empty, but
- * we'd have no idea how far we'd have to replay to reach
- * consistency. So err on the safe side and give up.
- */
- if (!InArchiveRecovery && ArchiveRecoveryRequested &&
- !fetching_ckpt)
- {
- ereport(DEBUG1,
- (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
- InArchiveRecovery = true;
- if (StandbyModeRequested)
- StandbyMode = true;
-
- /* initialize minRecoveryPoint to this record */
- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
- if (ControlFile->minRecoveryPoint < xlogreader->EndRecPtr)
- {
- ControlFile->minRecoveryPoint = xlogreader->EndRecPtr;
- ControlFile->minRecoveryPointTLI = replayTLI;
- }
- /* update local copy */
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-
- /*
- * The startup process can update its local copy of
- * minRecoveryPoint from this point.
- */
- updateMinRecoveryPoint = true;
-
- UpdateControlFile();
-
- /*
- * We update SharedRecoveryState while holding the lock on
- * ControlFileLock so both states are consistent in shared
- * memory.
- */
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
- SpinLockRelease(&XLogCtl->info_lck);
-
- LWLockRelease(ControlFileLock);
-
- CheckRecoveryConsistency();
-
- /*
- * Before we retry, reset lastSourceFailed and currentSource
- * so that we will check the archive next.
- */
- lastSourceFailed = false;
- currentSource = XLOG_FROM_ANY;
-
- continue;
- }
-
- /* In standby mode, loop back to retry. Otherwise, give up. */
- if (StandbyMode && !CheckForStandbyTrigger())
- continue;
- else
- return NULL;
- }
- }
-}
-
-/*
- * Scan for new timelines that might have appeared in the archive since we
- * started recovery.
- *
- * If there are any, the function changes recovery target TLI to the latest
- * one and returns 'true'.
- */
-static bool
-rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
-{
- List *newExpectedTLEs;
- bool found;
- ListCell *cell;
- TimeLineID newtarget;
- TimeLineID oldtarget = recoveryTargetTLI;
- TimeLineHistoryEntry *currentTle = NULL;
-
- newtarget = findNewestTimeLine(recoveryTargetTLI);
- if (newtarget == recoveryTargetTLI)
- {
- /* No new timelines found */
- return false;
- }
-
- /*
- * Determine the list of expected TLIs for the new TLI
- */
-
- newExpectedTLEs = readTimeLineHistory(newtarget);
-
- /*
- * If the current timeline is not part of the history of the new timeline,
- * we cannot proceed to it.
- */
- found = false;
- foreach(cell, newExpectedTLEs)
- {
- currentTle = (TimeLineHistoryEntry *) lfirst(cell);
-
- if (currentTle->tli == recoveryTargetTLI)
- {
- found = true;
- break;
- }
- }
- if (!found)
- {
- ereport(LOG,
- (errmsg("new timeline %u is not a child of database system timeline %u",
- newtarget,
- replayTLI)));
- return false;
- }
-
- /*
- * The current timeline was found in the history file, but check that the
- * next timeline was forked off from it *after* the current recovery
- * location.
- */
- if (currentTle->end < replayLSN)
- {
- ereport(LOG,
- (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
- newtarget,
- replayTLI,
- LSN_FORMAT_ARGS(replayLSN))));
- return false;
- }
-
- /* The new timeline history seems valid. Switch target */
- recoveryTargetTLI = newtarget;
- list_free_deep(expectedTLEs);
- expectedTLEs = newExpectedTLEs;
-
- /*
- * As in StartupXLOG(), try to ensure we have all the history files
- * between the old target and new target in pg_wal.
- */
- restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
-
- ereport(LOG,
- (errmsg("new target timeline is %u",
- recoveryTargetTLI)));
-
- return true;
-}
-
-/*
- * I/O routines for pg_control
- *
- * *ControlFile is a buffer in shared memory that holds an image of the
- * contents of pg_control. WriteControlFile() initializes pg_control
- * given a preloaded buffer, ReadControlFile() loads the buffer from
- * the pg_control file (during postmaster or standalone-backend startup),
- * and UpdateControlFile() rewrites pg_control after we modify xlog state.
- * InitControlFile() fills the buffer with initial values.
- *
- * For simplicity, WriteControlFile() initializes the fields of pg_control
- * that are related to checking backend/database compatibility, and
- * ReadControlFile() verifies they are correct. We could split out the
- * I/O and compatibility-check functions, but there seems no need currently.
- */
-
-static void
-InitControlFile(uint64 sysidentifier)
-{
- char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
-
- /*
- * Generate a random nonce. This is used for authentication requests that
- * will fail because the user does not exist. The nonce is used to create
- * a genuine-looking password challenge for the non-existent user, in lieu
- * of an actual stored password.
- */
- if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
- ereport(PANIC,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("could not generate secret authorization token")));
-
- memset(ControlFile, 0, sizeof(ControlFileData));
- /* Initialize pg_control status fields */
- ControlFile->system_identifier = sysidentifier;
- memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
- ControlFile->state = DB_SHUTDOWNED;
- ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
+ memset(ControlFile, 0, sizeof(ControlFileData));
+ /* Initialize pg_control status fields */
+ ControlFile->system_identifier = sysidentifier;
+ memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
+ ControlFile->state = DB_SHUTDOWNED;
+ ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
/* Set important parameter values for use when replaying WAL */
ControlFile->MaxConnections = MaxConnections;
* Utility wrapper to update the control file. Note that the control
* file gets flushed.
*/
-void
+static void
UpdateControlFile(void)
{
update_controlfile(DataDir, ControlFile, true);
*/
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
- XLogCtl->SharedHotStandbyActive = false;
XLogCtl->InstallXLogFileSegmentActive = false;
- XLogCtl->SharedPromoteIsTriggered = false;
XLogCtl->WalWriterSleeping = false;
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
SpinLockInit(&XLogCtl->info_lck);
SpinLockInit(&XLogCtl->ulsn_lck);
- InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
- ConditionVariableInit(&XLogCtl->recoveryNotPausedCV);
}
/*
return buf;
}
-/*
- * See if there are any recovery signal files and if so, set state for
- * recovery.
- *
- * See if there is a recovery command file (recovery.conf), and if so
- * throw an ERROR since as of PG12 we no longer recognize that.
- */
-static void
-readRecoverySignalFile(void)
-{
- struct stat stat_buf;
-
- if (IsBootstrapProcessingMode())
- return;
-
- /*
- * Check for old recovery API file: recovery.conf
- */
- if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("using recovery command file \"%s\" is not supported",
- RECOVERY_COMMAND_FILE)));
-
- /*
- * Remove unused .done file, if present. Ignore if absent.
- */
- unlink(RECOVERY_COMMAND_DONE);
-
- /*
- * Check for recovery signal files and if found, fsync them since they
- * represent server state information. We don't sweat too much about the
- * possibility of fsync failure, however.
- *
- * If present, standby signal file takes precedence. If neither is present
- * then we won't enter archive recovery.
- */
- if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
- {
- int fd;
-
- fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
- if (fd >= 0)
- {
- (void) pg_fsync(fd);
- close(fd);
- }
- standby_signal_file_found = true;
- }
- else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
- {
- int fd;
-
- fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
- S_IRUSR | S_IWUSR);
- if (fd >= 0)
- {
- (void) pg_fsync(fd);
- close(fd);
- }
- recovery_signal_file_found = true;
- }
-
- StandbyModeRequested = false;
- ArchiveRecoveryRequested = false;
- if (standby_signal_file_found)
- {
- StandbyModeRequested = true;
- ArchiveRecoveryRequested = true;
- }
- else if (recovery_signal_file_found)
- {
- StandbyModeRequested = false;
- ArchiveRecoveryRequested = true;
- }
- else
- return;
-
- /*
- * We don't support standby mode in standalone backends; that requires
- * other processes such as the WAL receiver to be alive.
- */
- if (StandbyModeRequested && !IsUnderPostmaster)
- ereport(FATAL,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("standby mode is not supported by single-user servers")));
-}
-
-static void
-validateRecoveryParameters(void)
-{
- if (!ArchiveRecoveryRequested)
- return;
-
- /*
- * Check for compulsory parameters
- */
- if (StandbyModeRequested)
- {
- if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
- (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
- ereport(WARNING,
- (errmsg("specified neither primary_conninfo nor restore_command"),
- errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
- }
- else
- {
- if (recoveryRestoreCommand == NULL ||
- strcmp(recoveryRestoreCommand, "") == 0)
- ereport(FATAL,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("must specify restore_command when standby mode is not enabled")));
- }
-
- /*
- * Override any inconsistent requests. Note that this is a change of
- * behaviour in 9.5; prior to this we simply ignored a request to pause if
- * hot_standby = off, which was surprising behaviour.
- */
- if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
- !EnableHotStandby)
- recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
-
- /*
- * Final parsing of recovery_target_time string; see also
- * check_recovery_target_time().
- */
- if (recoveryTarget == RECOVERY_TARGET_TIME)
- {
- recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
- CStringGetDatum(recovery_target_time_string),
- ObjectIdGetDatum(InvalidOid),
- Int32GetDatum(-1)));
- }
-
- /*
- * If user specified recovery_target_timeline, validate it or compute the
- * "latest" value. We can't do this until after we've gotten the restore
- * command and set InArchiveRecovery, because we need to fetch timeline
- * history files from the archive.
- */
- if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
- {
- TimeLineID rtli = recoveryTargetTLIRequested;
-
- /* Timeline 1 does not have a history file, all else should */
- if (rtli != 1 && !existsTimeLineHistory(rtli))
- ereport(FATAL,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("recovery target timeline %u does not exist",
- rtli)));
- recoveryTargetTLI = rtli;
- }
- else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
- {
- /* We start the "latest" search from pg_control's timeline */
- recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
- }
- else
- {
- /*
- * else we just use the recoveryTargetTLI as already read from
- * ControlFile
- */
- Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
- }
-}
-
/*
* Initialize the first WAL segment on new timeline.
*/
}
/*
- * Extract timestamp from WAL record.
+ * Check to see if required parameters are set high enough on this server
+ * for various aspects of recovery operation.
*
- * If the record contains a timestamp, returns true, and saves the timestamp
- * in *recordXtime. If the record type has no timestamp, returns false.
- * Currently, only transaction commit/abort records and restore points contain
- * timestamps.
+ * Note that all the parameters which this function tests need to be
+ * listed in Administrator's Overview section in high-availability.sgml.
+ * If you change them, don't forget to update the list.
*/
-static bool
-getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
+static void
+CheckRequiredParameterValues(void)
{
- uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
- uint8 xact_info = info & XLOG_XACT_OPMASK;
- uint8 rmid = XLogRecGetRmid(record);
-
- if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
- {
- *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
- return true;
- }
- if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
- xact_info == XLOG_XACT_COMMIT_PREPARED))
- {
- *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
- return true;
- }
- if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
- xact_info == XLOG_XACT_ABORT_PREPARED))
+ /*
+ * For archive recovery, the WAL must be generated with at least 'replica'
+ * wal_level.
+ */
+ if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
{
- *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
- return true;
+ ereport(FATAL,
+ (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
+ errdetail("This happens if you temporarily set wal_level=minimal on the server."),
+ errhint("Use a backup taken after setting wal_level to higher than minimal.")));
}
- return false;
-}
-
-/*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG before the current record.
- *
- * Returns true if we are stopping, false otherwise. If stopping, some
- * information is saved in recoveryStopXid et al for use in annotating the
- * new timeline's history file.
- */
-static bool
-recoveryStopsBefore(XLogReaderState *record)
-{
- bool stopsHere = false;
- uint8 xact_info;
- bool isCommit;
- TimestampTz recordXtime = 0;
- TransactionId recordXid;
/*
- * Ignore recovery target settings when not in archive recovery (meaning
- * we are in crash recovery).
- */
- if (!ArchiveRecoveryRequested)
- return false;
-
- /* Check if we should stop as soon as reaching consistency */
- if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
- {
- ereport(LOG,
- (errmsg("recovery stopping after reaching consistency")));
-
- recoveryStopAfter = false;
- recoveryStopXid = InvalidTransactionId;
- recoveryStopLSN = InvalidXLogRecPtr;
- recoveryStopTime = 0;
- recoveryStopName[0] = '\0';
- return true;
- }
-
- /* Check if target LSN has been reached */
- if (recoveryTarget == RECOVERY_TARGET_LSN &&
- !recoveryTargetInclusive &&
- record->ReadRecPtr >= recoveryTargetLSN)
- {
- recoveryStopAfter = false;
- recoveryStopXid = InvalidTransactionId;
- recoveryStopLSN = record->ReadRecPtr;
- recoveryStopTime = 0;
- recoveryStopName[0] = '\0';
- ereport(LOG,
- (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryStopLSN))));
- return true;
- }
-
- /* Otherwise we only consider stopping before COMMIT or ABORT records. */
- if (XLogRecGetRmid(record) != RM_XACT_ID)
- return false;
-
- xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
-
- if (xact_info == XLOG_XACT_COMMIT)
- {
- isCommit = true;
- recordXid = XLogRecGetXid(record);
- }
- else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
- {
- xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
- xl_xact_parsed_commit parsed;
-
- isCommit = true;
- ParseCommitRecord(XLogRecGetInfo(record),
- xlrec,
- &parsed);
- recordXid = parsed.twophase_xid;
- }
- else if (xact_info == XLOG_XACT_ABORT)
- {
- isCommit = false;
- recordXid = XLogRecGetXid(record);
- }
- else if (xact_info == XLOG_XACT_ABORT_PREPARED)
- {
- xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
- xl_xact_parsed_abort parsed;
-
- isCommit = false;
- ParseAbortRecord(XLogRecGetInfo(record),
- xlrec,
- &parsed);
- recordXid = parsed.twophase_xid;
- }
- else
- return false;
-
- if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
- {
- /*
- * There can be only one transaction end record with this exact
- * transactionid
- *
- * when testing for an xid, we MUST test for equality only, since
- * transactions are numbered in the order they start, not the order
- * they complete. A higher numbered xid will complete before you about
- * 50% of the time...
- */
- stopsHere = (recordXid == recoveryTargetXid);
- }
-
- if (recoveryTarget == RECOVERY_TARGET_TIME &&
- getRecordTimestamp(record, &recordXtime))
- {
- /*
- * There can be many transactions that share the same commit time, so
- * we stop after the last one, if we are inclusive, or stop at the
- * first one if we are exclusive
- */
- if (recoveryTargetInclusive)
- stopsHere = (recordXtime > recoveryTargetTime);
- else
- stopsHere = (recordXtime >= recoveryTargetTime);
- }
-
- if (stopsHere)
- {
- recoveryStopAfter = false;
- recoveryStopXid = recordXid;
- recoveryStopTime = recordXtime;
- recoveryStopLSN = InvalidXLogRecPtr;
- recoveryStopName[0] = '\0';
-
- if (isCommit)
- {
- ereport(LOG,
- (errmsg("recovery stopping before commit of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- }
- else
- {
- ereport(LOG,
- (errmsg("recovery stopping before abort of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- }
- }
-
- return stopsHere;
-}
-
-/*
- * Same as recoveryStopsBefore, but called after applying the record.
- *
- * We also track the timestamp of the latest applied COMMIT/ABORT
- * record in XLogCtl->recoveryLastXTime.
- */
-static bool
-recoveryStopsAfter(XLogReaderState *record)
-{
- uint8 info;
- uint8 xact_info;
- uint8 rmid;
- TimestampTz recordXtime;
-
- /*
- * Ignore recovery target settings when not in archive recovery (meaning
- * we are in crash recovery).
- */
- if (!ArchiveRecoveryRequested)
- return false;
-
- info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
- rmid = XLogRecGetRmid(record);
-
- /*
- * There can be many restore points that share the same name; we stop at
- * the first one.
- */
- if (recoveryTarget == RECOVERY_TARGET_NAME &&
- rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
- {
- xl_restore_point *recordRestorePointData;
-
- recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
-
- if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
- {
- recoveryStopAfter = true;
- recoveryStopXid = InvalidTransactionId;
- recoveryStopLSN = InvalidXLogRecPtr;
- (void) getRecordTimestamp(record, &recoveryStopTime);
- strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
-
- ereport(LOG,
- (errmsg("recovery stopping at restore point \"%s\", time %s",
- recoveryStopName,
- timestamptz_to_str(recoveryStopTime))));
- return true;
- }
- }
-
- /* Check if the target LSN has been reached */
- if (recoveryTarget == RECOVERY_TARGET_LSN &&
- recoveryTargetInclusive &&
- record->ReadRecPtr >= recoveryTargetLSN)
- {
- recoveryStopAfter = true;
- recoveryStopXid = InvalidTransactionId;
- recoveryStopLSN = record->ReadRecPtr;
- recoveryStopTime = 0;
- recoveryStopName[0] = '\0';
- ereport(LOG,
- (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryStopLSN))));
- return true;
- }
-
- if (rmid != RM_XACT_ID)
- return false;
-
- xact_info = info & XLOG_XACT_OPMASK;
-
- if (xact_info == XLOG_XACT_COMMIT ||
- xact_info == XLOG_XACT_COMMIT_PREPARED ||
- xact_info == XLOG_XACT_ABORT ||
- xact_info == XLOG_XACT_ABORT_PREPARED)
- {
- TransactionId recordXid;
-
- /* Update the last applied transaction timestamp */
- if (getRecordTimestamp(record, &recordXtime))
- SetLatestXTime(recordXtime);
-
- /* Extract the XID of the committed/aborted transaction */
- if (xact_info == XLOG_XACT_COMMIT_PREPARED)
- {
- xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
- xl_xact_parsed_commit parsed;
-
- ParseCommitRecord(XLogRecGetInfo(record),
- xlrec,
- &parsed);
- recordXid = parsed.twophase_xid;
- }
- else if (xact_info == XLOG_XACT_ABORT_PREPARED)
- {
- xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
- xl_xact_parsed_abort parsed;
-
- ParseAbortRecord(XLogRecGetInfo(record),
- xlrec,
- &parsed);
- recordXid = parsed.twophase_xid;
- }
- else
- recordXid = XLogRecGetXid(record);
-
- /*
- * There can be only one transaction end record with this exact
- * transactionid
- *
- * when testing for an xid, we MUST test for equality only, since
- * transactions are numbered in the order they start, not the order
- * they complete. A higher numbered xid will complete before you about
- * 50% of the time...
- */
- if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
- recordXid == recoveryTargetXid)
- {
- recoveryStopAfter = true;
- recoveryStopXid = recordXid;
- recoveryStopTime = recordXtime;
- recoveryStopLSN = InvalidXLogRecPtr;
- recoveryStopName[0] = '\0';
-
- if (xact_info == XLOG_XACT_COMMIT ||
- xact_info == XLOG_XACT_COMMIT_PREPARED)
- {
- ereport(LOG,
- (errmsg("recovery stopping after commit of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- }
- else if (xact_info == XLOG_XACT_ABORT ||
- xact_info == XLOG_XACT_ABORT_PREPARED)
- {
- ereport(LOG,
- (errmsg("recovery stopping after abort of transaction %u, time %s",
- recoveryStopXid,
- timestamptz_to_str(recoveryStopTime))));
- }
- return true;
- }
- }
-
- /* Check if we should stop as soon as reaching consistency */
- if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
- {
- ereport(LOG,
- (errmsg("recovery stopping after reaching consistency")));
-
- recoveryStopAfter = true;
- recoveryStopXid = InvalidTransactionId;
- recoveryStopTime = 0;
- recoveryStopLSN = InvalidXLogRecPtr;
- recoveryStopName[0] = '\0';
- return true;
- }
-
- return false;
-}
-
-/*
- * Create a comment for the history file to explain why and where
- * timeline changed.
- */
-static char *
-getRecoveryStopReason(void)
-{
- char reason[200];
-
- if (recoveryTarget == RECOVERY_TARGET_XID)
- snprintf(reason, sizeof(reason),
- "%s transaction %u",
- recoveryStopAfter ? "after" : "before",
- recoveryStopXid);
- else if (recoveryTarget == RECOVERY_TARGET_TIME)
- snprintf(reason, sizeof(reason),
- "%s %s\n",
- recoveryStopAfter ? "after" : "before",
- timestamptz_to_str(recoveryStopTime));
- else if (recoveryTarget == RECOVERY_TARGET_LSN)
- snprintf(reason, sizeof(reason),
- "%s LSN %X/%X\n",
- recoveryStopAfter ? "after" : "before",
- LSN_FORMAT_ARGS(recoveryStopLSN));
- else if (recoveryTarget == RECOVERY_TARGET_NAME)
- snprintf(reason, sizeof(reason),
- "at restore point \"%s\"",
- recoveryStopName);
- else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
- snprintf(reason, sizeof(reason), "reached consistency");
- else
- snprintf(reason, sizeof(reason), "no recovery target specified");
-
- return pstrdup(reason);
-}
-
-/*
- * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
- *
- * endOfRecovery is true if the recovery target is reached and
- * the paused state starts at the end of recovery because of
- * recovery_target_action=pause, and false otherwise.
- */
-static void
-recoveryPausesHere(bool endOfRecovery)
-{
- /* Don't pause unless users can connect! */
- if (!LocalHotStandbyActive)
- return;
-
- /* Don't pause after standby promotion has been triggered */
- if (LocalPromoteIsTriggered)
- return;
-
- if (endOfRecovery)
- ereport(LOG,
- (errmsg("pausing at the end of recovery"),
- errhint("Execute pg_wal_replay_resume() to promote.")));
- else
- ereport(LOG,
- (errmsg("recovery has paused"),
- errhint("Execute pg_wal_replay_resume() to continue.")));
-
- /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
- while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
- {
- HandleStartupProcInterrupts();
- if (CheckForStandbyTrigger())
- return;
-
- /*
- * If recovery pause is requested then set it paused. While we are in
- * the loop, user might resume and pause again so set this every time.
- */
- ConfirmRecoveryPaused();
-
- /*
- * We wait on a condition variable that will wake us as soon as the
- * pause ends, but we use a timeout so we can check the above exit
- * condition periodically too.
- */
- ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
- WAIT_EVENT_RECOVERY_PAUSE);
- }
- ConditionVariableCancelSleep();
-}
-
-/*
- * Get the current state of the recovery pause request.
- */
-RecoveryPauseState
-GetRecoveryPauseState(void)
-{
- RecoveryPauseState state;
-
- SpinLockAcquire(&XLogCtl->info_lck);
- state = XLogCtl->recoveryPauseState;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return state;
-}
-
-/*
- * Set the recovery pause state.
- *
- * If recovery pause is requested then sets the recovery pause state to
- * 'pause requested' if it is not already 'paused'. Otherwise, sets it
- * to 'not paused' to resume the recovery. The recovery pause will be
- * confirmed by the ConfirmRecoveryPaused.
- */
-void
-SetRecoveryPause(bool recoveryPause)
-{
- SpinLockAcquire(&XLogCtl->info_lck);
-
- if (!recoveryPause)
- XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
- else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
- XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
-
- SpinLockRelease(&XLogCtl->info_lck);
-
- if (!recoveryPause)
- ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV);
-}
-
-/*
- * Confirm the recovery pause by setting the recovery pause state to
- * RECOVERY_PAUSED.
- */
-static void
-ConfirmRecoveryPaused(void)
-{
- /* If recovery pause is requested then set it paused */
- SpinLockAcquire(&XLogCtl->info_lck);
- if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
- XLogCtl->recoveryPauseState = RECOVERY_PAUSED;
- SpinLockRelease(&XLogCtl->info_lck);
-}
-
-/*
- * When recovery_min_apply_delay is set, we wait long enough to make sure
- * certain record types are applied at least that interval behind the primary.
- *
- * Returns true if we waited.
- *
- * Note that the delay is calculated between the WAL record log time and
- * the current time on standby. We would prefer to keep track of when this
- * standby received each WAL record, which would allow a more consistent
- * approach and one not affected by time synchronisation issues, but that
- * is significantly more effort and complexity for little actual gain in
- * usability.
- */
-static bool
-recoveryApplyDelay(XLogReaderState *record)
-{
- uint8 xact_info;
- TimestampTz xtime;
- TimestampTz delayUntil;
- long msecs;
-
- /* nothing to do if no delay configured */
- if (recovery_min_apply_delay <= 0)
- return false;
-
- /* no delay is applied on a database not yet consistent */
- if (!reachedConsistency)
- return false;
-
- /* nothing to do if crash recovery is requested */
- if (!ArchiveRecoveryRequested)
- return false;
-
- /*
- * Is it a COMMIT record?
- *
- * We deliberately choose not to delay aborts since they have no effect on
- * MVCC. We already allow replay of records that don't have a timestamp,
- * so there is already opportunity for issues caused by early conflicts on
- * standbys.
- */
- if (XLogRecGetRmid(record) != RM_XACT_ID)
- return false;
-
- xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
-
- if (xact_info != XLOG_XACT_COMMIT &&
- xact_info != XLOG_XACT_COMMIT_PREPARED)
- return false;
-
- if (!getRecordTimestamp(record, &xtime))
- return false;
-
- delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
-
- /*
- * Exit without arming the latch if it's already past time to apply this
- * record
- */
- msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
- if (msecs <= 0)
- return false;
-
- while (true)
- {
- ResetLatch(&XLogCtl->recoveryWakeupLatch);
-
- /*
- * This might change recovery_min_apply_delay or the trigger file's
- * location.
- */
- HandleStartupProcInterrupts();
-
- if (CheckForStandbyTrigger())
- break;
-
- /*
- * Recalculate delayUntil as recovery_min_apply_delay could have
- * changed while waiting in this loop.
- */
- delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
-
- /*
- * Wait for difference between GetCurrentTimestamp() and delayUntil.
- */
- msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
- delayUntil);
-
- if (msecs <= 0)
- break;
-
- elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
-
- (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
- WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
- msecs,
- WAIT_EVENT_RECOVERY_APPLY_DELAY);
- }
- return true;
-}
-
-/*
- * Save timestamp of latest processed commit/abort record.
- *
- * We keep this in XLogCtl, not a simple static variable, so that it can be
- * seen by processes other than the startup process. Note in particular
- * that CreateRestartPoint is executed in the checkpointer.
- */
-static void
-SetLatestXTime(TimestampTz xtime)
-{
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->recoveryLastXTime = xtime;
- SpinLockRelease(&XLogCtl->info_lck);
-}
-
-/*
- * Fetch timestamp of latest processed commit/abort record.
- */
-TimestampTz
-GetLatestXTime(void)
-{
- TimestampTz xtime;
-
- SpinLockAcquire(&XLogCtl->info_lck);
- xtime = XLogCtl->recoveryLastXTime;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return xtime;
-}
-
-/*
- * Save timestamp of the next chunk of WAL records to apply.
- *
- * We keep this in XLogCtl, not a simple static variable, so that it can be
- * seen by all backends.
- */
-static void
-SetCurrentChunkStartTime(TimestampTz xtime)
-{
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->currentChunkStartTime = xtime;
- SpinLockRelease(&XLogCtl->info_lck);
-}
-
-/*
- * Fetch timestamp of latest processed commit/abort record.
- * Startup process maintains an accurate local copy in XLogReceiptTime
- */
-TimestampTz
-GetCurrentChunkReplayStartTime(void)
-{
- TimestampTz xtime;
-
- SpinLockAcquire(&XLogCtl->info_lck);
- xtime = XLogCtl->currentChunkStartTime;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return xtime;
-}
-
-/*
- * Returns time of receipt of current chunk of XLOG data, as well as
- * whether it was received from streaming replication or from archives.
- */
-void
-GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
-{
- /*
- * This must be executed in the startup process, since we don't export the
- * relevant state to shared memory.
- */
- Assert(InRecovery);
-
- *rtime = XLogReceiptTime;
- *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
-}
-
-/*
- * Note that text field supplied is a parameter name and does not require
- * translation
- */
-static void
-RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
-{
- if (currValue < minValue)
- {
- if (LocalHotStandbyActive)
- {
- bool warned_for_promote = false;
-
- ereport(WARNING,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("hot standby is not possible because of insufficient parameter settings"),
- errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
- param_name,
- currValue,
- minValue)));
-
- SetRecoveryPause(true);
-
- ereport(LOG,
- (errmsg("recovery has paused"),
- errdetail("If recovery is unpaused, the server will shut down."),
- errhint("You can then restart the server after making the necessary configuration changes.")));
-
- while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
- {
- HandleStartupProcInterrupts();
-
- if (CheckForStandbyTrigger())
- {
- if (!warned_for_promote)
- ereport(WARNING,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("promotion is not possible because of insufficient parameter settings"),
-
- /*
- * Repeat the detail from above so it's easy to find
- * in the log.
- */
- errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
- param_name,
- currValue,
- minValue),
- errhint("Restart the server after making the necessary configuration changes.")));
- warned_for_promote = true;
- }
-
- /*
- * If recovery pause is requested then set it paused. While
- * we are in the loop, user might resume and pause again so
- * set this every time.
- */
- ConfirmRecoveryPaused();
-
- /*
- * We wait on a condition variable that will wake us as soon
- * as the pause ends, but we use a timeout so we can check the
- * above conditions periodically too.
- */
- ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
- WAIT_EVENT_RECOVERY_PAUSE);
- }
- ConditionVariableCancelSleep();
- }
-
- ereport(FATAL,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("recovery aborted because of insufficient parameter settings"),
- /* Repeat the detail from above so it's easy to find in the log. */
- errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
- param_name,
- currValue,
- minValue),
- errhint("You can restart the server after making the necessary configuration changes.")));
- }
-}
-
-/*
- * Check to see if required parameters are set high enough on this server
- * for various aspects of recovery operation.
- *
- * Note that all the parameters which this function tests need to be
- * listed in Administrator's Overview section in high-availability.sgml.
- * If you change them, don't forget to update the list.
- */
-static void
-CheckRequiredParameterValues(void)
-{
- /*
- * For archive recovery, the WAL must be generated with at least 'replica'
- * wal_level.
- */
- if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
- {
- ereport(FATAL,
- (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
- errdetail("This happens if you temporarily set wal_level=minimal on the server."),
- errhint("Use a backup taken after setting wal_level to higher than minimal.")));
- }
-
- /*
- * For Hot Standby, the WAL must be generated with 'replica' mode, and we
- * must have at least as many backend slots as the primary.
+ * For Hot Standby, the WAL must be generated with 'replica' mode, and we
+ * must have at least as many backend slots as the primary.
*/
if (ArchiveRecoveryRequested && EnableHotStandby)
{
XLogCtlInsert *Insert;
CheckPoint checkPoint;
bool wasShutdown;
- bool reachedRecoveryTarget = false;
- bool haveBackupLabel = false;
- bool haveTblspcMap = false;
- XLogRecPtr RecPtr,
- LastRec,
- checkPointLoc,
- EndOfLog;
+ bool haveTblspcMap;
+ bool haveBackupLabel;
+ XLogRecPtr EndOfLog;
TimeLineID EndOfLogTLI;
- TimeLineID replayTLI,
- newTLI;
+ TimeLineID newTLI;
bool performedWalRecovery;
- char *recoveryStopReason;
- XLogRecord *record;
+ EndOfWalRecoveryInfo *endOfRecoveryInfo;
+ XLogRecPtr abortedRecPtr;
+ XLogRecPtr missingContrecPtr;
TransactionId oldestActiveXID;
- bool backupEndRequired = false;
- bool backupFromStandby = false;
- XLogReaderState *xlogreader;
- XLogPageReadPrivate private;
bool promoted = false;
- struct stat st;
/*
* We should have an aux process resource owner to use, and we should not
* this temporary data.
*
* - There might be data which we had written, intending to fsync it, but
- * which we had not actually fsync'd yet. Therefore, a power failure in
- * the near future might cause earlier unflushed writes to be lost, even
- * though more recent data written to disk from here on would be
- * persisted. To avoid that, fsync the entire data directory.
- */
- if (ControlFile->state != DB_SHUTDOWNED &&
- ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
- {
- RemoveTempXlogFiles();
- SyncDataDirectory();
- }
-
- /*---- BEGIN InitWalRecovery ----*/
-
- /*
- * Initialize on the assumption we want to recover to the latest timeline
- * that's active according to pg_control.
- */
- if (ControlFile->minRecoveryPointTLI >
- ControlFile->checkPointCopy.ThisTimeLineID)
- recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
- else
- recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
-
- /*
- * Check for signal files, and if so set up state for offline recovery
- */
- readRecoverySignalFile();
- validateRecoveryParameters();
-
- if (ArchiveRecoveryRequested)
- {
- if (StandbyModeRequested)
- ereport(LOG,
- (errmsg("entering standby mode")));
- else if (recoveryTarget == RECOVERY_TARGET_XID)
- ereport(LOG,
- (errmsg("starting point-in-time recovery to XID %u",
- recoveryTargetXid)));
- else if (recoveryTarget == RECOVERY_TARGET_TIME)
- ereport(LOG,
- (errmsg("starting point-in-time recovery to %s",
- timestamptz_to_str(recoveryTargetTime))));
- else if (recoveryTarget == RECOVERY_TARGET_NAME)
- ereport(LOG,
- (errmsg("starting point-in-time recovery to \"%s\"",
- recoveryTargetName)));
- else if (recoveryTarget == RECOVERY_TARGET_LSN)
- ereport(LOG,
- (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryTargetLSN))));
- else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
- ereport(LOG,
- (errmsg("starting point-in-time recovery to earliest consistent point")));
- else
- ereport(LOG,
- (errmsg("starting archive recovery")));
- }
-
- /*
- * Take ownership of the wakeup latch if we're going to sleep during
- * recovery.
- */
- if (ArchiveRecoveryRequested)
- OwnLatch(&XLogCtl->recoveryWakeupLatch);
-
- /* Set up XLOG reader facility */
- MemSet(&private, 0, sizeof(XLogPageReadPrivate));
- xlogreader =
- XLogReaderAllocate(wal_segment_size, NULL,
- XL_ROUTINE(.page_read = &XLogPageRead,
- .segment_open = NULL,
- .segment_close = wal_segment_close),
- &private);
- if (!xlogreader)
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory"),
- errdetail("Failed while allocating a WAL reading processor.")));
- xlogreader->system_identifier = ControlFile->system_identifier;
-
- /*
- * Allocate two page buffers dedicated to WAL consistency checks. We do
- * it this way, rather than just making static arrays, for two reasons:
- * (1) no need to waste the storage in most instantiations of the backend;
- * (2) a static char array isn't guaranteed to have any particular
- * alignment, whereas palloc() will provide MAXALIGN'd storage.
- */
- replay_image_masked = (char *) palloc(BLCKSZ);
- primary_image_masked = (char *) palloc(BLCKSZ);
-
- if (read_backup_label(&checkPointLoc, &replayTLI, &backupEndRequired,
- &backupFromStandby))
- {
- List *tablespaces = NIL;
-
- /*
- * Archive recovery was requested, and thanks to the backup label
- * file, we know how far we need to replay to reach consistency. Enter
- * archive recovery directly.
- */
- InArchiveRecovery = true;
- if (StandbyModeRequested)
- StandbyMode = true;
-
- /*
- * When a backup_label file is present, we want to roll forward from
- * the checkpoint it identifies, rather than using pg_control.
- */
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true,
- replayTLI);
- if (record != NULL)
- {
- memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
- wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
- ereport(DEBUG1,
- (errmsg_internal("checkpoint record is at %X/%X",
- LSN_FORMAT_ARGS(checkPointLoc))));
- InRecovery = true; /* force recovery even if SHUTDOWNED */
-
- /*
- * Make sure that REDO location exists. This may not be the case
- * if there was a crash during an online backup, which left a
- * backup_label around that references a WAL segment that's
- * already been archived.
- */
- if (checkPoint.redo < checkPointLoc)
- {
- XLogBeginRead(xlogreader, checkPoint.redo);
- if (!ReadRecord(xlogreader, LOG, false,
- checkPoint.ThisTimeLineID))
- ereport(FATAL,
- (errmsg("could not find redo location referenced by checkpoint record"),
- errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
- "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
- "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
- DataDir, DataDir, DataDir)));
- }
- }
- else
- {
- ereport(FATAL,
- (errmsg("could not locate required checkpoint record"),
- errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
- "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
- "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
- DataDir, DataDir, DataDir)));
- wasShutdown = false; /* keep compiler quiet */
- }
-
- /* read the tablespace_map file if present and create symlinks. */
- if (read_tablespace_map(&tablespaces))
- {
- ListCell *lc;
-
- foreach(lc, tablespaces)
- {
- tablespaceinfo *ti = lfirst(lc);
- char *linkloc;
-
- linkloc = psprintf("pg_tblspc/%s", ti->oid);
-
- /*
- * Remove the existing symlink if any and Create the symlink
- * under PGDATA.
- */
- remove_tablespace_symlink(linkloc);
-
- if (symlink(ti->path, linkloc) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create symbolic link \"%s\": %m",
- linkloc)));
-
- pfree(ti->oid);
- pfree(ti->path);
- pfree(ti);
- }
-
- /* set flag to delete it later */
- haveTblspcMap = true;
- }
-
- /* set flag to delete it later */
- haveBackupLabel = true;
- }
- else
- {
- /*
- * If tablespace_map file is present without backup_label file, there
- * is no use of such file. There is no harm in retaining it, but it
- * is better to get rid of the map file so that we don't have any
- * redundant file in data directory and it will avoid any sort of
- * confusion. It seems prudent though to just rename the file out of
- * the way rather than delete it completely, also we ignore any error
- * that occurs in rename operation as even if map file is present
- * without backup_label file, it is harmless.
- */
- if (stat(TABLESPACE_MAP, &st) == 0)
- {
- unlink(TABLESPACE_MAP_OLD);
- if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
- ereport(LOG,
- (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
- TABLESPACE_MAP, BACKUP_LABEL_FILE),
- errdetail("File \"%s\" was renamed to \"%s\".",
- TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
- else
- ereport(LOG,
- (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
- TABLESPACE_MAP, BACKUP_LABEL_FILE),
- errdetail("Could not rename file \"%s\" to \"%s\": %m.",
- TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
- }
-
- /*
- * It's possible that archive recovery was requested, but we don't
- * know how far we need to replay the WAL before we reach consistency.
- * This can happen for example if a base backup is taken from a
- * running server using an atomic filesystem snapshot, without calling
- * pg_start/stop_backup. Or if you just kill a running primary server
- * and put it into archive recovery by creating a recovery signal
- * file.
- *
- * Our strategy in that case is to perform crash recovery first,
- * replaying all the WAL present in pg_wal, and only enter archive
- * recovery after that.
- *
- * But usually we already know how far we need to replay the WAL (up
- * to minRecoveryPoint, up to backupEndPoint, or until we see an
- * end-of-backup record), and we can enter archive recovery directly.
- */
- if (ArchiveRecoveryRequested &&
- (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
- ControlFile->backupEndRequired ||
- ControlFile->backupEndPoint != InvalidXLogRecPtr ||
- ControlFile->state == DB_SHUTDOWNED))
- {
- InArchiveRecovery = true;
- if (StandbyModeRequested)
- StandbyMode = true;
- }
-
- /* Get the last valid checkpoint record. */
- checkPointLoc = ControlFile->checkPoint;
- RedoStartLSN = ControlFile->checkPointCopy.redo;
- replayTLI = ControlFile->checkPointCopy.ThisTimeLineID;
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true,
- replayTLI);
- if (record != NULL)
- {
- ereport(DEBUG1,
- (errmsg_internal("checkpoint record is at %X/%X",
- LSN_FORMAT_ARGS(checkPointLoc))));
- }
- else
- {
- /*
- * We used to attempt to go back to a secondary checkpoint record
- * here, but only when not in standby mode. We now just fail if we
- * can't read the last checkpoint because this allows us to
- * simplify processing around checkpoints.
- */
- ereport(PANIC,
- (errmsg("could not locate a valid checkpoint record")));
- }
- memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
- wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
- }
-
- /*
- * If the location of the checkpoint record is not on the expected
- * timeline in the history of the requested timeline, we cannot proceed:
- * the backup is not part of the history of the requested timeline.
- */
- Assert(expectedTLEs); /* was initialized by reading checkpoint
- * record */
- if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
- checkPoint.ThisTimeLineID)
- {
- XLogRecPtr switchpoint;
-
- /*
- * tliSwitchPoint will throw an error if the checkpoint's timeline is
- * not in expectedTLEs at all.
- */
- switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
- ereport(FATAL,
- (errmsg("requested timeline %u is not a child of this server's history",
- recoveryTargetTLI),
- errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
- LSN_FORMAT_ARGS(ControlFile->checkPoint),
- ControlFile->checkPointCopy.ThisTimeLineID,
- LSN_FORMAT_ARGS(switchpoint))));
- }
-
- /*
- * The min recovery point should be part of the requested timeline's
- * history, too.
- */
- if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
- tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
- ControlFile->minRecoveryPointTLI)
- ereport(FATAL,
- (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
- recoveryTargetTLI,
- LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
- ControlFile->minRecoveryPointTLI)));
-
- LastRec = RecPtr = checkPointLoc;
-
- ereport(DEBUG1,
- (errmsg_internal("redo record is at %X/%X; shutdown %s",
- LSN_FORMAT_ARGS(checkPoint.redo),
- wasShutdown ? "true" : "false")));
- ereport(DEBUG1,
- (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
- U64FromFullTransactionId(checkPoint.nextXid),
- checkPoint.nextOid)));
- ereport(DEBUG1,
- (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
- checkPoint.nextMulti, checkPoint.nextMultiOffset)));
- ereport(DEBUG1,
- (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
- checkPoint.oldestXid, checkPoint.oldestXidDB)));
- ereport(DEBUG1,
- (errmsg_internal("oldest MultiXactId: %u, in database %u",
- checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
- ereport(DEBUG1,
- (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
- checkPoint.oldestCommitTsXid,
- checkPoint.newestCommitTsXid)));
-
- /* sanity checks on the checkpoint record */
- if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
- ereport(PANIC,
- (errmsg("invalid next transaction ID")));
- if (checkPoint.redo > checkPointLoc)
- ereport(PANIC,
- (errmsg("invalid redo in checkpoint record")));
-
- /*
- * Check whether we need to force recovery from WAL. If it appears to
- * have been a clean shutdown and we did not have a recovery signal file,
- * then assume no recovery needed.
- */
- if (checkPoint.redo < checkPointLoc)
- {
- if (wasShutdown)
- ereport(PANIC,
- (errmsg("invalid redo record in shutdown checkpoint")));
- InRecovery = true;
- }
- else if (ControlFile->state != DB_SHUTDOWNED)
- InRecovery = true;
- else if (ArchiveRecoveryRequested)
- {
- /* force recovery due to presence of recovery signal file */
- InRecovery = true;
- }
-
- /*
- * If recovery is needed, update our in-memory copy of pg_control to show
- * that we are recovering and to show the selected checkpoint as the place
- * we are starting from. We also mark pg_control with any minimum recovery
- * stop point obtained from a backup history file.
- *
- * We don't write the changes to disk yet, though. Only do that after
- * initializing various subsystems.
- */
- if (InRecovery)
- {
- DBState dbstate_at_startup;
-
- dbstate_at_startup = ControlFile->state;
- if (InArchiveRecovery)
- {
- ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
- }
- else
- {
- ereport(LOG,
- (errmsg("database system was not properly shut down; "
- "automatic recovery in progress")));
- if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
- ereport(LOG,
- (errmsg("crash recovery starts in timeline %u "
- "and has target timeline %u",
- ControlFile->checkPointCopy.ThisTimeLineID,
- recoveryTargetTLI)));
- ControlFile->state = DB_IN_CRASH_RECOVERY;
- }
- ControlFile->checkPoint = checkPointLoc;
- ControlFile->checkPointCopy = checkPoint;
- if (InArchiveRecovery)
- {
- /* initialize minRecoveryPoint if not set yet */
- if (ControlFile->minRecoveryPoint < checkPoint.redo)
- {
- ControlFile->minRecoveryPoint = checkPoint.redo;
- ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
- }
- }
-
- /*
- * Set backupStartPoint if we're starting recovery from a base backup.
- *
- * Also set backupEndPoint and use minRecoveryPoint as the backup end
- * location if we're starting recovery from a base backup which was
- * taken from a standby. In this case, the database system status in
- * pg_control must indicate that the database was already in recovery.
- * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
- * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
- * before reaching this point; e.g. because restore_command or
- * primary_conninfo were faulty.
- *
- * Any other state indicates that the backup somehow became corrupted
- * and we can't sensibly continue with recovery.
- */
- if (haveBackupLabel)
- {
- ControlFile->backupStartPoint = checkPoint.redo;
- ControlFile->backupEndRequired = backupEndRequired;
-
- if (backupFromStandby)
- {
- if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
- dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
- ereport(FATAL,
- (errmsg("backup_label contains data inconsistent with control file"),
- errhint("This means that the backup is corrupted and you will "
- "have to use another backup for recovery.")));
- ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
- }
- }
+ * which we had not actually fsync'd yet. Therefore, a power failure in
+ * the near future might cause earlier unflushed writes to be lost, even
+ * though more recent data written to disk from here on would be
+ * persisted. To avoid that, fsync the entire data directory.
+ */
+ if (ControlFile->state != DB_SHUTDOWNED &&
+ ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+ {
+ RemoveTempXlogFiles();
+ SyncDataDirectory();
}
- /*---- END InitWalRecovery ----*/
+ /*
+ * Prepare for WAL recovery if needed.
+ *
+ * InitWalRecovery analyzes the control file and the backup label file, if
+ * any. It updates the in-memory ControlFile buffer according to the
+ * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
+ * It also applies the tablespace map file, if any.
+ */
+ InitWalRecovery(ControlFile, &wasShutdown,
+ &haveBackupLabel, &haveTblspcMap);
+ checkPoint = ControlFile->checkPointCopy;
/* initialize shared memory variables from the checkpoint record */
ShmemVariableCache->nextXid = checkPoint.nextXid;
else
XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
- /*
- * We must replay WAL entries using the same TimeLineID they were created
- * under, so temporarily adopt the TLI indicated by the checkpoint (see
- * also xlog_redo()).
- */
- replayTLI = checkPoint.ThisTimeLineID;
-
/*
* Copy any missing timeline history files between 'now' and the recovery
* target timeline from archive to pg_wal. While we don't need those files
* are small, so it's better to copy them unnecessarily than not copy them
* and regret later.
*/
- restoreTimeLineHistoryFiles(replayTLI, recoveryTargetTLI);
+ restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
/*
* Before running in recovery, scan pg_twophase and fill in its status to
RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
doPageWrites = lastFullPageWrites;
- /*
- * Start recovery assuming that the final record isn't lost.
- */
- abortedRecPtr = InvalidXLogRecPtr;
- missingContrecPtr = InvalidXLogRecPtr;
-
/* REDO */
if (InRecovery)
{
- int rmid;
-
/* Initialize state for RecoveryInProgress() */
SpinLockAcquire(&XLogCtl->info_lck);
if (InArchiveRecovery)
*/
if (InArchiveRecovery)
{
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
}
else
{
- minRecoveryPoint = InvalidXLogRecPtr;
- minRecoveryPointTLI = 0;
+ LocalMinRecoveryPoint = InvalidXLogRecPtr;
+ LocalMinRecoveryPointTLI = 0;
}
/*
}
}
- /*---- BEGIN PerformWalRecovery ----*/
-
- /*
- * Initialize shared variables for tracking progress of WAL replay, as
- * if we had just replayed the record before the REDO location (or the
- * checkpoint record itself, if it's a shutdown checkpoint).
- */
- SpinLockAcquire(&XLogCtl->info_lck);
- if (checkPoint.redo < checkPointLoc)
- XLogCtl->replayEndRecPtr = checkPoint.redo;
- else
- XLogCtl->replayEndRecPtr = xlogreader->EndRecPtr;
- XLogCtl->replayEndTLI = replayTLI;
- XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
- XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
- XLogCtl->recoveryLastXTime = 0;
- XLogCtl->currentChunkStartTime = 0;
- XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
- SpinLockRelease(&XLogCtl->info_lck);
-
- /* Also ensure XLogReceiptTime has a sane value */
- XLogReceiptTime = GetCurrentTimestamp();
-
- /*
- * Let postmaster know we've started redo now, so that it can launch
- * the archiver if necessary.
- */
- if (IsUnderPostmaster)
- SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-
- /*
- * Allow read-only connections immediately if we're consistent
- * already.
- */
- CheckRecoveryConsistency();
-
- /*
- * Find the first record that logically follows the checkpoint --- it
- * might physically precede it, though.
- */
- if (checkPoint.redo < checkPointLoc)
- {
- /* back up to find the record */
- XLogBeginRead(xlogreader, checkPoint.redo);
- record = ReadRecord(xlogreader, PANIC, false, replayTLI);
- }
- else
- {
- /* just have to read next record after CheckPoint */
- Assert(RecPtr == checkPointLoc);
- record = ReadRecord(xlogreader, LOG, false, replayTLI);
- }
-
- if (record != NULL)
- {
- ErrorContextCallback errcallback;
- TimestampTz xtime;
- PGRUsage ru0;
-
- pg_rusage_init(&ru0);
-
- InRedo = true;
-
- /* Initialize resource managers */
- for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
- {
- if (RmgrTable[rmid].rm_startup != NULL)
- RmgrTable[rmid].rm_startup();
- }
-
- ereport(LOG,
- (errmsg("redo starts at %X/%X",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
-
- /* Prepare to report progress of the redo phase. */
- if (!StandbyMode)
- begin_startup_progress_phase();
-
- /*
- * main redo apply loop
- */
- do
- {
- bool switchedTLI = false;
-
- if (!StandbyMode)
- ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
-
-#ifdef WAL_DEBUG
- if (XLOG_DEBUG ||
- (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
- (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
- {
- StringInfoData buf;
-
- initStringInfo(&buf);
- appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
- LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
- xlog_outrec(&buf, xlogreader);
- appendStringInfoString(&buf, " - ");
- xlog_outdesc(&buf, xlogreader);
- elog(LOG, "%s", buf.data);
- pfree(buf.data);
- }
-#endif
-
- /* Handle interrupt signals of startup process */
- HandleStartupProcInterrupts();
-
- /*
- * Pause WAL replay, if requested by a hot-standby session via
- * SetRecoveryPause().
- *
- * Note that we intentionally don't take the info_lck spinlock
- * here. We might therefore read a slightly stale value of
- * the recoveryPause flag, but it can't be very stale (no
- * worse than the last spinlock we did acquire). Since a
- * pause request is a pretty asynchronous thing anyway,
- * possibly responding to it one WAL record later than we
- * otherwise would is a minor issue, so it doesn't seem worth
- * adding another spinlock cycle to prevent that.
- */
- if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
- RECOVERY_NOT_PAUSED)
- recoveryPausesHere(false);
-
- /*
- * Have we reached our recovery target?
- */
- if (recoveryStopsBefore(xlogreader))
- {
- reachedRecoveryTarget = true;
- break;
- }
-
- /*
- * If we've been asked to lag the primary, wait on latch until
- * enough time has passed.
- */
- if (recoveryApplyDelay(xlogreader))
- {
- /*
- * We test for paused recovery again here. If user sets
- * delayed apply, it may be because they expect to pause
- * recovery in case of problems, so we must test again
- * here otherwise pausing during the delay-wait wouldn't
- * work.
- */
- if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
- RECOVERY_NOT_PAUSED)
- recoveryPausesHere(false);
- }
-
- /* Setup error traceback support for ereport() */
- errcallback.callback = rm_redo_error_callback;
- errcallback.arg = (void *) xlogreader;
- errcallback.previous = error_context_stack;
- error_context_stack = &errcallback;
-
- /*
- * ShmemVariableCache->nextXid must be beyond record's xid.
- */
- AdvanceNextFullTransactionIdPastXid(record->xl_xid);
-
- /*
- * Before replaying this record, check if this record causes
- * the current timeline to change. The record is already
- * considered to be part of the new timeline, so we update
- * replayTLI before replaying it. That's important so that
- * replayEndTLI, which is recorded as the minimum recovery
- * point's TLI if recovery stops after this record, is set
- * correctly.
- */
- if (record->xl_rmid == RM_XLOG_ID)
- {
- TimeLineID newReplayTLI = replayTLI;
- TimeLineID prevReplayTLI = replayTLI;
- uint8 info = record->xl_info & ~XLR_INFO_MASK;
-
- if (info == XLOG_CHECKPOINT_SHUTDOWN)
- {
- CheckPoint checkPoint;
-
- memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
- newReplayTLI = checkPoint.ThisTimeLineID;
- prevReplayTLI = checkPoint.PrevTimeLineID;
- }
- else if (info == XLOG_END_OF_RECOVERY)
- {
- xl_end_of_recovery xlrec;
-
- memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
- newReplayTLI = xlrec.ThisTimeLineID;
- prevReplayTLI = xlrec.PrevTimeLineID;
- }
-
- if (newReplayTLI != replayTLI)
- {
- /* Check that it's OK to switch to this TLI */
- checkTimeLineSwitch(xlogreader->EndRecPtr,
- newReplayTLI,
- prevReplayTLI, replayTLI);
-
- /* Following WAL records should be run with new TLI */
- replayTLI = newReplayTLI;
- switchedTLI = true;
- }
- }
-
- /*
- * Update shared replayEndRecPtr before replaying this record,
- * so that XLogFlush will update minRecoveryPoint correctly.
- */
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->replayEndRecPtr = xlogreader->EndRecPtr;
- XLogCtl->replayEndTLI = replayTLI;
- SpinLockRelease(&XLogCtl->info_lck);
-
- /*
- * If we are attempting to enter Hot Standby mode, process
- * XIDs we see
- */
- if (standbyState >= STANDBY_INITIALIZED &&
- TransactionIdIsValid(record->xl_xid))
- RecordKnownAssignedTransactionIds(record->xl_xid);
-
- /* Now apply the WAL record itself */
- RmgrTable[record->xl_rmid].rm_redo(xlogreader);
-
- /*
- * After redo, check whether the backup pages associated with
- * the WAL record are consistent with the existing pages. This
- * check is done only if consistency check is enabled for this
- * record.
- */
- if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
- checkXLogConsistency(xlogreader);
-
- /* Pop the error context stack */
- error_context_stack = errcallback.previous;
-
- /*
- * Update lastReplayedEndRecPtr after this record has been
- * successfully replayed.
- */
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
- XLogCtl->lastReplayedTLI = replayTLI;
- SpinLockRelease(&XLogCtl->info_lck);
-
- /*
- * If rm_redo called XLogRequestWalReceiverReply, then we wake
- * up the receiver so that it notices the updated
- * lastReplayedEndRecPtr and sends a reply to the primary.
- */
- if (doRequestWalReceiverReply)
- {
- doRequestWalReceiverReply = false;
- WalRcvForceReply();
- }
-
- /* Remember this record as the last-applied one */
- LastRec = xlogreader->ReadRecPtr;
-
- /* Allow read-only connections if we're consistent now */
- CheckRecoveryConsistency();
-
- /* Is this a timeline switch? */
- if (switchedTLI)
- {
- /*
- * Before we continue on the new timeline, clean up any
- * (possibly bogus) future WAL segments on the old
- * timeline.
- */
- RemoveNonParentXlogFiles(xlogreader->EndRecPtr, replayTLI);
-
- /*
- * Wake up any walsenders to notice that we are on a new
- * timeline.
- */
- if (AllowCascadeReplication())
- WalSndWakeup();
- }
-
- /* Exit loop if we reached inclusive recovery target */
- if (recoveryStopsAfter(xlogreader))
- {
- reachedRecoveryTarget = true;
- break;
- }
-
- /* Else, try to fetch the next WAL record */
- record = ReadRecord(xlogreader, LOG, false, replayTLI);
- } while (record != NULL);
-
- /*
- * end of main redo apply loop
- */
-
- if (reachedRecoveryTarget)
- {
- if (!reachedConsistency)
- ereport(FATAL,
- (errmsg("requested recovery stop point is before consistent recovery point")));
-
- /*
- * This is the last point where we can restart recovery with a
- * new recovery target, if we shutdown and begin again. After
- * this, Resource Managers may choose to do permanent
- * corrective actions at end of recovery.
- */
- switch (recoveryTargetAction)
- {
- case RECOVERY_TARGET_ACTION_SHUTDOWN:
-
- /*
- * exit with special return code to request shutdown
- * of postmaster. Log messages issued from
- * postmaster.
- */
- proc_exit(3);
-
- case RECOVERY_TARGET_ACTION_PAUSE:
- SetRecoveryPause(true);
- recoveryPausesHere(true);
-
- /* drop into promote */
-
- case RECOVERY_TARGET_ACTION_PROMOTE:
- break;
- }
- }
-
- /* Allow resource managers to do any required cleanup. */
- for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
- {
- if (RmgrTable[rmid].rm_cleanup != NULL)
- RmgrTable[rmid].rm_cleanup();
- }
-
- ereport(LOG,
- (errmsg("redo done at %X/%X system usage: %s",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
- pg_rusage_show(&ru0))));
- xtime = GetLatestXTime();
- if (xtime)
- ereport(LOG,
- (errmsg("last completed transaction was at log time %s",
- timestamptz_to_str(xtime))));
-
- InRedo = false;
- }
- else
- {
- /* there are no WAL records following the checkpoint */
- ereport(LOG,
- (errmsg("redo is not required")));
-
- }
-
/*
- * This check is intentionally after the above log messages that
- * indicate how far recovery went.
+ * We're all set for replaying the WAL now. Do it.
*/
- if (ArchiveRecoveryRequested &&
- recoveryTarget != RECOVERY_TARGET_UNSET &&
- !reachedRecoveryTarget)
- ereport(FATAL,
- (errmsg("recovery ended before configured recovery target was reached")));
-
- /*---- END PerformWalRecovery ----*/
+ PerformWalRecovery();
performedWalRecovery = true;
}
- /*---- BEGIN FinishWalRecovery ----*/
-
- /*
- * Kill WAL receiver, if it's still running, before we continue to write
- * the startup checkpoint and aborted-contrecord records. It will trump
- * over these records and subsequent ones if it's still alive when we
- * start writing WAL.
- */
- XLogShutdownWalRcv();
-
- /*
- * We are now done reading the xlog from stream. Turn off streaming
- * recovery to force fetching the files (which would be required at end of
- * recovery, e.g., timeline history file) from archive or pg_wal.
- *
- * Note that standby mode must be turned off after killing WAL receiver,
- * i.e., calling XLogShutdownWalRcv().
- */
- Assert(!WalRcvStreaming());
- StandbyMode = false;
-
- /*
- * Determine where to start writing WAL next.
- *
- * When recovery ended in an incomplete record, write a WAL record about
- * that and continue after it. In all other cases, re-fetch the last
- * valid or last applied record, so we can identify the exact endpoint of
- * what we consider the valid portion of WAL.
- */
- XLogBeginRead(xlogreader, LastRec);
- record = ReadRecord(xlogreader, PANIC, false, replayTLI);
- EndOfLog = xlogreader->EndRecPtr;
-
/*
- * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
- * the end-of-log. It could be different from the timeline that EndOfLog
- * nominally belongs to, if there was a timeline switch in that segment,
- * and we were reading the old WAL from a segment belonging to a higher
- * timeline.
+ * Finish WAL recovery.
*/
- EndOfLogTLI = xlogreader->seg.ws_tli;
-
- if (ArchiveRecoveryRequested)
- {
- /*
- * We are no longer in archive recovery state.
- *
- * We are now done reading the old WAL. Turn off archive fetching if
- * it was active.
- */
- Assert(InArchiveRecovery);
- InArchiveRecovery = false;
-
- /*
- * If the ending log segment is still open, close it (to avoid
- * problems on Windows with trying to rename or delete an open file).
- */
- if (readFile >= 0)
- {
- close(readFile);
- readFile = -1;
- }
- }
-
- recoveryStopReason = getRecoveryStopReason();
-
- /*---- END FinishWalRecovery ----*/
+ endOfRecoveryInfo = FinishWalRecovery();
+ EndOfLog = endOfRecoveryInfo->endOfLog;
+ EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
+ abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
+ missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
/*
* Complain if we did not roll forward far enough to render the backup
* dump consistent. Note: it is indeed okay to look at the local variable
- * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
- * be further ahead --- ControlFile->minRecoveryPoint cannot have been
- * advanced beyond the WAL we processed.
+ * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
+ * might be further ahead --- ControlFile->minRecoveryPoint cannot have
+ * been advanced beyond the WAL we processed.
*/
if (InRecovery &&
- (EndOfLog < minRecoveryPoint ||
+ (EndOfLog < LocalMinRecoveryPoint ||
!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
{
/*
*
* In a normal crash recovery, we can just extend the timeline we were in.
*/
- newTLI = replayTLI;
+ newTLI = endOfRecoveryInfo->lastRecTLI;
if (ArchiveRecoveryRequested)
{
newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
/*
* Make a writable copy of the last WAL segment. (Note that we also
- * have a copy of the last block of the old WAL in readBuf; we will
- * use that below.)
+ * have a copy of the last block of the old WAL in
+ * endOfRecovery->lastPage; we will use that below.)
*/
XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
* Remove the signal files out of the way, so that we don't
* accidentally re-enter archive recovery mode in a subsequent crash.
*/
- if (standby_signal_file_found)
+ if (endOfRecoveryInfo->standby_signal_file_found)
durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
- if (recovery_signal_file_found)
+ if (endOfRecoveryInfo->recovery_signal_file_found)
durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
/*
* between here and writing the end-of-recovery record.
*/
writeTimeLineHistory(newTLI, recoveryTargetTLI,
- EndOfLog, recoveryStopReason);
+ EndOfLog, endOfRecoveryInfo->recoveryStopReason);
ereport(LOG,
(errmsg("archive recovery complete")));
/* Save the selected TimeLineID in shared memory, too */
XLogCtl->InsertTimeLineID = newTLI;
- XLogCtl->PrevTimeLineID = replayTLI;
+ XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
/*
* Actually, if WAL ended in an incomplete record, skip the parts that
* previous incarnation.
*/
Insert = &XLogCtl->Insert;
- Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
+ Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
/*
- * Tricky point here: readBuf contains the *last* block that the LastRec
+ * Tricky point here: lastPage contains the *last* block that the LastRec
* record spans, not the one it starts in. The last block is indeed the
* one we want to use.
*/
char *page;
int len;
int firstIdx;
- XLogRecPtr pageBeginPtr;
-
- pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
- Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+ len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
+ Assert(len < XLOG_BLCKSZ);
/* Copy the valid part of the last block, and zero the rest */
page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
- len = EndOfLog % XLOG_BLCKSZ;
- memcpy(page, xlogreader->readBuf, len);
+ memcpy(page, endOfRecoveryInfo->lastPage, XLOG_BLCKSZ);
memset(page + len, 0, XLOG_BLCKSZ - len);
- XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
- XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
+ XLogCtl->xlblocks[firstIdx] = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
+ XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
}
else
{
/* Reload shared-memory state for prepared transactions */
RecoverPreparedTransactions();
- /*---- BEGIN ShutdownWalRecovery ----*/
-
/* Shut down xlogreader */
- if (readFile >= 0)
- {
- close(readFile);
- readFile = -1;
- }
- XLogReaderFree(xlogreader);
-
- if (ArchiveRecoveryRequested)
- {
- char recoveryPath[MAXPGPATH];
-
- /*
- * Since there might be a partial WAL segment named RECOVERYXLOG, get
- * rid of it.
- */
- snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
- unlink(recoveryPath); /* ignore any error */
-
- /* Get rid of any remaining recovered timeline-history file, too */
- snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
- unlink(recoveryPath); /* ignore any error */
- }
-
- /*
- * We don't need the latch anymore. It's not strictly necessary to disown
- * it, but let's do it for the sake of tidiness.
- */
- if (ArchiveRecoveryRequested)
- DisownLatch(&XLogCtl->recoveryWakeupLatch);
-
- /*---- END ShutdownWalRecovery ----*/
+ ShutdownWalRecovery();
/* Enable WAL writes for this backend only. */
LocalSetXLogInsertAllowed();
{
Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
- abortedRecPtr = InvalidXLogRecPtr;
- missingContrecPtr = InvalidXLogRecPtr;
}
/*
}
/*
- * Checks if recovery has reached a consistent state. When consistency is
- * reached and we have a valid starting standby snapshot, tell postmaster
- * that it can start accepting read-only connections.
+ * Callback from PerformWalRecovery(), called when we switch from crash
+ * recovery to archive recovery mode. Updates the control file accordingly.
*/
-static void
-CheckRecoveryConsistency(void)
+void
+SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
{
- XLogRecPtr lastReplayedEndRecPtr;
+ /* initialize minRecoveryPoint to this record */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+ if (ControlFile->minRecoveryPoint < EndRecPtr)
+ {
+ ControlFile->minRecoveryPoint = EndRecPtr;
+ ControlFile->minRecoveryPointTLI = replayTLI;
+ }
+ /* update local copy */
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
/*
- * During crash recovery, we don't reach a consistent state until we've
- * replayed all the WAL.
+ * The startup process can update its local copy of minRecoveryPoint from
+ * this point.
*/
- if (XLogRecPtrIsInvalid(minRecoveryPoint))
- return;
-
- Assert(InArchiveRecovery);
+ updateMinRecoveryPoint = true;
- /*
- * assume that we are called in the startup process, and hence don't need
- * a lock to read lastReplayedEndRecPtr
- */
- lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
+ UpdateControlFile();
/*
- * Have we reached the point where our base backup was completed?
+ * We update SharedRecoveryState while holding the lock on ControlFileLock
+ * so both states are consistent in shared memory.
*/
- if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
- ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
- {
- /*
- * We have reached the end of base backup, as indicated by pg_control.
- * The data on disk is now consistent. Reset backupStartPoint and
- * backupEndPoint, and update minRecoveryPoint to make sure we don't
- * allow starting up at an earlier point even if recovery is stopped
- * and restarted soon after this.
- */
- elog(DEBUG1, "end of backup reached");
-
- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
- if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
- ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
-
- ControlFile->backupStartPoint = InvalidXLogRecPtr;
- ControlFile->backupEndPoint = InvalidXLogRecPtr;
- ControlFile->backupEndRequired = false;
- UpdateControlFile();
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+ SpinLockRelease(&XLogCtl->info_lck);
- LWLockRelease(ControlFileLock);
- }
+ LWLockRelease(ControlFileLock);
+}
+/*
+ * Callback from PerformWalRecovery(), called when we reach the end of backup.
+ * Updates the control file accordingly.
+ */
+void
+ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
+{
/*
- * Have we passed our safe starting point? Note that minRecoveryPoint is
- * known to be incorrectly set if ControlFile->backupEndRequired, until
- * the XLOG_BACKUP_END arrives to advise us of the correct
- * minRecoveryPoint. All we know prior to that is that we're not
- * consistent yet.
+ * We have reached the end of base backup, as indicated by pg_control. The
+ * data on disk is now consistent (unless minRecovery point is further
+ * ahead, which can happen if we crashed during previous recovery). Reset
+ * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
+ * make sure we don't allow starting up at an earlier point even if
+ * recovery is stopped and restarted soon after this.
*/
- if (!reachedConsistency && !ControlFile->backupEndRequired &&
- minRecoveryPoint <= lastReplayedEndRecPtr &&
- XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
- {
- /*
- * Check to see if the XLOG sequence contained any unresolved
- * references to uninitialized pages.
- */
- XLogCheckInvalidPages();
-
- reachedConsistency = true;
- ereport(LOG,
- (errmsg("consistent recovery state reached at %X/%X",
- LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
- }
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- /*
- * Have we got a valid starting snapshot that will allow queries to be
- * run? If so, we can tell postmaster that the database is consistent now,
- * enabling connections.
- */
- if (standbyState == STANDBY_SNAPSHOT_READY &&
- !LocalHotStandbyActive &&
- reachedConsistency &&
- IsUnderPostmaster)
+ if (ControlFile->minRecoveryPoint < EndRecPtr)
{
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->SharedHotStandbyActive = true;
- SpinLockRelease(&XLogCtl->info_lck);
+ ControlFile->minRecoveryPoint = EndRecPtr;
+ ControlFile->minRecoveryPointTLI = tli;
+ }
- LocalHotStandbyActive = true;
+ ControlFile->backupStartPoint = InvalidXLogRecPtr;
+ ControlFile->backupEndPoint = InvalidXLogRecPtr;
+ ControlFile->backupEndRequired = false;
+ UpdateControlFile();
- SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
- }
+ LWLockRelease(ControlFileLock);
}
/*
* fully out of recovery mode and already accepting queries.
*/
if (ArchiveRecoveryRequested && IsUnderPostmaster &&
- LocalPromoteIsTriggered)
+ PromoteIsTriggered())
{
promoted = true;
return retval;
}
-/*
- * Is HotStandby active yet? This is only important in special backends
- * since normal backends won't ever be able to connect until this returns
- * true. Postmaster knows this by way of signal, not via shared memory.
- *
- * Unlike testing standbyState, this works in any process that's connected to
- * shared memory. (And note that standbyState alone doesn't tell the truth
- * anyway.)
- */
-bool
-HotStandbyActive(void)
-{
- /*
- * We check shared state each time only until Hot Standby is active. We
- * can't de-activate Hot Standby, so there's no need to keep checking
- * after the shared variable has once been seen true.
- */
- if (LocalHotStandbyActive)
- return true;
- else
- {
- /* spinlock is essential on machines with weak memory ordering! */
- SpinLockAcquire(&XLogCtl->info_lck);
- LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return LocalHotStandbyActive;
- }
-}
-
-/*
- * Like HotStandbyActive(), but to be used only in WAL replay code,
- * where we don't need to ask any other process what the state is.
- */
-bool
-HotStandbyActiveInReplay(void)
-{
- Assert(AmStartupProcess() || !IsPostmasterEnvironment);
- return LocalHotStandbyActive;
-}
-
/*
* Is this process allowed to insert new WAL records?
*
return oldXLogAllowed;
}
-/*
- * Subroutine to try to fetch and validate a prior checkpoint record.
- *
- * whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 0 for "other" (backup_label)
- */
-static XLogRecord *
-ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI)
-{
- XLogRecord *record;
- uint8 info;
-
- if (!XRecOffIsValid(RecPtr))
- {
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint link in control file")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint link in backup_label file")));
- break;
- }
- return NULL;
- }
-
- XLogBeginRead(xlogreader, RecPtr);
- record = ReadRecord(xlogreader, LOG, true, replayTLI);
-
- if (record == NULL)
- {
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint record")));
- break;
- }
- return NULL;
- }
- if (record->xl_rmid != RM_XLOG_ID)
- {
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid resource manager ID in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
- break;
- }
- return NULL;
- }
- info = record->xl_info & ~XLR_INFO_MASK;
- if (info != XLOG_CHECKPOINT_SHUTDOWN &&
- info != XLOG_CHECKPOINT_ONLINE)
- {
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid xl_info in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid xl_info in checkpoint record")));
- break;
- }
- return NULL;
- }
- if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
- {
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid length of primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid length of checkpoint record")));
- break;
- }
- return NULL;
- }
- return record;
-}
-
/*
* Return the current Redo pointer from shared memory.
*
ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
/* update local copy */
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
}
if (flags & CHECKPOINT_IS_SHUTDOWN)
ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
END_CRIT_SECTION();
}
-/*
- * Check that it's OK to switch to new timeline during recovery.
- *
- * 'lsn' is the address of the shutdown checkpoint record we're about to
- * replay. (Currently, timeline can only change at a shutdown checkpoint).
- */
-static void
-checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
- TimeLineID replayTLI)
-{
- /* Check that the record agrees on what the current (old) timeline is */
- if (prevTLI != replayTLI)
- ereport(PANIC,
- (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
- prevTLI, replayTLI)));
-
- /*
- * The new timeline better be in the list of timelines we expect to see,
- * according to the timeline history. It should also not decrease.
- */
- if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
- ereport(PANIC,
- (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- newTLI, replayTLI)));
-
- /*
- * If we have not yet reached min recovery point, and we're about to
- * switch to a timeline greater than the timeline of the min recovery
- * point: trouble. After switching to the new timeline, we could not
- * possibly visit the min recovery point on the correct timeline anymore.
- * This can happen if there is a newer timeline in the archive that
- * branched before the timeline the min recovery point is on, and you
- * attempt to do PITR to the new timeline.
- */
- if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
- lsn < minRecoveryPoint &&
- newTLI > minRecoveryPointTLI)
- ereport(PANIC,
- (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
- newTLI,
- LSN_FORMAT_ARGS(minRecoveryPoint),
- minRecoveryPointTLI)));
-
- /* Looks good */
-}
-
/*
* XLOG resource manager's routines
*
* Definitions of info values are in include/catalog/pg_control.h, though
* not all record types are related to control file updates.
+ *
+ * NOTE: Some XLOG record types that are directly related to WAL recovery
+ * are handled in xlogrecovery_redo().
*/
void
xlog_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
XLogRecPtr lsn = record->EndRecPtr;
- TimeLineID replayTLI;
-
- /* No other process can change this, so we can read it without a lock. */
- replayTLI = XLogCtl->replayEndTLI;
/*
* In XLOG rmgr, backup blocks are only used by XLOG_FPI and
else if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
+ TimeLineID replayTLI;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
/* In a SHUTDOWN checkpoint, believe the counters exactly */
* We should've already switched to the new TLI before replaying this
* record.
*/
+ (void) GetCurrentReplayRecPtr(&replayTLI);
if (checkPoint.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
+ TimeLineID replayTLI;
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
/* In an ONLINE checkpoint, treat the XID counter as a minimum */
SpinLockRelease(&XLogCtl->info_lck);
/* TLI should not change in an on-line checkpoint */
+ (void) GetCurrentReplayRecPtr(&replayTLI);
if (checkPoint.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
}
else if (info == XLOG_OVERWRITE_CONTRECORD)
{
- xl_overwrite_contrecord xlrec;
-
- memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
- VerifyOverwriteContrecord(&xlrec, record);
+ /* nothing to do here, handled in xlogrecovery_redo() */
}
else if (info == XLOG_END_OF_RECOVERY)
{
xl_end_of_recovery xlrec;
+ TimeLineID replayTLI;
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
* We should've already switched to the new TLI before replaying this
* record.
*/
+ (void) GetCurrentReplayRecPtr(&replayTLI);
if (xlrec.ThisTimeLineID != replayTLI)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
}
else if (info == XLOG_RESTORE_POINT)
{
- /* nothing to do here */
+ /* nothing to do here, handled in xlogrecovery.c */
}
else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
{
}
else if (info == XLOG_BACKUP_END)
{
- XLogRecPtr startpoint;
-
- memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
-
- if (ControlFile->backupStartPoint == startpoint)
- {
- /*
- * We have reached the end of base backup, the point where
- * pg_stop_backup() was done. The data on disk is now consistent.
- * Reset backupStartPoint, and update minRecoveryPoint to make
- * sure we don't allow starting up at an earlier point even if
- * recovery is stopped and restarted soon after this.
- */
- elog(DEBUG1, "end of backup reached");
-
- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
- if (ControlFile->minRecoveryPoint < lsn)
- {
- ControlFile->minRecoveryPoint = lsn;
- ControlFile->minRecoveryPointTLI = replayTLI;
- }
- ControlFile->backupStartPoint = InvalidXLogRecPtr;
- ControlFile->backupEndRequired = false;
- UpdateControlFile();
-
- LWLockRelease(ControlFileLock);
- }
+ /* nothing to do here, handled in xlogrecovery_redo() */
}
else if (info == XLOG_PARAMETER_CHANGE)
{
*/
if (InArchiveRecovery)
{
- minRecoveryPoint = ControlFile->minRecoveryPoint;
- minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+ LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
}
- if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
+ if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
{
+ TimeLineID replayTLI;
+
+ (void) GetCurrentReplayRecPtr(&replayTLI);
ControlFile->minRecoveryPoint = lsn;
ControlFile->minRecoveryPointTLI = replayTLI;
}
}
}
-/*
- * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
- */
-static void
-VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
-{
- if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
- elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
- LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
- LSN_FORMAT_ARGS(state->overwrittenRecPtr));
-
- ereport(LOG,
- (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
- LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
- timestamptz_to_str(xlrec->overwrite_time))));
-
- /* Verifying the record should only happen once */
- state->overwrittenRecPtr = InvalidXLogRecPtr;
-}
-
-#ifdef WAL_DEBUG
-
-static void
-xlog_outrec(StringInfo buf, XLogReaderState *record)
-{
- appendStringInfo(buf, "prev %X/%X; xid %u",
- LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
- XLogRecGetXid(record));
-
- appendStringInfo(buf, "; len %u",
- XLogRecGetDataLen(record));
-
- xlog_block_info(buf, record);
-}
-#endif /* WAL_DEBUG */
-
-/*
- * Returns a string giving information about all the blocks in an
- * XLogRecord.
- */
-static void
-xlog_block_info(StringInfo buf, XLogReaderState *record)
-{
- int block_id;
-
- /* decode block references */
- for (block_id = 0; block_id <= record->max_block_id; block_id++)
- {
- RelFileNode rnode;
- ForkNumber forknum;
- BlockNumber blk;
-
- if (!XLogRecHasBlockRef(record, block_id))
- continue;
-
- XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
- if (forknum != MAIN_FORKNUM)
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
- block_id,
- rnode.spcNode, rnode.dbNode, rnode.relNode,
- forknum,
- blk);
- else
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
- block_id,
- rnode.spcNode, rnode.dbNode, rnode.relNode,
- blk);
- if (XLogRecHasBlockImage(record, block_id))
- appendStringInfoString(buf, " FPW");
- }
-}
-
-/*
- * Returns a string describing an XLogRecord, consisting of its identity
- * optionally followed by a colon, a space, and a further description.
- */
-static void
-xlog_outdesc(StringInfo buf, XLogReaderState *record)
-{
- RmgrId rmid = XLogRecGetRmid(record);
- uint8 info = XLogRecGetInfo(record);
- const char *id;
-
- appendStringInfoString(buf, RmgrTable[rmid].rm_name);
- appendStringInfoChar(buf, '/');
-
- id = RmgrTable[rmid].rm_identify(info);
- if (id == NULL)
- appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
- else
- appendStringInfo(buf, "%s: ", id);
-
- RmgrTable[rmid].rm_desc(buf, record);
-}
-
-
/*
* Return the (possible) sync flag used for opening a file, depending on the
* value of the GUC wal_sync_method.
already_done = true;
}
-/*
- * Get latest redo apply position.
- *
- * Exported to allow WALReceiver to read the pointer directly.
- */
-XLogRecPtr
-GetXLogReplayRecPtr(TimeLineID *replayTLI)
-{
- XLogRecPtr recptr;
- TimeLineID tli;
-
- SpinLockAcquire(&XLogCtl->info_lck);
- recptr = XLogCtl->lastReplayedEndRecPtr;
- tli = XLogCtl->lastReplayedTLI;
- SpinLockRelease(&XLogCtl->info_lck);
-
- if (replayTLI)
- *replayTLI = tli;
- return recptr;
-}
-
/*
* Get latest WAL insert pointer
*/
/*
* Get latest WAL write pointer
*/
-XLogRecPtr
-GetXLogWriteRecPtr(void)
-{
- SpinLockAcquire(&XLogCtl->info_lck);
- LogwrtResult = XLogCtl->LogwrtResult;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return LogwrtResult.Write;
-}
-
-/*
- * Returns the redo pointer of the last checkpoint or restartpoint. This is
- * the oldest point in WAL that we still need, if we have to restart recovery.
- */
-void
-GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
-{
- LWLockAcquire(ControlFileLock, LW_SHARED);
- *oldrecptr = ControlFile->checkPointCopy.redo;
- *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
- LWLockRelease(ControlFileLock);
-}
-
-/*
- * read_backup_label: check to see if a backup_label file is present
- *
- * If we see a backup_label during recovery, we assume that we are recovering
- * from a backup dump file, and we therefore roll forward from the checkpoint
- * identified by the label file, NOT what pg_control says. This avoids the
- * problem that pg_control might have been archived one or more checkpoints
- * later than the start of the dump, and so if we rely on it as the start
- * point, we will fail to restore a consistent database state.
- *
- * Returns true if a backup_label was found (and fills the checkpoint
- * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
- * returns false if not. If this backup_label came from a streamed backup,
- * *backupEndRequired is set to true. If this backup_label was created during
- * recovery, *backupFromStandby is set to true.
- *
- * Also sets the global variable RedoStartLSN with the LSN read from the
- * backup file.
- */
-static bool
-read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
- bool *backupEndRequired, bool *backupFromStandby)
-{
- char startxlogfilename[MAXFNAMELEN];
- TimeLineID tli_from_walseg,
- tli_from_file;
- FILE *lfp;
- char ch;
- char backuptype[20];
- char backupfrom[20];
- char backuplabel[MAXPGPATH];
- char backuptime[128];
- uint32 hi,
- lo;
-
- /* suppress possible uninitialized-variable warnings */
- *checkPointLoc = InvalidXLogRecPtr;
- *backupLabelTLI = 0;
- *backupEndRequired = false;
- *backupFromStandby = false;
-
- /*
- * See if label file is present
- */
- lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
- if (!lfp)
- {
- if (errno != ENOENT)
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not read file \"%s\": %m",
- BACKUP_LABEL_FILE)));
- return false; /* it's not there, all is fine */
- }
-
- /*
- * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
- * is pretty crude, but we are not expecting any variability in the file
- * format).
- */
- if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
- &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
- ereport(FATAL,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
- RedoStartLSN = ((uint64) hi) << 32 | lo;
- if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
- &hi, &lo, &ch) != 3 || ch != '\n')
- ereport(FATAL,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
- *checkPointLoc = ((uint64) hi) << 32 | lo;
-
- /*
- * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
- * from an older backup anyway, but since the information on it is not
- * strictly required, don't error out if it's missing for some reason.
- */
- if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
- {
- if (strcmp(backuptype, "streamed") == 0)
- *backupEndRequired = true;
- }
-
- if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
- {
- if (strcmp(backupfrom, "standby") == 0)
- *backupFromStandby = true;
- }
-
- /*
- * Parse START TIME and LABEL. Those are not mandatory fields for recovery
- * but checking for their presence is useful for debugging and the next
- * sanity checks. Cope also with the fact that the result buffers have a
- * pre-allocated size, hence if the backup_label file has been generated
- * with strings longer than the maximum assumed here an incorrect parsing
- * happens. That's fine as only minor consistency checks are done
- * afterwards.
- */
- if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
- ereport(DEBUG1,
- (errmsg_internal("backup time %s in file \"%s\"",
- backuptime, BACKUP_LABEL_FILE)));
-
- if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
- ereport(DEBUG1,
- (errmsg_internal("backup label %s in file \"%s\"",
- backuplabel, BACKUP_LABEL_FILE)));
-
- /*
- * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
- * it as a sanity check if present.
- */
- if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
- {
- if (tli_from_walseg != tli_from_file)
- ereport(FATAL,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
- errdetail("Timeline ID parsed is %u, but expected %u.",
- tli_from_file, tli_from_walseg)));
-
- ereport(DEBUG1,
- (errmsg_internal("backup timeline %u in file \"%s\"",
- tli_from_file, BACKUP_LABEL_FILE)));
- }
-
- if (ferror(lfp) || FreeFile(lfp))
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not read file \"%s\": %m",
- BACKUP_LABEL_FILE)));
-
- *backupLabelTLI = tli_from_walseg;
-
- return true;
-}
-
-/*
- * read_tablespace_map: check to see if a tablespace_map file is present
- *
- * If we see a tablespace_map file during recovery, we assume that we are
- * recovering from a backup dump file, and we therefore need to create symlinks
- * as per the information present in tablespace_map file.
- *
- * Returns true if a tablespace_map file was found (and fills *tablespaces
- * with a tablespaceinfo struct for each tablespace listed in the file);
- * returns false if not.
- */
-static bool
-read_tablespace_map(List **tablespaces)
-{
- tablespaceinfo *ti;
- FILE *lfp;
- char str[MAXPGPATH];
- int ch,
- i,
- n;
- bool was_backslash;
-
- /*
- * See if tablespace_map file is present
- */
- lfp = AllocateFile(TABLESPACE_MAP, "r");
- if (!lfp)
- {
- if (errno != ENOENT)
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not read file \"%s\": %m",
- TABLESPACE_MAP)));
- return false; /* it's not there, all is fine */
- }
-
- /*
- * Read and parse the link name and path lines from tablespace_map file
- * (this code is pretty crude, but we are not expecting any variability in
- * the file format). De-escape any backslashes that were inserted.
- */
- i = 0;
- was_backslash = false;
- while ((ch = fgetc(lfp)) != EOF)
- {
- if (!was_backslash && (ch == '\n' || ch == '\r'))
- {
- if (i == 0)
- continue; /* \r immediately followed by \n */
-
- /*
- * The de-escaped line should contain an OID followed by exactly
- * one space followed by a path. The path might start with
- * spaces, so don't be too liberal about parsing.
- */
- str[i] = '\0';
- n = 0;
- while (str[n] && str[n] != ' ')
- n++;
- if (n < 1 || n >= i - 1)
- ereport(FATAL,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
- str[n++] = '\0';
-
- ti = palloc0(sizeof(tablespaceinfo));
- ti->oid = pstrdup(str);
- ti->path = pstrdup(str + n);
- *tablespaces = lappend(*tablespaces, ti);
-
- i = 0;
- continue;
- }
- else if (!was_backslash && ch == '\\')
- was_backslash = true;
- else
- {
- if (i < sizeof(str) - 1)
- str[i++] = ch;
- was_backslash = false;
- }
- }
-
- if (i != 0 || was_backslash) /* last line not terminated? */
- ereport(FATAL,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
-
- if (ferror(lfp) || FreeFile(lfp))
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not read file \"%s\": %m",
- TABLESPACE_MAP)));
+XLogRecPtr
+GetXLogWriteRecPtr(void)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ LogwrtResult = XLogCtl->LogwrtResult;
+ SpinLockRelease(&XLogCtl->info_lck);
- return true;
+ return LogwrtResult.Write;
}
/*
- * Error context callback for errors occurring during rm_redo().
+ * Returns the redo pointer of the last checkpoint or restartpoint. This is
+ * the oldest point in WAL that we still need, if we have to restart recovery.
*/
-static void
-rm_redo_error_callback(void *arg)
+void
+GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
- XLogReaderState *record = (XLogReaderState *) arg;
- StringInfoData buf;
-
- initStringInfo(&buf);
- xlog_outdesc(&buf, record);
- xlog_block_info(&buf, record);
-
- /* translator: %s is a WAL record description */
- errcontext("WAL redo at %X/%X for %s",
- LSN_FORMAT_ARGS(record->ReadRecPtr),
- buf.data);
-
- pfree(buf.data);
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ *oldrecptr = ControlFile->checkPointCopy.redo;
+ *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
+ LWLockRelease(ControlFileLock);
}
/*
}
}
-/*
- * Read the XLOG page containing RecPtr into readBuf (if not read already).
- * Returns number of bytes read, if the page is read successfully, or -1
- * in case of errors. When errors occur, they are ereport'ed, but only
- * if they have not been previously reported.
- *
- * This is responsible for restoring files from archive as needed, as well
- * as for waiting for the requested WAL record to arrive in standby mode.
- *
- * 'emode' specifies the log level used for reporting "file not found" or
- * "end of WAL" situations in archive recovery, or in standby mode when a
- * trigger file is found. If set to WARNING or below, XLogPageRead() returns
- * false in those situations, on higher log levels the ereport() won't
- * return.
- *
- * In standby mode, if after a successful return of XLogPageRead() the
- * caller finds the record it's interested in to be broken, it should
- * ereport the error with the level determined by
- * emode_for_corrupt_record(), and then set lastSourceFailed
- * and call XLogPageRead() again with the same arguments. This lets
- * XLogPageRead() to try fetching the record from another source, or to
- * sleep and retry.
- */
-static int
-XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
- XLogRecPtr targetRecPtr, char *readBuf)
-{
- XLogPageReadPrivate *private =
- (XLogPageReadPrivate *) xlogreader->private_data;
- int emode = private->emode;
- uint32 targetPageOff;
- XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
- int r;
-
- XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
- targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
-
- /*
- * See if we need to switch to a new segment because the requested record
- * is not in the currently open one.
- */
- if (readFile >= 0 &&
- !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
- {
- /*
- * Request a restartpoint if we've replayed too much xlog since the
- * last one.
- */
- if (ArchiveRecoveryRequested && IsUnderPostmaster)
- {
- if (XLogCheckpointNeeded(readSegNo))
- {
- (void) GetRedoRecPtr();
- if (XLogCheckpointNeeded(readSegNo))
- RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
- }
- }
-
- close(readFile);
- readFile = -1;
- readSource = XLOG_FROM_ANY;
- }
-
- XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
-
-retry:
- /* See if we need to retrieve more data */
- if (readFile < 0 ||
- (readSource == XLOG_FROM_STREAM &&
- flushedUpto < targetPagePtr + reqLen))
- {
- if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
- private->randAccess,
- private->fetching_ckpt,
- targetRecPtr,
- private->replayTLI,
- xlogreader->EndRecPtr))
- {
- if (readFile >= 0)
- close(readFile);
- readFile = -1;
- readLen = 0;
- readSource = XLOG_FROM_ANY;
-
- return -1;
- }
- }
-
- /*
- * At this point, we have the right segment open and if we're streaming we
- * know the requested record is in it.
- */
- Assert(readFile != -1);
-
- /*
- * If the current segment is being streamed from the primary, calculate
- * how much of the current page we have received already. We know the
- * requested record has been received, but this is for the benefit of
- * future calls, to allow quick exit at the top of this function.
- */
- if (readSource == XLOG_FROM_STREAM)
- {
- if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
- readLen = XLOG_BLCKSZ;
- else
- readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
- targetPageOff;
- }
- else
- readLen = XLOG_BLCKSZ;
-
- /* Read the requested page */
- readOff = targetPageOff;
-
- pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
- r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
- if (r != XLOG_BLCKSZ)
- {
- char fname[MAXFNAMELEN];
- int save_errno = errno;
-
- pgstat_report_wait_end();
- XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
- if (r < 0)
- {
- errno = save_errno;
- ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
- (errcode_for_file_access(),
- errmsg("could not read from log segment %s, offset %u: %m",
- fname, readOff)));
- }
- else
- ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("could not read from log segment %s, offset %u: read %d of %zu",
- fname, readOff, r, (Size) XLOG_BLCKSZ)));
- goto next_record_is_invalid;
- }
- pgstat_report_wait_end();
-
- Assert(targetSegNo == readSegNo);
- Assert(targetPageOff == readOff);
- Assert(reqLen <= readLen);
-
- xlogreader->seg.ws_tli = curFileTLI;
-
- /*
- * Check the page header immediately, so that we can retry immediately if
- * it's not valid. This may seem unnecessary, because ReadPageInternal()
- * validates the page header anyway, and would propagate the failure up to
- * ReadRecord(), which would retry. However, there's a corner case with
- * continuation records, if a record is split across two pages such that
- * we would need to read the two pages from different sources. For
- * example, imagine a scenario where a streaming replica is started up,
- * and replay reaches a record that's split across two WAL segments. The
- * first page is only available locally, in pg_wal, because it's already
- * been recycled on the primary. The second page, however, is not present
- * in pg_wal, and we should stream it from the primary. There is a
- * recycled WAL segment present in pg_wal, with garbage contents, however.
- * We would read the first page from the local WAL segment, but when
- * reading the second page, we would read the bogus, recycled, WAL
- * segment. If we didn't catch that case here, we would never recover,
- * because ReadRecord() would retry reading the whole record from the
- * beginning.
- *
- * Of course, this only catches errors in the page header, which is what
- * happens in the case of a recycled WAL segment. Other kinds of errors or
- * corruption still has the same problem. But this at least fixes the
- * common case, which can happen as part of normal operation.
- *
- * Validating the page header is cheap enough that doing it twice
- * shouldn't be a big deal from a performance point of view.
- *
- * When not in standby mode, an invalid page header should cause recovery
- * to end, not retry reading the page, so we don't need to validate the
- * page header here for the retry. Instead, ReadPageInternal() is
- * responsible for the validation.
- */
- if (StandbyMode &&
- !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
- {
- /*
- * Emit this error right now then retry this page immediately. Use
- * errmsg_internal() because the message was already translated.
- */
- if (xlogreader->errormsg_buf[0])
- ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
- (errmsg_internal("%s", xlogreader->errormsg_buf)));
-
- /* reset any error XLogReaderValidatePageHeader() might have set */
- xlogreader->errormsg_buf[0] = '\0';
- goto next_record_is_invalid;
- }
-
- return readLen;
-
-next_record_is_invalid:
- lastSourceFailed = true;
-
- if (readFile >= 0)
- close(readFile);
- readFile = -1;
- readLen = 0;
- readSource = XLOG_FROM_ANY;
-
- /* In standby-mode, keep trying */
- if (StandbyMode)
- goto retry;
- else
- return -1;
-}
-
-/*
- * Open the WAL segment containing WAL location 'RecPtr'.
- *
- * The segment can be fetched via restore_command, or via walreceiver having
- * streamed the record, or it can already be present in pg_wal. Checking
- * pg_wal is mainly for crash recovery, but it will be polled in standby mode
- * too, in case someone copies a new segment directly to pg_wal. That is not
- * documented or recommended, though.
- *
- * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
- * prepare to read WAL starting from RedoStartLSN after this.
- *
- * 'RecPtr' might not point to the beginning of the record we're interested
- * in, it might also point to the page or segment header. In that case,
- * 'tliRecPtr' is the position of the WAL record we're interested in. It is
- * used to decide which timeline to stream the requested WAL from.
- *
- * 'replayLSN' is the current replay LSN, so that if we scan for new
- * timelines, we can reject a switch to a timeline that branched off before
- * this point.
- *
- * If the record is not immediately available, the function returns false
- * if we're not in standby mode. In standby mode, waits for it to become
- * available.
- *
- * When the requested record becomes available, the function opens the file
- * containing it (if not open already), and returns true. When end of standby
- * mode is triggered by the user, and there is no more WAL available, returns
- * false.
- */
-static bool
-WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
- bool fetching_ckpt, XLogRecPtr tliRecPtr,
- TimeLineID replayTLI, XLogRecPtr replayLSN)
-{
- static TimestampTz last_fail_time = 0;
- TimestampTz now;
- bool streaming_reply_sent = false;
-
- /*-------
- * Standby mode is implemented by a state machine:
- *
- * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
- * pg_wal (XLOG_FROM_PG_WAL)
- * 2. Check trigger file
- * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
- * 4. Rescan timelines
- * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
- *
- * Failure to read from the current source advances the state machine to
- * the next state.
- *
- * 'currentSource' indicates the current state. There are no currentSource
- * values for "check trigger", "rescan timelines", and "sleep" states,
- * those actions are taken when reading from the previous source fails, as
- * part of advancing to the next state.
- *
- * If standby mode is turned off while reading WAL from stream, we move
- * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
- * the files (which would be required at end of recovery, e.g., timeline
- * history file) from archive or pg_wal. We don't need to kill WAL receiver
- * here because it's already stopped when standby mode is turned off at
- * the end of recovery.
- *-------
- */
- if (!InArchiveRecovery)
- currentSource = XLOG_FROM_PG_WAL;
- else if (currentSource == XLOG_FROM_ANY ||
- (!StandbyMode && currentSource == XLOG_FROM_STREAM))
- {
- lastSourceFailed = false;
- currentSource = XLOG_FROM_ARCHIVE;
- }
-
- for (;;)
- {
- XLogSource oldSource = currentSource;
- bool startWalReceiver = false;
-
- /*
- * First check if we failed to read from the current source, and
- * advance the state machine if so. The failure to read might've
- * happened outside this function, e.g when a CRC check fails on a
- * record, or within this loop.
- */
- if (lastSourceFailed)
- {
- switch (currentSource)
- {
- case XLOG_FROM_ARCHIVE:
- case XLOG_FROM_PG_WAL:
-
- /*
- * Check to see if the trigger file exists. Note that we
- * do this only after failure, so when you create the
- * trigger file, we still finish replaying as much as we
- * can from archive and pg_wal before failover.
- */
- if (StandbyMode && CheckForStandbyTrigger())
- {
- XLogShutdownWalRcv();
- return false;
- }
-
- /*
- * Not in standby mode, and we've now tried the archive
- * and pg_wal.
- */
- if (!StandbyMode)
- return false;
-
- /*
- * Move to XLOG_FROM_STREAM state, and set to start a
- * walreceiver if necessary.
- */
- currentSource = XLOG_FROM_STREAM;
- startWalReceiver = true;
- break;
-
- case XLOG_FROM_STREAM:
-
- /*
- * Failure while streaming. Most likely, we got here
- * because streaming replication was terminated, or
- * promotion was triggered. But we also get here if we
- * find an invalid record in the WAL streamed from the
- * primary, in which case something is seriously wrong.
- * There's little chance that the problem will just go
- * away, but PANIC is not good for availability either,
- * especially in hot standby mode. So, we treat that the
- * same as disconnection, and retry from archive/pg_wal
- * again. The WAL in the archive should be identical to
- * what was streamed, so it's unlikely that it helps, but
- * one can hope...
- */
-
- /*
- * We should be able to move to XLOG_FROM_STREAM only in
- * standby mode.
- */
- Assert(StandbyMode);
-
- /*
- * Before we leave XLOG_FROM_STREAM state, make sure that
- * walreceiver is not active, so that it won't overwrite
- * WAL that we restore from archive.
- */
- if (WalRcvStreaming())
- XLogShutdownWalRcv();
-
- /*
- * Before we sleep, re-scan for possible new timelines if
- * we were requested to recover to the latest timeline.
- */
- if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
- {
- if (rescanLatestTimeLine(replayTLI, replayLSN))
- {
- currentSource = XLOG_FROM_ARCHIVE;
- break;
- }
- }
-
- /*
- * XLOG_FROM_STREAM is the last state in our state
- * machine, so we've exhausted all the options for
- * obtaining the requested WAL. We're going to loop back
- * and retry from the archive, but if it hasn't been long
- * since last attempt, sleep wal_retrieve_retry_interval
- * milliseconds to avoid busy-waiting.
- */
- now = GetCurrentTimestamp();
- if (!TimestampDifferenceExceeds(last_fail_time, now,
- wal_retrieve_retry_interval))
- {
- long wait_time;
-
- wait_time = wal_retrieve_retry_interval -
- TimestampDifferenceMilliseconds(last_fail_time, now);
-
- (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
- WL_LATCH_SET | WL_TIMEOUT |
- WL_EXIT_ON_PM_DEATH,
- wait_time,
- WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
- ResetLatch(&XLogCtl->recoveryWakeupLatch);
- now = GetCurrentTimestamp();
-
- /* Handle interrupt signals of startup process */
- HandleStartupProcInterrupts();
- }
- last_fail_time = now;
- currentSource = XLOG_FROM_ARCHIVE;
- break;
-
- default:
- elog(ERROR, "unexpected WAL source %d", currentSource);
- }
- }
- else if (currentSource == XLOG_FROM_PG_WAL)
- {
- /*
- * We just successfully read a file in pg_wal. We prefer files in
- * the archive over ones in pg_wal, so try the next file again
- * from the archive first.
- */
- if (InArchiveRecovery)
- currentSource = XLOG_FROM_ARCHIVE;
- }
-
- if (currentSource != oldSource)
- elog(DEBUG2, "switched WAL source from %s to %s after %s",
- xlogSourceNames[oldSource], xlogSourceNames[currentSource],
- lastSourceFailed ? "failure" : "success");
-
- /*
- * We've now handled possible failure. Try to read from the chosen
- * source.
- */
- lastSourceFailed = false;
-
- switch (currentSource)
- {
- case XLOG_FROM_ARCHIVE:
- case XLOG_FROM_PG_WAL:
-
- /*
- * WAL receiver must not be running when reading WAL from
- * archive or pg_wal.
- */
- Assert(!WalRcvStreaming());
-
- /* Close any old file we might have open. */
- if (readFile >= 0)
- {
- close(readFile);
- readFile = -1;
- }
- /* Reset curFileTLI if random fetch. */
- if (randAccess)
- curFileTLI = 0;
-
- /*
- * Try to restore the file from archive, or read an existing
- * file from pg_wal.
- */
- readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
- currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
- currentSource);
- if (readFile >= 0)
- return true; /* success! */
-
- /*
- * Nope, not found in archive or pg_wal.
- */
- lastSourceFailed = true;
- break;
-
- case XLOG_FROM_STREAM:
- {
- bool havedata;
-
- /*
- * We should be able to move to XLOG_FROM_STREAM only in
- * standby mode.
- */
- Assert(StandbyMode);
-
- /*
- * First, shutdown walreceiver if its restart has been
- * requested -- but no point if we're already slated for
- * starting it.
- */
- if (pendingWalRcvRestart && !startWalReceiver)
- {
- XLogShutdownWalRcv();
-
- /*
- * Re-scan for possible new timelines if we were
- * requested to recover to the latest timeline.
- */
- if (recoveryTargetTimeLineGoal ==
- RECOVERY_TARGET_TIMELINE_LATEST)
- rescanLatestTimeLine(replayTLI, replayLSN);
-
- startWalReceiver = true;
- }
- pendingWalRcvRestart = false;
-
- /*
- * Launch walreceiver if needed.
- *
- * If fetching_ckpt is true, RecPtr points to the initial
- * checkpoint location. In that case, we use RedoStartLSN
- * as the streaming start position instead of RecPtr, so
- * that when we later jump backwards to start redo at
- * RedoStartLSN, we will have the logs streamed already.
- */
- if (startWalReceiver &&
- PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
- {
- XLogRecPtr ptr;
- TimeLineID tli;
-
- if (fetching_ckpt)
- {
- ptr = RedoStartLSN;
- tli = ControlFile->checkPointCopy.ThisTimeLineID;
- }
- else
- {
- ptr = RecPtr;
-
- /*
- * Use the record begin position to determine the
- * TLI, rather than the position we're reading.
- */
- tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
-
- if (curFileTLI > 0 && tli < curFileTLI)
- elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
- LSN_FORMAT_ARGS(tliRecPtr),
- tli, curFileTLI);
- }
- curFileTLI = tli;
- LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- XLogCtl->InstallXLogFileSegmentActive = true;
- LWLockRelease(ControlFileLock);
- RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
- PrimarySlotName,
- wal_receiver_create_temp_slot);
- flushedUpto = 0;
- }
-
- /*
- * Check if WAL receiver is active or wait to start up.
- */
- if (!WalRcvStreaming())
- {
- lastSourceFailed = true;
- break;
- }
-
- /*
- * Walreceiver is active, so see if new data has arrived.
- *
- * We only advance XLogReceiptTime when we obtain fresh
- * WAL from walreceiver and observe that we had already
- * processed everything before the most recent "chunk"
- * that it flushed to disk. In steady state where we are
- * keeping up with the incoming data, XLogReceiptTime will
- * be updated on each cycle. When we are behind,
- * XLogReceiptTime will not advance, so the grace time
- * allotted to conflicting queries will decrease.
- */
- if (RecPtr < flushedUpto)
- havedata = true;
- else
- {
- XLogRecPtr latestChunkStart;
-
- flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
- if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
- {
- havedata = true;
- if (latestChunkStart <= RecPtr)
- {
- XLogReceiptTime = GetCurrentTimestamp();
- SetCurrentChunkStartTime(XLogReceiptTime);
- }
- }
- else
- havedata = false;
- }
- if (havedata)
- {
- /*
- * Great, streamed far enough. Open the file if it's
- * not open already. Also read the timeline history
- * file if we haven't initialized timeline history
- * yet; it should be streamed over and present in
- * pg_wal by now. Use XLOG_FROM_STREAM so that source
- * info is set correctly and XLogReceiptTime isn't
- * changed.
- *
- * NB: We must set readTimeLineHistory based on
- * recoveryTargetTLI, not receiveTLI. Normally they'll
- * be the same, but if recovery_target_timeline is
- * 'latest' and archiving is configured, then it's
- * possible that we managed to retrieve one or more
- * new timeline history files from the archive,
- * updating recoveryTargetTLI.
- */
- if (readFile < 0)
- {
- if (!expectedTLEs)
- expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
- readFile = XLogFileRead(readSegNo, PANIC,
- receiveTLI,
- XLOG_FROM_STREAM, false);
- Assert(readFile >= 0);
- }
- else
- {
- /* just make sure source info is correct... */
- readSource = XLOG_FROM_STREAM;
- XLogReceiptSource = XLOG_FROM_STREAM;
- return true;
- }
- break;
- }
-
- /*
- * Data not here yet. Check for trigger, then wait for
- * walreceiver to wake us up when new WAL arrives.
- */
- if (CheckForStandbyTrigger())
- {
- /*
- * Note that we don't "return false" immediately here.
- * After being triggered, we still want to replay all
- * the WAL that was already streamed. It's in pg_wal
- * now, so we just treat this as a failure, and the
- * state machine will move on to replay the streamed
- * WAL from pg_wal, and then recheck the trigger and
- * exit replay.
- */
- lastSourceFailed = true;
- break;
- }
-
- /*
- * Since we have replayed everything we have received so
- * far and are about to start waiting for more WAL, let's
- * tell the upstream server our replay location now so
- * that pg_stat_replication doesn't show stale
- * information.
- */
- if (!streaming_reply_sent)
- {
- WalRcvForceReply();
- streaming_reply_sent = true;
- }
-
- /*
- * Wait for more WAL to arrive. Time out after 5 seconds
- * to react to a trigger file promptly and to check if the
- * WAL receiver is still active.
- */
- (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
- WL_LATCH_SET | WL_TIMEOUT |
- WL_EXIT_ON_PM_DEATH,
- 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
- ResetLatch(&XLogCtl->recoveryWakeupLatch);
- break;
- }
-
- default:
- elog(ERROR, "unexpected WAL source %d", currentSource);
- }
-
- /*
- * Check for recovery pause here so that we can confirm more quickly
- * that a requested pause has actually taken effect.
- */
- if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
- RECOVERY_NOT_PAUSED)
- recoveryPausesHere(false);
-
- /*
- * This possibly-long loop needs to handle interrupts of startup
- * process.
- */
- HandleStartupProcInterrupts();
- }
-
- return false; /* not reached */
-}
-
-/*
- * Set flag to signal the walreceiver to restart. (The startup process calls
- * this on noticing a relevant configuration change.)
- */
-void
-StartupRequestWalReceiverRestart(void)
-{
- if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
- {
- ereport(LOG,
- (errmsg("WAL receiver process shutdown requested")));
-
- pendingWalRcvRestart = true;
- }
-}
-
/* Thin wrapper around ShutdownWalRcv(). */
-static void
+void
XLogShutdownWalRcv(void)
{
ShutdownWalRcv();
LWLockRelease(ControlFileLock);
}
-/*
- * Determine what log level should be used to report a corrupt WAL record
- * in the current WAL page, previously read by XLogPageRead().
- *
- * 'emode' is the error mode that would be used to report a file-not-found
- * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
- * we're retrying the exact same record that we've tried previously, only
- * complain the first time to keep the noise down. However, we only do when
- * reading from pg_wal, because we don't expect any invalid records in archive
- * or in records streamed from the primary. Files in the archive should be complete,
- * and we should never hit the end of WAL because we stop and wait for more WAL
- * to arrive before replaying it.
- *
- * NOTE: This function remembers the RecPtr value it was last called with,
- * to suppress repeated messages about the same record. Only call this when
- * you are about to ereport(), or you might cause a later message to be
- * erroneously suppressed.
- */
-static int
-emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
-{
- static XLogRecPtr lastComplaint = 0;
-
- if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
- {
- if (RecPtr == lastComplaint)
- emode = DEBUG1;
- else
- lastComplaint = RecPtr;
- }
- return emode;
-}
-
-/*
- * Has a standby promotion already been triggered?
- *
- * Unlike CheckForStandbyTrigger(), this works in any process
- * that's connected to shared memory.
- */
-bool
-PromoteIsTriggered(void)
-{
- /*
- * We check shared state each time only until a standby promotion is
- * triggered. We can't trigger a promotion again, so there's no need to
- * keep checking after the shared variable has once been seen true.
- */
- if (LocalPromoteIsTriggered)
- return true;
-
- SpinLockAcquire(&XLogCtl->info_lck);
- LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
- SpinLockRelease(&XLogCtl->info_lck);
-
- return LocalPromoteIsTriggered;
-}
-
-static void
-SetPromoteIsTriggered(void)
-{
- SpinLockAcquire(&XLogCtl->info_lck);
- XLogCtl->SharedPromoteIsTriggered = true;
- SpinLockRelease(&XLogCtl->info_lck);
-
- /*
- * Mark the recovery pause state as 'not paused' because the paused state
- * ends and promotion continues if a promotion is triggered while recovery
- * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
- * return 'paused' while a promotion is ongoing.
- */
- SetRecoveryPause(false);
-
- LocalPromoteIsTriggered = true;
-}
-
-/*
- * Check to see whether the user-specified trigger file exists and whether a
- * promote request has arrived. If either condition holds, return true.
- */
-static bool
-CheckForStandbyTrigger(void)
-{
- struct stat stat_buf;
-
- if (LocalPromoteIsTriggered)
- return true;
-
- if (IsPromoteSignaled() && CheckPromoteSignal())
- {
- ereport(LOG, (errmsg("received promote request")));
- RemovePromoteSignalFiles();
- ResetPromoteSignaled();
- SetPromoteIsTriggered();
- return true;
- }
-
- if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
- return false;
-
- if (stat(PromoteTriggerFile, &stat_buf) == 0)
- {
- ereport(LOG,
- (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
- unlink(PromoteTriggerFile);
- SetPromoteIsTriggered();
- return true;
- }
- else if (errno != ENOENT)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not stat promote trigger file \"%s\": %m",
- PromoteTriggerFile)));
-
- return false;
-}
-
-/*
- * Remove the files signaling a standby promotion request.
- */
+/* Enable WAL file recycling and preallocation. */
void
-RemovePromoteSignalFiles(void)
+SetInstallXLogFileSegmentActive(void)
{
- unlink(PROMOTE_SIGNAL_FILE);
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ XLogCtl->InstallXLogFileSegmentActive = true;
+ LWLockRelease(ControlFileLock);
}
-/*
- * Check to see if a promote request has arrived.
- */
bool
-CheckPromoteSignal(void)
+IsInstallXLogFileSegmentActive(void)
{
- struct stat stat_buf;
-
- if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
- return true;
+ bool result;
- return false;
-}
+ LWLockAcquire(ControlFileLock, LW_SHARED);
+ result = XLogCtl->InstallXLogFileSegmentActive;
+ LWLockRelease(ControlFileLock);
-/*
- * Wake up startup process to replay newly arrived WAL, or to notice that
- * failover has been requested.
- */
-void
-WakeupRecovery(void)
-{
- SetLatch(&XLogCtl->recoveryWakeupLatch);
+ return result;
}
/*
XLogCtl->WalWriterSleeping = sleeping;
SpinLockRelease(&XLogCtl->info_lck);
}
-
-/*
- * Schedule a walreceiver wakeup in the main recovery loop.
- */
-void
-XLogRequestWalReceiverReply(void)
-{
- doRequestWalReceiverReply = true;
-}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * xlogrecovery.c
+ * Functions for WAL recovery, standby mode
+ *
+ * This source file contains functions controlling WAL recovery.
+ * InitWalRecovery() initializes the system for crash or archive recovery,
+ * or standby mode, depending on configuration options and the state of
+ * the control file and possible backup label file. PerformWalRecovery()
+ * performs the actual WAL replay, calling the rmgr-specific redo routines.
+ * EndWalRecovery() performs end-of-recovery checks and cleanup actions,
+ * and prepares information needed to initialize the WAL for writes. In
+ * addition to these three main functions, there are a bunch of functions
+ * for interrogating recovery state and controlling the recovery process.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogrecovery.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_control.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/startup.h"
+#include "replication/basebackup.h"
+#include "replication/walreceiver.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/pg_rusage.h"
+
+/* Unsupported old recovery command file names (relative to $PGDATA) */
+#define RECOVERY_COMMAND_FILE "recovery.conf"
+#define RECOVERY_COMMAND_DONE "recovery.done"
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry recovery_target_action_options[] = {
+ {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
+ {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
+ {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
+ {NULL, 0, false}
+};
+
+/* options formerly taken from recovery.conf for archive recovery */
+char *recoveryRestoreCommand = NULL;
+char *recoveryEndCommand = NULL;
+char *archiveCleanupCommand = NULL;
+RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
+bool recoveryTargetInclusive = true;
+int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
+TransactionId recoveryTargetXid;
+char *recovery_target_time_string;
+TimestampTz recoveryTargetTime;
+const char *recoveryTargetName;
+XLogRecPtr recoveryTargetLSN;
+int recovery_min_apply_delay = 0;
+
+/* options formerly taken from recovery.conf for XLOG streaming */
+char *PrimaryConnInfo = NULL;
+char *PrimarySlotName = NULL;
+char *PromoteTriggerFile = NULL;
+bool wal_receiver_create_temp_slot = false;
+
+/*
+ * recoveryTargetTimeLineGoal: what the user requested, if any
+ *
+ * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
+ *
+ * recoveryTargetTLI: the currently understood target timeline; changes
+ *
+ * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
+ * the timelines of its known parents, newest first (so recoveryTargetTLI is
+ * always the first list member). Only these TLIs are expected to be seen in
+ * the WAL segments we read, and indeed only these TLIs will be considered as
+ * candidate WAL files to open at all.
+ *
+ * curFileTLI: the TLI appearing in the name of the current input WAL file.
+ * (This is not necessarily the same as the timeline from which we are
+ * replaying WAL, which StartupXLOG calls replayTLI, because we could be
+ * scanning data that was copied from an ancestor timeline when the current
+ * file was created.) During a sequential scan we do not allow this value
+ * to decrease.
+ */
+RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
+TimeLineID recoveryTargetTLIRequested = 0;
+TimeLineID recoveryTargetTLI = 0;
+static List *expectedTLEs;
+static TimeLineID curFileTLI;
+
+/*
+ * When ArchiveRecoveryRequested is set, archive recovery was requested,
+ * ie. signal files were present. When InArchiveRecovery is set, we are
+ * currently recovering using offline XLOG archives. These variables are only
+ * valid in the startup process.
+ *
+ * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
+ * currently performing crash recovery using only XLOG files in pg_wal, but
+ * will switch to using offline XLOG archives as soon as we reach the end of
+ * WAL in pg_wal.
+*/
+bool ArchiveRecoveryRequested = false;
+bool InArchiveRecovery = false;
+
+/*
+ * When StandbyModeRequested is set, standby mode was requested, i.e.
+ * standby.signal file was present. When StandbyMode is set, we are currently
+ * in standby mode. These variables are only valid in the startup process.
+ * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
+ */
+static bool StandbyModeRequested = false;
+bool StandbyMode = false;
+
+/* was a signal file present at startup? */
+static bool standby_signal_file_found = false;
+static bool recovery_signal_file_found = false;
+
+/*
+ * CheckPointLoc is the position of the checkpoint record that determines
+ * where to start the replay. It comes from the backup label file or the
+ * control file.
+ *
+ * RedoStartLSN is the checkpoint's REDO location, also from the backup label
+ * file or the control file. In standby mode, XLOG streaming usually starts
+ * from the position where an invalid record was found. But if we fail to
+ * read even the initial checkpoint record, we use the REDO location instead
+ * of the checkpoint location as the start position of XLOG streaming.
+ * Otherwise we would have to jump backwards to the REDO location after
+ * reading the checkpoint record, because the REDO record can precede the
+ * checkpoint record.
+ */
+static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
+static TimeLineID CheckPointTLI = 0;
+static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
+static TimeLineID RedoStartTLI = 0;
+
+/*
+ * Local copy of SharedHotStandbyActive variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalHotStandbyActive = false;
+
+/*
+ * Local copy of SharedPromoteIsTriggered variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalPromoteIsTriggered = false;
+
+/* Has the recovery code requested a walreceiver wakeup? */
+static bool doRequestWalReceiverReply;
+
+/* XLogReader object used to parse the WAL records */
+static XLogReaderState *xlogreader = NULL;
+
+/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
+typedef struct XLogPageReadPrivate
+{
+ int emode;
+ bool fetching_ckpt; /* are we fetching a checkpoint record? */
+ bool randAccess;
+ TimeLineID replayTLI;
+} XLogPageReadPrivate;
+
+/* flag to tell XLogPageRead that we have started replaying */
+static bool InRedo = false;
+
+/*
+ * Codes indicating where we got a WAL file from during recovery, or where
+ * to attempt to get one.
+ */
+typedef enum
+{
+ XLOG_FROM_ANY = 0, /* request to read WAL from any source */
+ XLOG_FROM_ARCHIVE, /* restored using restore_command */
+ XLOG_FROM_PG_WAL, /* existing file in pg_wal */
+ XLOG_FROM_STREAM /* streamed from primary */
+} XLogSource;
+
+/* human-readable names for XLogSources, for debugging output */
+static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
+
+/*
+ * readFile is -1 or a kernel FD for the log file segment that's currently
+ * open for reading. readSegNo identifies the segment. readOff is the offset
+ * of the page just read, readLen indicates how much of it has been read into
+ * readBuf, and readSource indicates where we got the currently open file from.
+ *
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
+ * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
+ * worthwhile, since the XLOG is not read by general-purpose sessions.
+ */
+static int readFile = -1;
+static XLogSegNo readSegNo = 0;
+static uint32 readOff = 0;
+static uint32 readLen = 0;
+static XLogSource readSource = XLOG_FROM_ANY;
+
+/*
+ * Keeps track of which source we're currently reading from. This is
+ * different from readSource in that this is always set, even when we don't
+ * currently have a WAL file open. If lastSourceFailed is set, our last
+ * attempt to read from currentSource failed, and we should try another source
+ * next.
+ *
+ * pendingWalRcvRestart is set when a config change occurs that requires a
+ * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
+ */
+static XLogSource currentSource = XLOG_FROM_ANY;
+static bool lastSourceFailed = false;
+static bool pendingWalRcvRestart = false;
+
+/*
+ * These variables track when we last obtained some WAL data to process,
+ * and where we got it from. (XLogReceiptSource is initially the same as
+ * readSource, but readSource gets reset to zero when we don't have data
+ * to process right now. It is also different from currentSource, which
+ * also changes when we try to read from a source and fail, while
+ * XLogReceiptSource tracks where we last successfully read some WAL.)
+ */
+static TimestampTz XLogReceiptTime = 0;
+static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
+
+/* Local copy of WalRcv->flushedUpto */
+static XLogRecPtr flushedUpto = 0;
+static TimeLineID receiveTLI = 0;
+
+/*
+ * Copy of minRecoveryPoint and backupEndPoint from the control file.
+ *
+ * In order to reach consistency, we must replay the WAL up to
+ * minRecoveryPoint. If backupEndRequired is true, we must also reach
+ * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
+ * to backupStartPoint.
+ *
+ * Note: In archive recovery, after consistency has been reached, the
+ * functions in xlog.c will start updating minRecoveryPoint in the control
+ * file. But this copy of minRecoveryPoint variable reflects the value at the
+ * beginning of recovery, and is *not* updated after consistency is reached.
+ */
+static XLogRecPtr minRecoveryPoint;
+static TimeLineID minRecoveryPointTLI;
+
+static XLogRecPtr backupStartPoint;
+static XLogRecPtr backupEndPoint;
+static bool backupEndRequired = false;
+
+/*
+ * Have we reached a consistent database state? In crash recovery, we have
+ * to replay all the WAL, so reachedConsistency is never set. During archive
+ * recovery, the database is consistent once minRecoveryPoint is reached.
+ *
+ * Consistent state means that the system is internally consistent, all
+ * the WAL has been replayed up to a certain point, and importantly, there
+ * is no trace of later actions on disk.
+ */
+bool reachedConsistency = false;
+
+/* Buffers dedicated to consistency checks of size BLCKSZ */
+static char *replay_image_masked = NULL;
+static char *primary_image_masked = NULL;
+
+
+/*
+ * Shared-memory state for WAL recovery.
+ */
+typedef struct XLogRecoveryCtlData
+{
+ /*
+ * SharedHotStandbyActive indicates if we allow hot standby queries to be
+ * run. Protected by info_lck.
+ */
+ bool SharedHotStandbyActive;
+
+ /*
+ * SharedPromoteIsTriggered indicates if a standby promotion has been
+ * triggered. Protected by info_lck.
+ */
+ bool SharedPromoteIsTriggered;
+
+ /*
+ * recoveryWakeupLatch is used to wake up the startup process to continue
+ * WAL replay, if it is waiting for WAL to arrive or failover trigger file
+ * to appear.
+ *
+ * Note that the startup process also uses another latch, its procLatch,
+ * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
+ * signaling the startup process in favor of using its procLatch, which
+ * comports better with possible generic signal handlers using that latch.
+ * But we should not do that because the startup process doesn't assume
+ * that it's waken up by walreceiver process or SIGHUP signal handler
+ * while it's waiting for recovery conflict. The separate latches,
+ * recoveryWakeupLatch and procLatch, should be used for inter-process
+ * communication for WAL replay and recovery conflict, respectively.
+ */
+ Latch recoveryWakeupLatch;
+
+ /*
+ * Last record successfully replayed.
+ */
+ XLogRecPtr lastReplayedReadRecPtr; /* start position */
+ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
+ TimeLineID lastReplayedTLI; /* timeline */
+
+ /*
+ * When we're currently replaying a record, ie. in a redo function,
+ * replayEndRecPtr points to the end+1 of the record being replayed,
+ * otherwise it's equal to lastReplayedEndRecPtr.
+ */
+ XLogRecPtr replayEndRecPtr;
+ TimeLineID replayEndTLI;
+ /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
+ TimestampTz recoveryLastXTime;
+
+ /*
+ * timestamp of when we started replaying the current chunk of WAL data,
+ * only relevant for replication or archive recovery
+ */
+ TimestampTz currentChunkStartTime;
+ /* Recovery pause state */
+ RecoveryPauseState recoveryPauseState;
+ ConditionVariable recoveryNotPausedCV;
+
+ slock_t info_lck; /* locks shared variables shown above */
+} XLogRecoveryCtlData;
+
+static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
+
+/*
+ * abortedRecPtr is the start pointer of a broken record at end of WAL when
+ * recovery completes; missingContrecPtr is the location of the first
+ * contrecord that went missing. See CreateOverwriteContrecordRecord for
+ * details.
+ */
+static XLogRecPtr abortedRecPtr;
+static XLogRecPtr missingContrecPtr;
+
+/*
+ * if recoveryStopsBefore/After returns true, it saves information of the stop
+ * point here
+ */
+static TransactionId recoveryStopXid;
+static TimestampTz recoveryStopTime;
+static XLogRecPtr recoveryStopLSN;
+static char recoveryStopName[MAXFNAMELEN];
+static bool recoveryStopAfter;
+
+/* prototypes for local functions */
+static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
+
+static void readRecoverySignalFile(void);
+static void validateRecoveryParameters(void);
+static bool read_backup_label(XLogRecPtr *checkPointLoc,
+ TimeLineID *backupLabelTLI,
+ bool *backupEndRequired, bool *backupFromStandby);
+static bool read_tablespace_map(List **tablespaces);
+
+static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
+static void CheckRecoveryConsistency(void);
+static void rm_redo_error_callback(void *arg);
+#ifdef WAL_DEBUG
+static void xlog_outrec(StringInfo buf, XLogReaderState *record);
+#endif
+static void xlog_block_info(StringInfo buf, XLogReaderState *record);
+static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
+ TimeLineID prevTLI, TimeLineID replayTLI);
+static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
+static void verifyBackupPageConsistency(XLogReaderState *record);
+
+static bool recoveryStopsBefore(XLogReaderState *record);
+static bool recoveryStopsAfter(XLogReaderState *record);
+static char *getRecoveryStopReason(void);
+static void recoveryPausesHere(bool endOfRecovery);
+static bool recoveryApplyDelay(XLogReaderState *record);
+static void ConfirmRecoveryPaused(void);
+
+static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
+ int emode, bool fetching_ckpt, TimeLineID replayTLI);
+
+static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
+ int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
+static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
+ bool fetching_ckpt,
+ XLogRecPtr tliRecPtr,
+ TimeLineID replayTLI,
+ XLogRecPtr replayLSN);
+static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
+static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
+ int whichChkpt, bool report, TimeLineID replayTLI);
+static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
+static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+ XLogSource source, bool notfoundOk);
+static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
+
+static bool CheckForStandbyTrigger(void);
+static void SetPromoteIsTriggered(void);
+static bool HotStandbyActiveInReplay(void);
+
+static void SetCurrentChunkStartTime(TimestampTz xtime);
+static void SetLatestXTime(TimestampTz xtime);
+
+/*
+ * Initialization of shared memory for WAL recovery
+ */
+Size
+XLogRecoveryShmemSize(void)
+{
+ Size size;
+
+ /* XLogRecoveryCtl */
+ size = sizeof(XLogRecoveryCtlData);
+
+ return size;
+}
+
+void
+XLogRecoveryShmemInit(void)
+{
+ bool found;
+
+ XLogRecoveryCtl = (XLogRecoveryCtlData *)
+ ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
+ if (found)
+ return;
+ memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
+
+ SpinLockInit(&XLogRecoveryCtl->info_lck);
+ InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
+}
+
+/*
+ * Prepare the system for WAL recovery, if needed.
+ *
+ * This is called by StartupXLOG() which coordinates the server startup
+ * sequence. This function analyzes the control file and the backup label
+ * file, if any, and figures out whether we need to perform crash recovery or
+ * archive recovery, and how far we need to replay the WAL to reach a
+ * consistent state.
+ *
+ * This doesn't yet change the on-disk state, except for creating the symlinks
+ * from table space map file if any, and for fetching WAL files needed to find
+ * the checkpoint record. On entry, the caller has already read the control
+ * file into memory, and passes it as argument. This function updates it to
+ * reflect the recovery state, and the caller is expected to write it back to
+ * disk does after initializing other subsystems, but before calling
+ * PerformWalRecovery().
+ *
+ * This initializes some global variables like ArchiveModeRequested, and
+ * StandbyModeRequested and InRecovery.
+ */
+void
+InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
+ bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
+{
+ XLogPageReadPrivate *private;
+ struct stat st;
+ bool wasShutdown;
+ XLogRecord *record;
+ DBState dbstate_at_startup;
+ bool haveTblspcMap = false;
+ bool haveBackupLabel = false;
+ CheckPoint checkPoint;
+ bool backupFromStandby = false;
+
+ dbstate_at_startup = ControlFile->state;
+
+ /*
+ * Initialize on the assumption we want to recover to the latest timeline
+ * that's active according to pg_control.
+ */
+ if (ControlFile->minRecoveryPointTLI >
+ ControlFile->checkPointCopy.ThisTimeLineID)
+ recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
+ else
+ recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+
+ /*
+ * Check for signal files, and if so set up state for offline recovery
+ */
+ readRecoverySignalFile();
+ validateRecoveryParameters();
+
+ if (ArchiveRecoveryRequested)
+ {
+ if (StandbyModeRequested)
+ ereport(LOG,
+ (errmsg("entering standby mode")));
+ else if (recoveryTarget == RECOVERY_TARGET_XID)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to XID %u",
+ recoveryTargetXid)));
+ else if (recoveryTarget == RECOVERY_TARGET_TIME)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to %s",
+ timestamptz_to_str(recoveryTargetTime))));
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to \"%s\"",
+ recoveryTargetName)));
+ else if (recoveryTarget == RECOVERY_TARGET_LSN)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryTargetLSN))));
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ ereport(LOG,
+ (errmsg("starting point-in-time recovery to earliest consistent point")));
+ else
+ ereport(LOG,
+ (errmsg("starting archive recovery")));
+ }
+
+ /*
+ * Take ownership of the wakeup latch if we're going to sleep during
+ * recovery.
+ */
+ if (ArchiveRecoveryRequested)
+ OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+
+ private = palloc0(sizeof(XLogPageReadPrivate));
+ xlogreader =
+ XLogReaderAllocate(wal_segment_size, NULL,
+ XL_ROUTINE(.page_read = &XLogPageRead,
+ .segment_open = NULL,
+ .segment_close = wal_segment_close),
+ private);
+ if (!xlogreader)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while allocating a WAL reading processor.")));
+ xlogreader->system_identifier = ControlFile->system_identifier;
+
+ /*
+ * Allocate two page buffers dedicated to WAL consistency checks. We do
+ * it this way, rather than just making static arrays, for two reasons:
+ * (1) no need to waste the storage in most instantiations of the backend;
+ * (2) a static char array isn't guaranteed to have any particular
+ * alignment, whereas palloc() will provide MAXALIGN'd storage.
+ */
+ replay_image_masked = (char *) palloc(BLCKSZ);
+ primary_image_masked = (char *) palloc(BLCKSZ);
+
+ if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
+ &backupFromStandby))
+ {
+ List *tablespaces = NIL;
+
+ /*
+ * Archive recovery was requested, and thanks to the backup label
+ * file, we know how far we need to replay to reach consistency. Enter
+ * archive recovery directly.
+ */
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ StandbyMode = true;
+
+ /*
+ * When a backup_label file is present, we want to roll forward from
+ * the checkpoint it identifies, rather than using pg_control.
+ */
+ record = ReadCheckpointRecord(xlogreader, CheckPointLoc, 0, true, CheckPointTLI);
+ if (record != NULL)
+ {
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+ ereport(DEBUG1,
+ (errmsg_internal("checkpoint record is at %X/%X",
+ LSN_FORMAT_ARGS(CheckPointLoc))));
+ InRecovery = true; /* force recovery even if SHUTDOWNED */
+
+ /*
+ * Make sure that REDO location exists. This may not be the case
+ * if there was a crash during an online backup, which left a
+ * backup_label around that references a WAL segment that's
+ * already been archived.
+ */
+ if (checkPoint.redo < CheckPointLoc)
+ {
+ XLogBeginRead(xlogreader, checkPoint.redo);
+ if (!ReadRecord(xlogreader, LOG, false,
+ checkPoint.ThisTimeLineID))
+ ereport(FATAL,
+ (errmsg("could not find redo location referenced by checkpoint record"),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir)));
+ }
+ }
+ else
+ {
+ ereport(FATAL,
+ (errmsg("could not locate required checkpoint record"),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir)));
+ wasShutdown = false; /* keep compiler quiet */
+ }
+
+ /* Read the tablespace_map file if present and create symlinks. */
+ if (read_tablespace_map(&tablespaces))
+ {
+ ListCell *lc;
+
+ foreach(lc, tablespaces)
+ {
+ tablespaceinfo *ti = lfirst(lc);
+ char *linkloc;
+
+ linkloc = psprintf("pg_tblspc/%s", ti->oid);
+
+ /*
+ * Remove the existing symlink if any and Create the symlink
+ * under PGDATA.
+ */
+ remove_tablespace_symlink(linkloc);
+
+ if (symlink(ti->path, linkloc) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create symbolic link \"%s\": %m",
+ linkloc)));
+
+ pfree(ti->oid);
+ pfree(ti->path);
+ pfree(ti);
+ }
+
+ /* tell the caller to delete it later */
+ haveTblspcMap = true;
+ }
+
+ /* tell the caller to delete it later */
+ haveBackupLabel = true;
+ }
+ else
+ {
+ /*
+ * If tablespace_map file is present without backup_label file, there
+ * is no use of such file. There is no harm in retaining it, but it
+ * is better to get rid of the map file so that we don't have any
+ * redundant file in data directory and it will avoid any sort of
+ * confusion. It seems prudent though to just rename the file out of
+ * the way rather than delete it completely, also we ignore any error
+ * that occurs in rename operation as even if map file is present
+ * without backup_label file, it is harmless.
+ */
+ if (stat(TABLESPACE_MAP, &st) == 0)
+ {
+ unlink(TABLESPACE_MAP_OLD);
+ if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
+ ereport(LOG,
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("File \"%s\" was renamed to \"%s\".",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ else
+ ereport(LOG,
+ (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+ TABLESPACE_MAP, BACKUP_LABEL_FILE),
+ errdetail("Could not rename file \"%s\" to \"%s\": %m.",
+ TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+ }
+
+ /*
+ * It's possible that archive recovery was requested, but we don't
+ * know how far we need to replay the WAL before we reach consistency.
+ * This can happen for example if a base backup is taken from a
+ * running server using an atomic filesystem snapshot, without calling
+ * pg_start/stop_backup. Or if you just kill a running primary server
+ * and put it into archive recovery by creating a recovery signal
+ * file.
+ *
+ * Our strategy in that case is to perform crash recovery first,
+ * replaying all the WAL present in pg_wal, and only enter archive
+ * recovery after that.
+ *
+ * But usually we already know how far we need to replay the WAL (up
+ * to minRecoveryPoint, up to backupEndPoint, or until we see an
+ * end-of-backup record), and we can enter archive recovery directly.
+ */
+ if (ArchiveRecoveryRequested &&
+ (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
+ ControlFile->backupEndRequired ||
+ ControlFile->backupEndPoint != InvalidXLogRecPtr ||
+ ControlFile->state == DB_SHUTDOWNED))
+ {
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ StandbyMode = true;
+ }
+
+ /* Get the last valid checkpoint record. */
+ CheckPointLoc = ControlFile->checkPoint;
+ CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+ RedoStartLSN = ControlFile->checkPointCopy.redo;
+ RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+ record = ReadCheckpointRecord(xlogreader, CheckPointLoc, 1, true,
+ CheckPointTLI);
+ if (record != NULL)
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("checkpoint record is at %X/%X",
+ LSN_FORMAT_ARGS(CheckPointLoc))));
+ }
+ else
+ {
+ /*
+ * We used to attempt to go back to a secondary checkpoint record
+ * here, but only when not in standby mode. We now just fail if we
+ * can't read the last checkpoint because this allows us to
+ * simplify processing around checkpoints.
+ */
+ ereport(PANIC,
+ (errmsg("could not locate a valid checkpoint record")));
+ }
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+ }
+
+ /*
+ * If the location of the checkpoint record is not on the expected
+ * timeline in the history of the requested timeline, we cannot proceed:
+ * the backup is not part of the history of the requested timeline.
+ */
+ Assert(expectedTLEs); /* was initialized by reading checkpoint
+ * record */
+ if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
+ CheckPointTLI)
+ {
+ XLogRecPtr switchpoint;
+
+ /*
+ * tliSwitchPoint will throw an error if the checkpoint's timeline is
+ * not in expectedTLEs at all.
+ */
+ switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
+ ereport(FATAL,
+ (errmsg("requested timeline %u is not a child of this server's history",
+ recoveryTargetTLI),
+ errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
+ LSN_FORMAT_ARGS(ControlFile->checkPoint),
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ LSN_FORMAT_ARGS(switchpoint))));
+ }
+
+ /*
+ * The min recovery point should be part of the requested timeline's
+ * history, too.
+ */
+ if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
+ tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+ ControlFile->minRecoveryPointTLI)
+ ereport(FATAL,
+ (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
+ recoveryTargetTLI,
+ LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
+ ControlFile->minRecoveryPointTLI)));
+
+ ereport(DEBUG1,
+ (errmsg_internal("redo record is at %X/%X; shutdown %s",
+ LSN_FORMAT_ARGS(checkPoint.redo),
+ wasShutdown ? "true" : "false")));
+ ereport(DEBUG1,
+ (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
+ U64FromFullTransactionId(checkPoint.nextXid),
+ checkPoint.nextOid)));
+ ereport(DEBUG1,
+ (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
+ checkPoint.nextMulti, checkPoint.nextMultiOffset)));
+ ereport(DEBUG1,
+ (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
+ checkPoint.oldestXid, checkPoint.oldestXidDB)));
+ ereport(DEBUG1,
+ (errmsg_internal("oldest MultiXactId: %u, in database %u",
+ checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
+ ereport(DEBUG1,
+ (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
+ checkPoint.oldestCommitTsXid,
+ checkPoint.newestCommitTsXid)));
+ if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
+ ereport(PANIC,
+ (errmsg("invalid next transaction ID")));
+
+ /* sanity check */
+ if (checkPoint.redo > CheckPointLoc)
+ ereport(PANIC,
+ (errmsg("invalid redo in checkpoint record")));
+
+ /*
+ * Check whether we need to force recovery from WAL. If it appears to
+ * have been a clean shutdown and we did not have a recovery signal file,
+ * then assume no recovery needed.
+ */
+ if (checkPoint.redo < CheckPointLoc)
+ {
+ if (wasShutdown)
+ ereport(PANIC,
+ (errmsg("invalid redo record in shutdown checkpoint")));
+ InRecovery = true;
+ }
+ else if (ControlFile->state != DB_SHUTDOWNED)
+ InRecovery = true;
+ else if (ArchiveRecoveryRequested)
+ {
+ /* force recovery due to presence of recovery signal file */
+ InRecovery = true;
+ }
+
+ /*
+ * Update pg_control to show that we are recovering and to show the
+ * selected checkpoint as the place we are starting from. We also mark
+ * pg_control with any minimum recovery stop point obtained from a backup
+ * history file.
+ */
+ if (InArchiveRecovery)
+ {
+ ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("database system was not properly shut down; "
+ "automatic recovery in progress")));
+ if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
+ ereport(LOG,
+ (errmsg("crash recovery starts in timeline %u "
+ "and has target timeline %u",
+ ControlFile->checkPointCopy.ThisTimeLineID,
+ recoveryTargetTLI)));
+ ControlFile->state = DB_IN_CRASH_RECOVERY;
+ }
+ ControlFile->checkPoint = CheckPointLoc;
+ ControlFile->checkPointCopy = checkPoint;
+ if (InArchiveRecovery)
+ {
+ /* initialize minRecoveryPoint if not set yet */
+ if (ControlFile->minRecoveryPoint < checkPoint.redo)
+ {
+ ControlFile->minRecoveryPoint = checkPoint.redo;
+ ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
+ }
+ }
+
+ /*
+ * Set backupStartPoint if we're starting recovery from a base backup.
+ *
+ * Also set backupEndPoint and use minRecoveryPoint as the backup end
+ * location if we're starting recovery from a base backup which was taken
+ * from a standby. In this case, the database system status in pg_control
+ * must indicate that the database was already in recovery. Usually that
+ * will be DB_IN_ARCHIVE_RECOVERY but also can be
+ * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted before
+ * reaching this point; e.g. because restore_command or primary_conninfo
+ * were faulty.
+ *
+ * Any other state indicates that the backup somehow became corrupted and
+ * we can't sensibly continue with recovery.
+ */
+ if (haveBackupLabel)
+ {
+ ControlFile->backupStartPoint = checkPoint.redo;
+ ControlFile->backupEndRequired = backupEndRequired;
+
+ if (backupFromStandby)
+ {
+ if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
+ dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
+ ereport(FATAL,
+ (errmsg("backup_label contains data inconsistent with control file"),
+ errhint("This means that the backup is corrupted and you will "
+ "have to use another backup for recovery.")));
+ ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
+ }
+ }
+
+ /* remember these, so that we know when we have reached consistency */
+ backupStartPoint = ControlFile->backupStartPoint;
+ backupEndRequired = ControlFile->backupEndRequired;
+ backupEndPoint = ControlFile->backupEndPoint;
+ if (InArchiveRecovery)
+ {
+ minRecoveryPoint = ControlFile->minRecoveryPoint;
+ minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+ }
+ else
+ {
+ minRecoveryPoint = InvalidXLogRecPtr;
+ minRecoveryPointTLI = 0;
+ }
+
+ /*
+ * Start recovery assuming that the final record isn't lost.
+ */
+ abortedRecPtr = InvalidXLogRecPtr;
+ missingContrecPtr = InvalidXLogRecPtr;
+
+ *wasShutdown_ptr = wasShutdown;
+ *haveBackupLabel_ptr = haveBackupLabel;
+ *haveTblspcMap_ptr = haveTblspcMap;
+}
+
+/*
+ * See if there are any recovery signal files and if so, set state for
+ * recovery.
+ *
+ * See if there is a recovery command file (recovery.conf), and if so
+ * throw an ERROR since as of PG12 we no longer recognize that.
+ */
+static void
+readRecoverySignalFile(void)
+{
+ struct stat stat_buf;
+
+ if (IsBootstrapProcessingMode())
+ return;
+
+ /*
+ * Check for old recovery API file: recovery.conf
+ */
+ if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("using recovery command file \"%s\" is not supported",
+ RECOVERY_COMMAND_FILE)));
+
+ /*
+ * Remove unused .done file, if present. Ignore if absent.
+ */
+ unlink(RECOVERY_COMMAND_DONE);
+
+ /*
+ * Check for recovery signal files and if found, fsync them since they
+ * represent server state information. We don't sweat too much about the
+ * possibility of fsync failure, however.
+ *
+ * If present, standby signal file takes precedence. If neither is present
+ * then we won't enter archive recovery.
+ */
+ if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ int fd;
+
+ fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd >= 0)
+ {
+ (void) pg_fsync(fd);
+ close(fd);
+ }
+ standby_signal_file_found = true;
+ }
+ else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ int fd;
+
+ fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd >= 0)
+ {
+ (void) pg_fsync(fd);
+ close(fd);
+ }
+ recovery_signal_file_found = true;
+ }
+
+ StandbyModeRequested = false;
+ ArchiveRecoveryRequested = false;
+ if (standby_signal_file_found)
+ {
+ StandbyModeRequested = true;
+ ArchiveRecoveryRequested = true;
+ }
+ else if (recovery_signal_file_found)
+ {
+ StandbyModeRequested = false;
+ ArchiveRecoveryRequested = true;
+ }
+ else
+ return;
+
+ /*
+ * We don't support standby mode in standalone backends; that requires
+ * other processes such as the WAL receiver to be alive.
+ */
+ if (StandbyModeRequested && !IsUnderPostmaster)
+ ereport(FATAL,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("standby mode is not supported by single-user servers")));
+}
+
+static void
+validateRecoveryParameters(void)
+{
+ if (!ArchiveRecoveryRequested)
+ return;
+
+ /*
+ * Check for compulsory parameters
+ */
+ if (StandbyModeRequested)
+ {
+ if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
+ (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
+ ereport(WARNING,
+ (errmsg("specified neither primary_conninfo nor restore_command"),
+ errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
+ }
+ else
+ {
+ if (recoveryRestoreCommand == NULL ||
+ strcmp(recoveryRestoreCommand, "") == 0)
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("must specify restore_command when standby mode is not enabled")));
+ }
+
+ /*
+ * Override any inconsistent requests. Note that this is a change of
+ * behaviour in 9.5; prior to this we simply ignored a request to pause if
+ * hot_standby = off, which was surprising behaviour.
+ */
+ if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
+ !EnableHotStandby)
+ recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
+
+ /*
+ * Final parsing of recovery_target_time string; see also
+ * check_recovery_target_time().
+ */
+ if (recoveryTarget == RECOVERY_TARGET_TIME)
+ {
+ recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
+ CStringGetDatum(recovery_target_time_string),
+ ObjectIdGetDatum(InvalidOid),
+ Int32GetDatum(-1)));
+ }
+
+ /*
+ * If user specified recovery_target_timeline, validate it or compute the
+ * "latest" value. We can't do this until after we've gotten the restore
+ * command and set InArchiveRecovery, because we need to fetch timeline
+ * history files from the archive.
+ */
+ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
+ {
+ TimeLineID rtli = recoveryTargetTLIRequested;
+
+ /* Timeline 1 does not have a history file, all else should */
+ if (rtli != 1 && !existsTimeLineHistory(rtli))
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery target timeline %u does not exist",
+ rtli)));
+ recoveryTargetTLI = rtli;
+ }
+ else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+ {
+ /* We start the "latest" search from pg_control's timeline */
+ recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
+ }
+ else
+ {
+ /*
+ * else we just use the recoveryTargetTLI as already read from
+ * ControlFile
+ */
+ Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
+ }
+}
+
+/*
+ * read_backup_label: check to see if a backup_label file is present
+ *
+ * If we see a backup_label during recovery, we assume that we are recovering
+ * from a backup dump file, and we therefore roll forward from the checkpoint
+ * identified by the label file, NOT what pg_control says. This avoids the
+ * problem that pg_control might have been archived one or more checkpoints
+ * later than the start of the dump, and so if we rely on it as the start
+ * point, we will fail to restore a consistent database state.
+ *
+ * Returns true if a backup_label was found (and fills the checkpoint
+ * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
+ * returns false if not. If this backup_label came from a streamed backup,
+ * *backupEndRequired is set to true. If this backup_label was created during
+ * recovery, *backupFromStandby is set to true.
+ *
+ * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
+ * and TLI read from the backup file.
+ */
+static bool
+read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
+ bool *backupEndRequired, bool *backupFromStandby)
+{
+ char startxlogfilename[MAXFNAMELEN];
+ TimeLineID tli_from_walseg,
+ tli_from_file;
+ FILE *lfp;
+ char ch;
+ char backuptype[20];
+ char backupfrom[20];
+ char backuplabel[MAXPGPATH];
+ char backuptime[128];
+ uint32 hi,
+ lo;
+
+ /* suppress possible uninitialized-variable warnings */
+ *checkPointLoc = InvalidXLogRecPtr;
+ *backupLabelTLI = 0;
+ *backupEndRequired = false;
+ *backupFromStandby = false;
+
+ /*
+ * See if label file is present
+ */
+ lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
+ if (!lfp)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+ return false; /* it's not there, all is fine */
+ }
+
+ /*
+ * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
+ * is pretty crude, but we are not expecting any variability in the file
+ * format).
+ */
+ if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
+ &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ RedoStartLSN = ((uint64) hi) << 32 | lo;
+ RedoStartTLI = tli_from_walseg;
+ if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
+ &hi, &lo, &ch) != 3 || ch != '\n')
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+ *checkPointLoc = ((uint64) hi) << 32 | lo;
+ *backupLabelTLI = tli_from_walseg;
+
+ /*
+ * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
+ * from an older backup anyway, but since the information on it is not
+ * strictly required, don't error out if it's missing for some reason.
+ */
+ if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
+ {
+ if (strcmp(backuptype, "streamed") == 0)
+ *backupEndRequired = true;
+ }
+
+ if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
+ {
+ if (strcmp(backupfrom, "standby") == 0)
+ *backupFromStandby = true;
+ }
+
+ /*
+ * Parse START TIME and LABEL. Those are not mandatory fields for recovery
+ * but checking for their presence is useful for debugging and the next
+ * sanity checks. Cope also with the fact that the result buffers have a
+ * pre-allocated size, hence if the backup_label file has been generated
+ * with strings longer than the maximum assumed here an incorrect parsing
+ * happens. That's fine as only minor consistency checks are done
+ * afterwards.
+ */
+ if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
+ ereport(DEBUG1,
+ (errmsg_internal("backup time %s in file \"%s\"",
+ backuptime, BACKUP_LABEL_FILE)));
+
+ if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
+ ereport(DEBUG1,
+ (errmsg_internal("backup label %s in file \"%s\"",
+ backuplabel, BACKUP_LABEL_FILE)));
+
+ /*
+ * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
+ * it as a sanity check if present.
+ */
+ if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
+ {
+ if (tli_from_walseg != tli_from_file)
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
+ errdetail("Timeline ID parsed is %u, but expected %u.",
+ tli_from_file, tli_from_walseg)));
+
+ ereport(DEBUG1,
+ (errmsg_internal("backup timeline %u in file \"%s\"",
+ tli_from_file, BACKUP_LABEL_FILE)));
+ }
+
+ if (ferror(lfp) || FreeFile(lfp))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ BACKUP_LABEL_FILE)));
+
+ return true;
+}
+
+/*
+ * read_tablespace_map: check to see if a tablespace_map file is present
+ *
+ * If we see a tablespace_map file during recovery, we assume that we are
+ * recovering from a backup dump file, and we therefore need to create symlinks
+ * as per the information present in tablespace_map file.
+ *
+ * Returns true if a tablespace_map file was found (and fills *tablespaces
+ * with a tablespaceinfo struct for each tablespace listed in the file);
+ * returns false if not.
+ */
+static bool
+read_tablespace_map(List **tablespaces)
+{
+ tablespaceinfo *ti;
+ FILE *lfp;
+ char str[MAXPGPATH];
+ int ch,
+ i,
+ n;
+ bool was_backslash;
+
+ /*
+ * See if tablespace_map file is present
+ */
+ lfp = AllocateFile(TABLESPACE_MAP, "r");
+ if (!lfp)
+ {
+ if (errno != ENOENT)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ TABLESPACE_MAP)));
+ return false; /* it's not there, all is fine */
+ }
+
+ /*
+ * Read and parse the link name and path lines from tablespace_map file
+ * (this code is pretty crude, but we are not expecting any variability in
+ * the file format). De-escape any backslashes that were inserted.
+ */
+ i = 0;
+ was_backslash = false;
+ while ((ch = fgetc(lfp)) != EOF)
+ {
+ if (!was_backslash && (ch == '\n' || ch == '\r'))
+ {
+ if (i == 0)
+ continue; /* \r immediately followed by \n */
+
+ /*
+ * The de-escaped line should contain an OID followed by exactly
+ * one space followed by a path. The path might start with
+ * spaces, so don't be too liberal about parsing.
+ */
+ str[i] = '\0';
+ n = 0;
+ while (str[n] && str[n] != ' ')
+ n++;
+ if (n < 1 || n >= i - 1)
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+ str[n++] = '\0';
+
+ ti = palloc0(sizeof(tablespaceinfo));
+ ti->oid = pstrdup(str);
+ ti->path = pstrdup(str + n);
+ *tablespaces = lappend(*tablespaces, ti);
+
+ i = 0;
+ continue;
+ }
+ else if (!was_backslash && ch == '\\')
+ was_backslash = true;
+ else
+ {
+ if (i < sizeof(str) - 1)
+ str[i++] = ch;
+ was_backslash = false;
+ }
+ }
+
+ if (i != 0 || was_backslash) /* last line not terminated? */
+ ereport(FATAL,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+
+ if (ferror(lfp) || FreeFile(lfp))
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ TABLESPACE_MAP)));
+
+ return true;
+}
+
+/*
+ * Finish WAL recovery.
+ *
+ * This does not close the 'xlogreader' yet, because in some cases the caller
+ * still wants to re-read the last checkpoint record by calling
+ * ReadCheckPointRecord().
+ *
+ * Returns the position of the last valid or applied record, after which new
+ * WAL should be appended, information about why recovery was ended, and some
+ * other things. See the WalRecoveryResult struct for details.
+ */
+EndOfWalRecoveryInfo *
+FinishWalRecovery(void)
+{
+ EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
+ XLogRecPtr lastRec;
+ TimeLineID lastRecTLI;
+ XLogRecPtr endOfLog;
+
+ /*
+ * Kill WAL receiver, if it's still running, before we continue to write
+ * the startup checkpoint and aborted-contrecord records. It will trump
+ * over these records and subsequent ones if it's still alive when we
+ * start writing WAL.
+ */
+ XLogShutdownWalRcv();
+
+ /*
+ * We are now done reading the xlog from stream. Turn off streaming
+ * recovery to force fetching the files (which would be required at end of
+ * recovery, e.g., timeline history file) from archive or pg_wal.
+ *
+ * Note that standby mode must be turned off after killing WAL receiver,
+ * i.e., calling XLogShutdownWalRcv().
+ */
+ Assert(!WalRcvStreaming());
+ StandbyMode = false;
+
+ /*
+ * Determine where to start writing WAL next.
+ *
+ * Re-fetch the last valid or last applied record, so we can identify the
+ * exact endpoint of what we consider the valid portion of WAL. There may
+ * be an incomplete continuation record after that, in which case
+ * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
+ * write a special OVERWRITE_CONTRECORD message to mark that the rest of
+ * it is intentionally missing. See CreateOverwriteContrecordRecord().
+ *
+ * An important side-effect of this is to load the last page into
+ * xlogreader. The caller uses it to initialize the WAL for writing.
+ */
+ if (!InRecovery)
+ {
+ lastRec = CheckPointLoc;
+ lastRecTLI = CheckPointTLI;
+ }
+ else
+ {
+ lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
+ lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
+ }
+ XLogBeginRead(xlogreader, lastRec);
+ (void) ReadRecord(xlogreader, PANIC, false, lastRecTLI);
+ endOfLog = xlogreader->EndRecPtr;
+
+ /*
+ * Remember the TLI in the filename of the XLOG segment containing the
+ * end-of-log. It could be different from the timeline that endOfLog
+ * nominally belongs to, if there was a timeline switch in that segment,
+ * and we were reading the old WAL from a segment belonging to a higher
+ * timeline.
+ */
+ result->endOfLogTLI = xlogreader->seg.ws_tli;
+
+ if (ArchiveRecoveryRequested)
+ {
+ /*
+ * We are no longer in archive recovery state.
+ *
+ * We are now done reading the old WAL. Turn off archive fetching if
+ * it was active.
+ */
+ Assert(InArchiveRecovery);
+ InArchiveRecovery = false;
+
+ /*
+ * If the ending log segment is still open, close it (to avoid
+ * problems on Windows with trying to rename or delete an open file).
+ */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ }
+
+ /*
+ * Copy the last partial block to the caller, for initializing the WAL
+ * buffer for appending new WAL.
+ */
+ if (endOfLog % XLOG_BLCKSZ != 0)
+ {
+ char *page;
+ int len;
+ XLogRecPtr pageBeginPtr;
+
+ pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
+ Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
+
+ /* Copy the valid part of the last block */
+ len = endOfLog % XLOG_BLCKSZ;
+ page = palloc(len);
+ memcpy(page, xlogreader->readBuf, len);
+
+ result->lastPageBeginPtr = pageBeginPtr;
+ result->lastPage = page;
+ }
+ else
+ {
+ /* There is no partial block to copy. */
+ result->lastPageBeginPtr = endOfLog;
+ result->lastPage = NULL;
+ }
+
+ /*
+ * Create a comment for the history file to explain why and where timeline
+ * changed.
+ */
+ result->recoveryStopReason = getRecoveryStopReason();
+
+ result->lastRec = lastRec;
+ result->lastRecTLI = lastRecTLI;
+ result->endOfLog = endOfLog;
+
+ result->abortedRecPtr = abortedRecPtr;
+ result->missingContrecPtr = missingContrecPtr;
+
+ result->standby_signal_file_found = standby_signal_file_found;
+ result->recovery_signal_file_found = recovery_signal_file_found;
+
+ return result;
+}
+
+/*
+ * Clean up the WAL reader and leftovers from restoring WAL from archive
+ */
+void
+ShutdownWalRecovery(void)
+{
+ char recoveryPath[MAXPGPATH];
+
+ /* Shut down xlogreader */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ XLogReaderFree(xlogreader);
+
+ if (ArchiveRecoveryRequested)
+ {
+ /*
+ * Since there might be a partial WAL segment named RECOVERYXLOG, get
+ * rid of it.
+ */
+ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
+ unlink(recoveryPath); /* ignore any error */
+
+ /* Get rid of any remaining recovered timeline-history file, too */
+ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
+ unlink(recoveryPath); /* ignore any error */
+ }
+
+ /*
+ * We don't need the latch anymore. It's not strictly necessary to disown
+ * it, but let's do it for the sake of tidiness.
+ */
+ if (ArchiveRecoveryRequested)
+ DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Perform WAL recovery.
+ *
+ * If the system was shut down cleanly, this is never called.
+ */
+void
+PerformWalRecovery(void)
+{
+ int rmid;
+ XLogRecord *record;
+ bool reachedRecoveryTarget = false;
+ TimeLineID replayTLI;
+
+ /*
+ * Initialize shared variables for tracking progress of WAL replay, as if
+ * we had just replayed the record before the REDO location (or the
+ * checkpoint record itself, if it's a shutdown checkpoint).
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ if (RedoStartLSN < CheckPointLoc)
+ {
+ XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
+ XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
+ }
+ else
+ {
+ XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
+ }
+ XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
+ XLogRecoveryCtl->recoveryLastXTime = 0;
+ XLogRecoveryCtl->currentChunkStartTime = 0;
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /* Also ensure XLogReceiptTime has a sane value */
+ XLogReceiptTime = GetCurrentTimestamp();
+
+ /*
+ * Let postmaster know we've started redo now, so that it can launch the
+ * archiver if necessary.
+ */
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+
+ /*
+ * Allow read-only connections immediately if we're consistent already.
+ */
+ CheckRecoveryConsistency();
+
+ /*
+ * Find the first record that logically follows the checkpoint --- it
+ * might physically precede it, though.
+ */
+ if (RedoStartLSN < CheckPointLoc)
+ {
+ /* back up to find the record */
+ replayTLI = RedoStartTLI;
+ XLogBeginRead(xlogreader, RedoStartLSN);
+ record = ReadRecord(xlogreader, PANIC, false, replayTLI);
+ }
+ else
+ {
+ /* just have to read next record after CheckPoint */
+ Assert(xlogreader->ReadRecPtr == CheckPointLoc);
+ replayTLI = CheckPointTLI;
+ record = ReadRecord(xlogreader, LOG, false, replayTLI);
+ }
+
+ if (record != NULL)
+ {
+ TimestampTz xtime;
+ PGRUsage ru0;
+
+ pg_rusage_init(&ru0);
+
+ InRedo = true;
+
+ /* Initialize resource managers */
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (RmgrTable[rmid].rm_startup != NULL)
+ RmgrTable[rmid].rm_startup();
+ }
+
+ ereport(LOG,
+ (errmsg("redo starts at %X/%X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+
+ /* Prepare to report progress of the redo phase. */
+ if (!StandbyMode)
+ begin_startup_progress_phase();
+
+ /*
+ * main redo apply loop
+ */
+ do
+ {
+ if (!StandbyMode)
+ ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
+
+#ifdef WAL_DEBUG
+ if (XLOG_DEBUG ||
+ (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
+ (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
+ {
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+ LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
+ xlog_outrec(&buf, xlogreader);
+ appendStringInfoString(&buf, " - ");
+ xlog_outdesc(&buf, xlogreader);
+ elog(LOG, "%s", buf.data);
+ pfree(buf.data);
+ }
+#endif
+
+ /* Handle interrupt signals of startup process */
+ HandleStartupProcInterrupts();
+
+ /*
+ * Pause WAL replay, if requested by a hot-standby session via
+ * SetRecoveryPause().
+ *
+ * Note that we intentionally don't take the info_lck spinlock
+ * here. We might therefore read a slightly stale value of the
+ * recoveryPause flag, but it can't be very stale (no worse than
+ * the last spinlock we did acquire). Since a pause request is a
+ * pretty asynchronous thing anyway, possibly responding to it one
+ * WAL record later than we otherwise would is a minor issue, so
+ * it doesn't seem worth adding another spinlock cycle to prevent
+ * that.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+
+ /*
+ * Have we reached our recovery target?
+ */
+ if (recoveryStopsBefore(xlogreader))
+ {
+ reachedRecoveryTarget = true;
+ break;
+ }
+
+ /*
+ * If we've been asked to lag the primary, wait on latch until
+ * enough time has passed.
+ */
+ if (recoveryApplyDelay(xlogreader))
+ {
+ /*
+ * We test for paused recovery again here. If user sets
+ * delayed apply, it may be because they expect to pause
+ * recovery in case of problems, so we must test again here
+ * otherwise pausing during the delay-wait wouldn't work.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+ }
+
+ /*
+ * Apply the record
+ */
+ ApplyWalRecord(xlogreader, record, &replayTLI);
+
+ /* Exit loop if we reached inclusive recovery target */
+ if (recoveryStopsAfter(xlogreader))
+ {
+ reachedRecoveryTarget = true;
+ break;
+ }
+
+ /* Else, try to fetch the next WAL record */
+ record = ReadRecord(xlogreader, LOG, false, replayTLI);
+ } while (record != NULL);
+
+ /*
+ * end of main redo apply loop
+ */
+
+ if (reachedRecoveryTarget)
+ {
+ if (!reachedConsistency)
+ ereport(FATAL,
+ (errmsg("requested recovery stop point is before consistent recovery point")));
+
+ /*
+ * This is the last point where we can restart recovery with a new
+ * recovery target, if we shutdown and begin again. After this,
+ * Resource Managers may choose to do permanent corrective actions
+ * at end of recovery.
+ */
+ switch (recoveryTargetAction)
+ {
+ case RECOVERY_TARGET_ACTION_SHUTDOWN:
+
+ /*
+ * exit with special return code to request shutdown of
+ * postmaster. Log messages issued from postmaster.
+ */
+ proc_exit(3);
+
+ case RECOVERY_TARGET_ACTION_PAUSE:
+ SetRecoveryPause(true);
+ recoveryPausesHere(true);
+
+ /* drop into promote */
+
+ case RECOVERY_TARGET_ACTION_PROMOTE:
+ break;
+ }
+ }
+
+ /* Allow resource managers to do any required cleanup. */
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (RmgrTable[rmid].rm_cleanup != NULL)
+ RmgrTable[rmid].rm_cleanup();
+ }
+
+ ereport(LOG,
+ (errmsg("redo done at %X/%X system usage: %s",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+ pg_rusage_show(&ru0))));
+ xtime = GetLatestXTime();
+ if (xtime)
+ ereport(LOG,
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(xtime))));
+
+ InRedo = false;
+ }
+ else
+ {
+ /* there are no WAL records following the checkpoint */
+ ereport(LOG,
+ (errmsg("redo is not required")));
+
+ }
+
+ /*
+ * This check is intentionally after the above log messages that indicate
+ * how far recovery went.
+ */
+ if (ArchiveRecoveryRequested &&
+ recoveryTarget != RECOVERY_TARGET_UNSET &&
+ !reachedRecoveryTarget)
+ ereport(FATAL,
+ (errmsg("recovery ended before configured recovery target was reached")));
+}
+
+/*
+ * Subroutine of PerformWalRecovery, to apply one WAL record.
+ */
+static void
+ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
+{
+ ErrorContextCallback errcallback;
+ bool switchedTLI = false;
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = rm_redo_error_callback;
+ errcallback.arg = (void *) xlogreader;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /*
+ * ShmemVariableCache->nextXid must be beyond record's xid.
+ */
+ AdvanceNextFullTransactionIdPastXid(record->xl_xid);
+
+ /*
+ * Before replaying this record, check if this record causes the current
+ * timeline to change. The record is already considered to be part of the
+ * new timeline, so we update replayTLI before replaying it. That's
+ * important so that replayEndTLI, which is recorded as the minimum
+ * recovery point's TLI if recovery stops after this record, is set
+ * correctly.
+ */
+ if (record->xl_rmid == RM_XLOG_ID)
+ {
+ TimeLineID newReplayTLI = *replayTLI;
+ TimeLineID prevReplayTLI = *replayTLI;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_CHECKPOINT_SHUTDOWN)
+ {
+ CheckPoint checkPoint;
+
+ memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+ newReplayTLI = checkPoint.ThisTimeLineID;
+ prevReplayTLI = checkPoint.PrevTimeLineID;
+ }
+ else if (info == XLOG_END_OF_RECOVERY)
+ {
+ xl_end_of_recovery xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
+ newReplayTLI = xlrec.ThisTimeLineID;
+ prevReplayTLI = xlrec.PrevTimeLineID;
+ }
+
+ if (newReplayTLI != *replayTLI)
+ {
+ /* Check that it's OK to switch to this TLI */
+ checkTimeLineSwitch(xlogreader->EndRecPtr,
+ newReplayTLI, prevReplayTLI, *replayTLI);
+
+ /* Following WAL records should be run with new TLI */
+ *replayTLI = newReplayTLI;
+ switchedTLI = true;
+ }
+ }
+
+ /*
+ * Update shared replayEndRecPtr before replaying this record, so that
+ * XLogFlush will update minRecoveryPoint correctly.
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->replayEndTLI = *replayTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * If we are attempting to enter Hot Standby mode, process XIDs we see
+ */
+ if (standbyState >= STANDBY_INITIALIZED &&
+ TransactionIdIsValid(record->xl_xid))
+ RecordKnownAssignedTransactionIds(record->xl_xid);
+
+ /*
+ * Some XLOG record types that are related to recovery are processed
+ * directly here, rather than in xlog_redo()
+ */
+ if (record->xl_rmid == RM_XLOG_ID)
+ xlogrecovery_redo(xlogreader, *replayTLI);
+
+ /* Now apply the WAL record itself */
+ RmgrTable[record->xl_rmid].rm_redo(xlogreader);
+
+ /*
+ * After redo, check whether the backup pages associated with the WAL
+ * record are consistent with the existing pages. This check is done only
+ * if consistency check is enabled for this record.
+ */
+ if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+ verifyBackupPageConsistency(xlogreader);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+
+ /*
+ * Update lastReplayedEndRecPtr after this record has been successfully
+ * replayed.
+ */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
+ XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+ XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
+ * receiver so that it notices the updated lastReplayedEndRecPtr and sends
+ * a reply to the primary.
+ */
+ if (doRequestWalReceiverReply)
+ {
+ doRequestWalReceiverReply = false;
+ WalRcvForceReply();
+ }
+
+ /* Allow read-only connections if we're consistent now */
+ CheckRecoveryConsistency();
+
+ /* Is this a timeline switch? */
+ if (switchedTLI)
+ {
+ /*
+ * Before we continue on the new timeline, clean up any (possibly
+ * bogus) future WAL segments on the old timeline.
+ */
+ RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
+
+ /*
+ * Wake up any walsenders to notice that we are on a new timeline.
+ */
+ if (AllowCascadeReplication())
+ WalSndWakeup();
+ }
+}
+
+/*
+ * Some XLOG RM record types that are directly related to WAL recovery are
+ * handled here rather than in the xlog_redo()
+ */
+static void
+xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ XLogRecPtr lsn = record->EndRecPtr;
+
+ Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
+
+ if (info == XLOG_OVERWRITE_CONTRECORD)
+ {
+ /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
+ xl_overwrite_contrecord xlrec;
+
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
+ if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
+ elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
+ LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+ LSN_FORMAT_ARGS(record->overwrittenRecPtr));
+
+ ereport(LOG,
+ (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
+ LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+ timestamptz_to_str(xlrec.overwrite_time))));
+
+ /* Verifying the record should only happen once */
+ record->overwrittenRecPtr = InvalidXLogRecPtr;
+ }
+ else if (info == XLOG_BACKUP_END)
+ {
+ XLogRecPtr startpoint;
+
+ memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
+
+ if (backupStartPoint == startpoint)
+ {
+ /*
+ * We have reached the end of base backup, the point where
+ * pg_stop_backup() was done. The data on disk is now consistent
+ * (assuming we have also reached minRecoveryPoint). Set
+ * backupEndPoint to the current LSN, so that the next call to
+ * CheckRecoveryConsistency() will notice it and do the
+ * end-of-backup processing.
+ */
+ elog(DEBUG1, "end of backup record reached");
+
+ backupEndPoint = lsn;
+ }
+ else
+ elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
+ LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
+ }
+}
+
+/*
+ * Checks if recovery has reached a consistent state. When consistency is
+ * reached and we have a valid starting standby snapshot, tell postmaster
+ * that it can start accepting read-only connections.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+ XLogRecPtr lastReplayedEndRecPtr;
+ TimeLineID lastReplayedTLI;
+
+ /*
+ * During crash recovery, we don't reach a consistent state until we've
+ * replayed all the WAL.
+ */
+ if (XLogRecPtrIsInvalid(minRecoveryPoint))
+ return;
+
+ Assert(InArchiveRecovery);
+
+ /*
+ * assume that we are called in the startup process, and hence don't need
+ * a lock to read lastReplayedEndRecPtr
+ */
+ lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
+
+ /*
+ * Have we reached the point where our base backup was completed?
+ */
+ if (!XLogRecPtrIsInvalid(backupEndPoint) &&
+ backupEndPoint <= lastReplayedEndRecPtr)
+ {
+ elog(DEBUG1, "end of backup reached");
+
+ /*
+ * We have reached the end of base backup, as indicated by pg_control.
+ * Update the control file accordingly.
+ */
+ ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
+ backupStartPoint = InvalidXLogRecPtr;
+ backupEndPoint = InvalidXLogRecPtr;
+ backupEndRequired = false;
+ }
+
+ /*
+ * Have we passed our safe starting point? Note that minRecoveryPoint is
+ * known to be incorrectly set if ControlFile->backupEndRequired, until
+ * the XLOG_BACKUP_END arrives to advise us of the correct
+ * minRecoveryPoint. All we know prior to that is that we're not
+ * consistent yet.
+ */
+ if (!reachedConsistency && !backupEndRequired &&
+ minRecoveryPoint <= lastReplayedEndRecPtr)
+ {
+ /*
+ * Check to see if the XLOG sequence contained any unresolved
+ * references to uninitialized pages.
+ */
+ XLogCheckInvalidPages();
+
+ reachedConsistency = true;
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
+ }
+
+ /*
+ * Have we got a valid starting snapshot that will allow queries to be
+ * run? If so, we can tell postmaster that the database is consistent now,
+ * enabling connections.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY &&
+ !LocalHotStandbyActive &&
+ reachedConsistency &&
+ IsUnderPostmaster)
+ {
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->SharedHotStandbyActive = true;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ LocalHotStandbyActive = true;
+
+ SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
+ }
+}
+
+/*
+ * Error context callback for errors occurring during rm_redo().
+ */
+static void
+rm_redo_error_callback(void *arg)
+{
+ XLogReaderState *record = (XLogReaderState *) arg;
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ xlog_outdesc(&buf, record);
+ xlog_block_info(&buf, record);
+
+ /* translator: %s is a WAL record description */
+ errcontext("WAL redo at %X/%X for %s",
+ LSN_FORMAT_ARGS(record->ReadRecPtr),
+ buf.data);
+
+ pfree(buf.data);
+}
+
+/*
+ * Returns a string describing an XLogRecord, consisting of its identity
+ * optionally followed by a colon, a space, and a further description.
+ */
+void
+xlog_outdesc(StringInfo buf, XLogReaderState *record)
+{
+ RmgrId rmid = XLogRecGetRmid(record);
+ uint8 info = XLogRecGetInfo(record);
+ const char *id;
+
+ appendStringInfoString(buf, RmgrTable[rmid].rm_name);
+ appendStringInfoChar(buf, '/');
+
+ id = RmgrTable[rmid].rm_identify(info);
+ if (id == NULL)
+ appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
+ else
+ appendStringInfo(buf, "%s: ", id);
+
+ RmgrTable[rmid].rm_desc(buf, record);
+}
+
+#ifdef WAL_DEBUG
+
+static void
+xlog_outrec(StringInfo buf, XLogReaderState *record)
+{
+ appendStringInfo(buf, "prev %X/%X; xid %u",
+ LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
+ XLogRecGetXid(record));
+
+ appendStringInfo(buf, "; len %u",
+ XLogRecGetDataLen(record));
+
+ xlog_block_info(buf, record);
+}
+#endif /* WAL_DEBUG */
+
+/*
+ * Returns a string giving information about all the blocks in an
+ * XLogRecord.
+ */
+static void
+xlog_block_info(StringInfo buf, XLogReaderState *record)
+{
+ int block_id;
+
+ /* decode block references */
+ for (block_id = 0; block_id <= record->max_block_id; block_id++)
+ {
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blk;
+
+ if (!XLogRecHasBlockRef(record, block_id))
+ continue;
+
+ XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
+ if (forknum != MAIN_FORKNUM)
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
+ block_id,
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum,
+ blk);
+ else
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
+ block_id,
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ blk);
+ if (XLogRecHasBlockImage(record, block_id))
+ appendStringInfoString(buf, " FPW");
+ }
+}
+
+
+/*
+ * Check that it's OK to switch to new timeline during recovery.
+ *
+ * 'lsn' is the address of the shutdown checkpoint record we're about to
+ * replay. (Currently, timeline can only change at a shutdown checkpoint).
+ */
+static void
+checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
+ TimeLineID replayTLI)
+{
+ /* Check that the record agrees on what the current (old) timeline is */
+ if (prevTLI != replayTLI)
+ ereport(PANIC,
+ (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
+ prevTLI, replayTLI)));
+
+ /*
+ * The new timeline better be in the list of timelines we expect to see,
+ * according to the timeline history. It should also not decrease.
+ */
+ if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+ newTLI, replayTLI)));
+
+ /*
+ * If we have not yet reached min recovery point, and we're about to
+ * switch to a timeline greater than the timeline of the min recovery
+ * point: trouble. After switching to the new timeline, we could not
+ * possibly visit the min recovery point on the correct timeline anymore.
+ * This can happen if there is a newer timeline in the archive that
+ * branched before the timeline the min recovery point is on, and you
+ * attempt to do PITR to the new timeline.
+ */
+ if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
+ lsn < minRecoveryPoint &&
+ newTLI > minRecoveryPointTLI)
+ ereport(PANIC,
+ (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
+ newTLI,
+ LSN_FORMAT_ARGS(minRecoveryPoint),
+ minRecoveryPointTLI)));
+
+ /* Looks good */
+}
+
+
+/*
+ * Extract timestamp from WAL record.
+ *
+ * If the record contains a timestamp, returns true, and saves the timestamp
+ * in *recordXtime. If the record type has no timestamp, returns false.
+ * Currently, only transaction commit/abort records and restore points contain
+ * timestamps.
+ */
+static bool
+getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ uint8 xact_info = info & XLOG_XACT_OPMASK;
+ uint8 rmid = XLogRecGetRmid(record);
+
+ if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+ {
+ *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
+ return true;
+ }
+ if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED))
+ {
+ *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
+ return true;
+ }
+ if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED))
+ {
+ *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Checks whether the current buffer page and backup page stored in the
+ * WAL record are consistent or not. Before comparing the two pages, a
+ * masking can be applied to the pages to ignore certain areas like hint bits,
+ * unused space between pd_lower and pd_upper among other things. This
+ * function should be called once WAL replay has been completed for a
+ * given record.
+ */
+static void
+verifyBackupPageConsistency(XLogReaderState *record)
+{
+ RmgrId rmid = XLogRecGetRmid(record);
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+ int block_id;
+
+ /* Records with no backup blocks have no need for consistency checks. */
+ if (!XLogRecHasAnyBlockRefs(record))
+ return;
+
+ Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
+
+ for (block_id = 0; block_id <= record->max_block_id; block_id++)
+ {
+ Buffer buf;
+ Page page;
+
+ if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+ {
+ /*
+ * WAL record doesn't contain a block reference with the given id.
+ * Do nothing.
+ */
+ continue;
+ }
+
+ Assert(XLogRecHasBlockImage(record, block_id));
+
+ if (XLogRecBlockImageApply(record, block_id))
+ {
+ /*
+ * WAL record has already applied the page, so bypass the
+ * consistency check as that would result in comparing the full
+ * page stored in the record with itself.
+ */
+ continue;
+ }
+
+ /*
+ * Read the contents from the current buffer and store it in a
+ * temporary page.
+ */
+ buf = XLogReadBufferExtended(rnode, forknum, blkno,
+ RBM_NORMAL_NO_LOG);
+ if (!BufferIsValid(buf))
+ continue;
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+
+ /*
+ * Take a copy of the local page where WAL has been applied to have a
+ * comparison base before masking it...
+ */
+ memcpy(replay_image_masked, page, BLCKSZ);
+
+ /* No need for this page anymore now that a copy is in. */
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * If the block LSN is already ahead of this WAL record, we can't
+ * expect contents to match. This can happen if recovery is
+ * restarted.
+ */
+ if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
+ continue;
+
+ /*
+ * Read the contents from the backup copy, stored in WAL record and
+ * store it in a temporary page. There is no need to allocate a new
+ * page here, a local buffer is fine to hold its contents and a mask
+ * can be directly applied on it.
+ */
+ if (!RestoreBlockImage(record, block_id, primary_image_masked))
+ elog(ERROR, "failed to restore block image");
+
+ /*
+ * If masking function is defined, mask both the primary and replay
+ * images
+ */
+ if (RmgrTable[rmid].rm_mask != NULL)
+ {
+ RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
+ RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
+ }
+
+ /* Time to compare the primary and replay images. */
+ if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
+ {
+ elog(FATAL,
+ "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum, blkno);
+ }
+ }
+}
+
+/*
+ * For point-in-time recovery, this function decides whether we want to
+ * stop applying the XLOG before the current record.
+ *
+ * Returns true if we are stopping, false otherwise. If stopping, some
+ * information is saved in recoveryStopXid et al for use in annotating the
+ * new timeline's history file.
+ */
+static bool
+recoveryStopsBefore(XLogReaderState *record)
+{
+ bool stopsHere = false;
+ uint8 xact_info;
+ bool isCommit;
+ TimestampTz recordXtime = 0;
+ TransactionId recordXid;
+
+ /*
+ * Ignore recovery target settings when not in archive recovery (meaning
+ * we are in crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
+
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ /* Check if target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ !recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN))));
+ return true;
+ }
+
+ /* Otherwise we only consider stopping before COMMIT or ABORT records. */
+ if (XLogRecGetRmid(record) != RM_XACT_ID)
+ return false;
+
+ xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+ if (xact_info == XLOG_XACT_COMMIT)
+ {
+ isCommit = true;
+ recordXid = XLogRecGetXid(record);
+ }
+ else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ isCommit = true;
+ ParseCommitRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else if (xact_info == XLOG_XACT_ABORT)
+ {
+ isCommit = false;
+ recordXid = XLogRecGetXid(record);
+ }
+ else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ isCommit = false;
+ ParseAbortRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else
+ return false;
+
+ if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
+ {
+ /*
+ * There can be only one transaction end record with this exact
+ * transactionid
+ *
+ * when testing for an xid, we MUST test for equality only, since
+ * transactions are numbered in the order they start, not the order
+ * they complete. A higher numbered xid will complete before you about
+ * 50% of the time...
+ */
+ stopsHere = (recordXid == recoveryTargetXid);
+ }
+
+ if (recoveryTarget == RECOVERY_TARGET_TIME &&
+ getRecordTimestamp(record, &recordXtime))
+ {
+ /*
+ * There can be many transactions that share the same commit time, so
+ * we stop after the last one, if we are inclusive, or stop at the
+ * first one if we are exclusive
+ */
+ if (recoveryTargetInclusive)
+ stopsHere = (recordXtime > recoveryTargetTime);
+ else
+ stopsHere = (recordXtime >= recoveryTargetTime);
+ }
+
+ if (stopsHere)
+ {
+ recoveryStopAfter = false;
+ recoveryStopXid = recordXid;
+ recoveryStopTime = recordXtime;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+
+ if (isCommit)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping before commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ else
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping before abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ }
+
+ return stopsHere;
+}
+
+/*
+ * Same as recoveryStopsBefore, but called after applying the record.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogRecoveryCtl->recoveryLastXTime.
+ */
+static bool
+recoveryStopsAfter(XLogReaderState *record)
+{
+ uint8 info;
+ uint8 xact_info;
+ uint8 rmid;
+ TimestampTz recordXtime;
+
+ /*
+ * Ignore recovery target settings when not in archive recovery (meaning
+ * we are in crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ rmid = XLogRecGetRmid(record);
+
+ /*
+ * There can be many restore points that share the same name; we stop at
+ * the first one.
+ */
+ if (recoveryTarget == RECOVERY_TARGET_NAME &&
+ rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *recordRestorePointData;
+
+ recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+ if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ (void) getRecordTimestamp(record, &recoveryStopTime);
+ strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
+
+ ereport(LOG,
+ (errmsg("recovery stopping at restore point \"%s\", time %s",
+ recoveryStopName,
+ timestamptz_to_str(recoveryStopTime))));
+ return true;
+ }
+ }
+
+ /* Check if the target LSN has been reached */
+ if (recoveryTarget == RECOVERY_TARGET_LSN &&
+ recoveryTargetInclusive &&
+ record->ReadRecPtr >= recoveryTargetLSN)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopLSN = record->ReadRecPtr;
+ recoveryStopTime = 0;
+ recoveryStopName[0] = '\0';
+ ereport(LOG,
+ (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN))));
+ return true;
+ }
+
+ if (rmid != RM_XACT_ID)
+ return false;
+
+ xact_info = info & XLOG_XACT_OPMASK;
+
+ if (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED ||
+ xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ TransactionId recordXid;
+
+ /* Update the last applied transaction timestamp */
+ if (getRecordTimestamp(record, &recordXtime))
+ SetLatestXTime(recordXtime);
+
+ /* Extract the XID of the committed/aborted transaction */
+ if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+ xl_xact_parsed_commit parsed;
+
+ ParseCommitRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+ xl_xact_parsed_abort parsed;
+
+ ParseAbortRecord(XLogRecGetInfo(record),
+ xlrec,
+ &parsed);
+ recordXid = parsed.twophase_xid;
+ }
+ else
+ recordXid = XLogRecGetXid(record);
+
+ /*
+ * There can be only one transaction end record with this exact
+ * transactionid
+ *
+ * when testing for an xid, we MUST test for equality only, since
+ * transactions are numbered in the order they start, not the order
+ * they complete. A higher numbered xid will complete before you about
+ * 50% of the time...
+ */
+ if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
+ recordXid == recoveryTargetXid)
+ {
+ recoveryStopAfter = true;
+ recoveryStopXid = recordXid;
+ recoveryStopTime = recordXtime;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+
+ if (xact_info == XLOG_XACT_COMMIT ||
+ xact_info == XLOG_XACT_COMMIT_PREPARED)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after commit of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ else if (xact_info == XLOG_XACT_ABORT ||
+ xact_info == XLOG_XACT_ABORT_PREPARED)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after abort of transaction %u, time %s",
+ recoveryStopXid,
+ timestamptz_to_str(recoveryStopTime))));
+ }
+ return true;
+ }
+ }
+
+ /* Check if we should stop as soon as reaching consistency */
+ if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+ {
+ ereport(LOG,
+ (errmsg("recovery stopping after reaching consistency")));
+
+ recoveryStopAfter = true;
+ recoveryStopXid = InvalidTransactionId;
+ recoveryStopTime = 0;
+ recoveryStopLSN = InvalidXLogRecPtr;
+ recoveryStopName[0] = '\0';
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Create a comment for the history file to explain why and where
+ * timeline changed.
+ */
+static char *
+getRecoveryStopReason(void)
+{
+ char reason[200];
+
+ if (recoveryTarget == RECOVERY_TARGET_XID)
+ snprintf(reason, sizeof(reason),
+ "%s transaction %u",
+ recoveryStopAfter ? "after" : "before",
+ recoveryStopXid);
+ else if (recoveryTarget == RECOVERY_TARGET_TIME)
+ snprintf(reason, sizeof(reason),
+ "%s %s\n",
+ recoveryStopAfter ? "after" : "before",
+ timestamptz_to_str(recoveryStopTime));
+ else if (recoveryTarget == RECOVERY_TARGET_LSN)
+ snprintf(reason, sizeof(reason),
+ "%s LSN %X/%X\n",
+ recoveryStopAfter ? "after" : "before",
+ LSN_FORMAT_ARGS(recoveryStopLSN));
+ else if (recoveryTarget == RECOVERY_TARGET_NAME)
+ snprintf(reason, sizeof(reason),
+ "at restore point \"%s\"",
+ recoveryStopName);
+ else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+ snprintf(reason, sizeof(reason), "reached consistency");
+ else
+ snprintf(reason, sizeof(reason), "no recovery target specified");
+
+ return pstrdup(reason);
+}
+
+/*
+ * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
+ *
+ * endOfRecovery is true if the recovery target is reached and
+ * the paused state starts at the end of recovery because of
+ * recovery_target_action=pause, and false otherwise.
+ */
+static void
+recoveryPausesHere(bool endOfRecovery)
+{
+ /* Don't pause unless users can connect! */
+ if (!LocalHotStandbyActive)
+ return;
+
+ /* Don't pause after standby promotion has been triggered */
+ if (LocalPromoteIsTriggered)
+ return;
+
+ if (endOfRecovery)
+ ereport(LOG,
+ (errmsg("pausing at the end of recovery"),
+ errhint("Execute pg_wal_replay_resume() to promote.")));
+ else
+ ereport(LOG,
+ (errmsg("recovery has paused"),
+ errhint("Execute pg_wal_replay_resume() to continue.")));
+
+ /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
+ while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+ {
+ HandleStartupProcInterrupts();
+ if (CheckForStandbyTrigger())
+ return;
+
+ /*
+ * If recovery pause is requested then set it paused. While we are in
+ * the loop, user might resume and pause again so set this every time.
+ */
+ ConfirmRecoveryPaused();
+
+ /*
+ * We wait on a condition variable that will wake us as soon as the
+ * pause ends, but we use a timeout so we can check the above exit
+ * condition periodically too.
+ */
+ ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
+ WAIT_EVENT_RECOVERY_PAUSE);
+ }
+ ConditionVariableCancelSleep();
+}
+
+/*
+ * When recovery_min_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the primary.
+ *
+ * Returns true if we waited.
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static bool
+recoveryApplyDelay(XLogReaderState *record)
+{
+ uint8 xact_info;
+ TimestampTz xtime;
+ TimestampTz delayUntil;
+ long msecs;
+
+ /* nothing to do if no delay configured */
+ if (recovery_min_apply_delay <= 0)
+ return false;
+
+ /* no delay is applied on a database not yet consistent */
+ if (!reachedConsistency)
+ return false;
+
+ /* nothing to do if crash recovery is requested */
+ if (!ArchiveRecoveryRequested)
+ return false;
+
+ /*
+ * Is it a COMMIT record?
+ *
+ * We deliberately choose not to delay aborts since they have no effect on
+ * MVCC. We already allow replay of records that don't have a timestamp,
+ * so there is already opportunity for issues caused by early conflicts on
+ * standbys.
+ */
+ if (XLogRecGetRmid(record) != RM_XACT_ID)
+ return false;
+
+ xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+ if (xact_info != XLOG_XACT_COMMIT &&
+ xact_info != XLOG_XACT_COMMIT_PREPARED)
+ return false;
+
+ if (!getRecordTimestamp(record, &xtime))
+ return false;
+
+ delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+ /*
+ * Exit without arming the latch if it's already past time to apply this
+ * record
+ */
+ msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
+ if (msecs <= 0)
+ return false;
+
+ while (true)
+ {
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+
+ /*
+ * This might change recovery_min_apply_delay or the trigger file's
+ * location.
+ */
+ HandleStartupProcInterrupts();
+
+ if (CheckForStandbyTrigger())
+ break;
+
+ /*
+ * Recalculate delayUntil as recovery_min_apply_delay could have
+ * changed while waiting in this loop.
+ */
+ delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+ /*
+ * Wait for difference between GetCurrentTimestamp() and delayUntil.
+ */
+ msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
+ delayUntil);
+
+ if (msecs <= 0)
+ break;
+
+ elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
+
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ msecs,
+ WAIT_EVENT_RECOVERY_APPLY_DELAY);
+ }
+ return true;
+}
+
+/*
+ * Get the current state of the recovery pause request.
+ */
+RecoveryPauseState
+GetRecoveryPauseState(void)
+{
+ RecoveryPauseState state;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ state = XLogRecoveryCtl->recoveryPauseState;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return state;
+}
+
+/*
+ * Set the recovery pause state.
+ *
+ * If recovery pause is requested then sets the recovery pause state to
+ * 'pause requested' if it is not already 'paused'. Otherwise, sets it
+ * to 'not paused' to resume the recovery. The recovery pause will be
+ * confirmed by the ConfirmRecoveryPaused.
+ */
+void
+SetRecoveryPause(bool recoveryPause)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+
+ if (!recoveryPause)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+ else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
+
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (!recoveryPause)
+ ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
+}
+
+/*
+ * Confirm the recovery pause by setting the recovery pause state to
+ * RECOVERY_PAUSED.
+ */
+static void
+ConfirmRecoveryPaused(void)
+{
+ /* If recovery pause is requested then set it paused */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
+ XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+
+/*
+ * Attempt to read the next XLOG record.
+ *
+ * Before first call, the reader needs to be positioned to the first record
+ * by calling XLogBeginRead().
+ *
+ * If no valid record is available, returns NULL, or fails if emode is PANIC.
+ * (emode must be either PANIC, LOG). In standby mode, retries until a valid
+ * record is available.
+ */
+static XLogRecord *
+ReadRecord(XLogReaderState *xlogreader, int emode,
+ bool fetching_ckpt, TimeLineID replayTLI)
+{
+ XLogRecord *record;
+ XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
+
+ /* Pass through parameters to XLogPageRead */
+ private->fetching_ckpt = fetching_ckpt;
+ private->emode = emode;
+ private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
+ private->replayTLI = replayTLI;
+
+ /* This is the first attempt to read this page. */
+ lastSourceFailed = false;
+
+ for (;;)
+ {
+ char *errormsg;
+
+ record = XLogReadRecord(xlogreader, &errormsg);
+ if (record == NULL)
+ {
+ /*
+ * When not in standby mode we find that WAL ends in an incomplete
+ * record, keep track of that record. After recovery is done,
+ * we'll write a record to indicate downstream WAL readers that
+ * that portion is to be ignored.
+ */
+ if (!StandbyMode &&
+ !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+ {
+ abortedRecPtr = xlogreader->abortedRecPtr;
+ missingContrecPtr = xlogreader->missingContrecPtr;
+ }
+
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+
+ /*
+ * We only end up here without a message when XLogPageRead()
+ * failed - in that case we already logged something. In
+ * StandbyMode that only happens if we have been triggered, so we
+ * shouldn't loop anymore in that case.
+ */
+ if (errormsg)
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg_internal("%s", errormsg) /* already translated */ ));
+ }
+
+ /*
+ * Check page TLI is one of the expected values.
+ */
+ else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
+ {
+ char fname[MAXFNAMELEN];
+ XLogSegNo segno;
+ int32 offset;
+
+ XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
+ offset = XLogSegmentOffset(xlogreader->latestPagePtr,
+ wal_segment_size);
+ XLogFileName(fname, xlogreader->seg.ws_tli, segno,
+ wal_segment_size);
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
+ xlogreader->latestPageTLI,
+ fname,
+ offset)));
+ record = NULL;
+ }
+
+ if (record)
+ {
+ /* Great, got a record */
+ return record;
+ }
+ else
+ {
+ /* No valid record available from this source */
+ lastSourceFailed = true;
+
+ /*
+ * If archive recovery was requested, but we were still doing
+ * crash recovery, switch to archive recovery and retry using the
+ * offline archive. We have now replayed all the valid WAL in
+ * pg_wal, so we are presumably now consistent.
+ *
+ * We require that there's at least some valid WAL present in
+ * pg_wal, however (!fetching_ckpt). We could recover using the
+ * WAL from the archive, even if pg_wal is completely empty, but
+ * we'd have no idea how far we'd have to replay to reach
+ * consistency. So err on the safe side and give up.
+ */
+ if (!InArchiveRecovery && ArchiveRecoveryRequested &&
+ !fetching_ckpt)
+ {
+ ereport(DEBUG1,
+ (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
+ InArchiveRecovery = true;
+ if (StandbyModeRequested)
+ StandbyMode = true;
+
+ SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
+ minRecoveryPoint = xlogreader->EndRecPtr;
+ minRecoveryPointTLI = replayTLI;
+
+ CheckRecoveryConsistency();
+
+ /*
+ * Before we retry, reset lastSourceFailed and currentSource
+ * so that we will check the archive next.
+ */
+ lastSourceFailed = false;
+ currentSource = XLOG_FROM_ANY;
+
+ continue;
+ }
+
+ /* In standby mode, loop back to retry. Otherwise, give up. */
+ if (StandbyMode && !CheckForStandbyTrigger())
+ continue;
+ else
+ return NULL;
+ }
+ }
+}
+
+/*
+ * Read the XLOG page containing RecPtr into readBuf (if not read already).
+ * Returns number of bytes read, if the page is read successfully, or -1
+ * in case of errors. When errors occur, they are ereport'ed, but only
+ * if they have not been previously reported.
+ *
+ * This is responsible for restoring files from archive as needed, as well
+ * as for waiting for the requested WAL record to arrive in standby mode.
+ *
+ * 'emode' specifies the log level used for reporting "file not found" or
+ * "end of WAL" situations in archive recovery, or in standby mode when a
+ * trigger file is found. If set to WARNING or below, XLogPageRead() returns
+ * false in those situations, on higher log levels the ereport() won't
+ * return.
+ *
+ * In standby mode, if after a successful return of XLogPageRead() the
+ * caller finds the record it's interested in to be broken, it should
+ * ereport the error with the level determined by
+ * emode_for_corrupt_record(), and then set lastSourceFailed
+ * and call XLogPageRead() again with the same arguments. This lets
+ * XLogPageRead() to try fetching the record from another source, or to
+ * sleep and retry.
+ */
+static int
+XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
+ XLogRecPtr targetRecPtr, char *readBuf)
+{
+ XLogPageReadPrivate *private =
+ (XLogPageReadPrivate *) xlogreader->private_data;
+ int emode = private->emode;
+ uint32 targetPageOff;
+ XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+ int r;
+
+ XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
+ targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
+
+ /*
+ * See if we need to switch to a new segment because the requested record
+ * is not in the currently open one.
+ */
+ if (readFile >= 0 &&
+ !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
+ {
+ /*
+ * Request a restartpoint if we've replayed too much xlog since the
+ * last one.
+ */
+ if (ArchiveRecoveryRequested && IsUnderPostmaster)
+ {
+ if (XLogCheckpointNeeded(readSegNo))
+ {
+ (void) GetRedoRecPtr();
+ if (XLogCheckpointNeeded(readSegNo))
+ RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
+ }
+ }
+
+ close(readFile);
+ readFile = -1;
+ readSource = XLOG_FROM_ANY;
+ }
+
+ XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
+
+retry:
+ /* See if we need to retrieve more data */
+ if (readFile < 0 ||
+ (readSource == XLOG_FROM_STREAM &&
+ flushedUpto < targetPagePtr + reqLen))
+ {
+ if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
+ private->randAccess,
+ private->fetching_ckpt,
+ targetRecPtr,
+ private->replayTLI,
+ xlogreader->EndRecPtr))
+ {
+ if (readFile >= 0)
+ close(readFile);
+ readFile = -1;
+ readLen = 0;
+ readSource = XLOG_FROM_ANY;
+
+ return -1;
+ }
+ }
+
+ /*
+ * At this point, we have the right segment open and if we're streaming we
+ * know the requested record is in it.
+ */
+ Assert(readFile != -1);
+
+ /*
+ * If the current segment is being streamed from the primary, calculate
+ * how much of the current page we have received already. We know the
+ * requested record has been received, but this is for the benefit of
+ * future calls, to allow quick exit at the top of this function.
+ */
+ if (readSource == XLOG_FROM_STREAM)
+ {
+ if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
+ readLen = XLOG_BLCKSZ;
+ else
+ readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
+ targetPageOff;
+ }
+ else
+ readLen = XLOG_BLCKSZ;
+
+ /* Read the requested page */
+ readOff = targetPageOff;
+
+ pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+ r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+ if (r != XLOG_BLCKSZ)
+ {
+ char fname[MAXFNAMELEN];
+ int save_errno = errno;
+
+ pgstat_report_wait_end();
+ XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+ if (r < 0)
+ {
+ errno = save_errno;
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode_for_file_access(),
+ errmsg("could not read from log segment %s, offset %u: %m",
+ fname, readOff)));
+ }
+ else
+ ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+ fname, readOff, r, (Size) XLOG_BLCKSZ)));
+ goto next_record_is_invalid;
+ }
+ pgstat_report_wait_end();
+
+ Assert(targetSegNo == readSegNo);
+ Assert(targetPageOff == readOff);
+ Assert(reqLen <= readLen);
+
+ xlogreader->seg.ws_tli = curFileTLI;
+
+ /*
+ * Check the page header immediately, so that we can retry immediately if
+ * it's not valid. This may seem unnecessary, because ReadPageInternal()
+ * validates the page header anyway, and would propagate the failure up to
+ * ReadRecord(), which would retry. However, there's a corner case with
+ * continuation records, if a record is split across two pages such that
+ * we would need to read the two pages from different sources. For
+ * example, imagine a scenario where a streaming replica is started up,
+ * and replay reaches a record that's split across two WAL segments. The
+ * first page is only available locally, in pg_wal, because it's already
+ * been recycled on the primary. The second page, however, is not present
+ * in pg_wal, and we should stream it from the primary. There is a
+ * recycled WAL segment present in pg_wal, with garbage contents, however.
+ * We would read the first page from the local WAL segment, but when
+ * reading the second page, we would read the bogus, recycled, WAL
+ * segment. If we didn't catch that case here, we would never recover,
+ * because ReadRecord() would retry reading the whole record from the
+ * beginning.
+ *
+ * Of course, this only catches errors in the page header, which is what
+ * happens in the case of a recycled WAL segment. Other kinds of errors or
+ * corruption still has the same problem. But this at least fixes the
+ * common case, which can happen as part of normal operation.
+ *
+ * Validating the page header is cheap enough that doing it twice
+ * shouldn't be a big deal from a performance point of view.
+ *
+ * When not in standby mode, an invalid page header should cause recovery
+ * to end, not retry reading the page, so we don't need to validate the
+ * page header here for the retry. Instead, ReadPageInternal() is
+ * responsible for the validation.
+ */
+ if (StandbyMode &&
+ !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
+ {
+ /*
+ * Emit this error right now then retry this page immediately. Use
+ * errmsg_internal() because the message was already translated.
+ */
+ if (xlogreader->errormsg_buf[0])
+ ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
+ (errmsg_internal("%s", xlogreader->errormsg_buf)));
+
+ /* reset any error XLogReaderValidatePageHeader() might have set */
+ xlogreader->errormsg_buf[0] = '\0';
+ goto next_record_is_invalid;
+ }
+
+ return readLen;
+
+next_record_is_invalid:
+ lastSourceFailed = true;
+
+ if (readFile >= 0)
+ close(readFile);
+ readFile = -1;
+ readLen = 0;
+ readSource = XLOG_FROM_ANY;
+
+ /* In standby-mode, keep trying */
+ if (StandbyMode)
+ goto retry;
+ else
+ return -1;
+}
+
+/*
+ * Open the WAL segment containing WAL location 'RecPtr'.
+ *
+ * The segment can be fetched via restore_command, or via walreceiver having
+ * streamed the record, or it can already be present in pg_wal. Checking
+ * pg_wal is mainly for crash recovery, but it will be polled in standby mode
+ * too, in case someone copies a new segment directly to pg_wal. That is not
+ * documented or recommended, though.
+ *
+ * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
+ * prepare to read WAL starting from RedoStartLSN after this.
+ *
+ * 'RecPtr' might not point to the beginning of the record we're interested
+ * in, it might also point to the page or segment header. In that case,
+ * 'tliRecPtr' is the position of the WAL record we're interested in. It is
+ * used to decide which timeline to stream the requested WAL from.
+ *
+ * 'replayLSN' is the current replay LSN, so that if we scan for new
+ * timelines, we can reject a switch to a timeline that branched off before
+ * this point.
+ *
+ * If the record is not immediately available, the function returns false
+ * if we're not in standby mode. In standby mode, waits for it to become
+ * available.
+ *
+ * When the requested record becomes available, the function opens the file
+ * containing it (if not open already), and returns true. When end of standby
+ * mode is triggered by the user, and there is no more WAL available, returns
+ * false.
+ */
+static bool
+WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
+ bool fetching_ckpt, XLogRecPtr tliRecPtr,
+ TimeLineID replayTLI, XLogRecPtr replayLSN)
+{
+ static TimestampTz last_fail_time = 0;
+ TimestampTz now;
+ bool streaming_reply_sent = false;
+
+ /*-------
+ * Standby mode is implemented by a state machine:
+ *
+ * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
+ * pg_wal (XLOG_FROM_PG_WAL)
+ * 2. Check trigger file
+ * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
+ * 4. Rescan timelines
+ * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
+ *
+ * Failure to read from the current source advances the state machine to
+ * the next state.
+ *
+ * 'currentSource' indicates the current state. There are no currentSource
+ * values for "check trigger", "rescan timelines", and "sleep" states,
+ * those actions are taken when reading from the previous source fails, as
+ * part of advancing to the next state.
+ *
+ * If standby mode is turned off while reading WAL from stream, we move
+ * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
+ * the files (which would be required at end of recovery, e.g., timeline
+ * history file) from archive or pg_wal. We don't need to kill WAL receiver
+ * here because it's already stopped when standby mode is turned off at
+ * the end of recovery.
+ *-------
+ */
+ if (!InArchiveRecovery)
+ currentSource = XLOG_FROM_PG_WAL;
+ else if (currentSource == XLOG_FROM_ANY ||
+ (!StandbyMode && currentSource == XLOG_FROM_STREAM))
+ {
+ lastSourceFailed = false;
+ currentSource = XLOG_FROM_ARCHIVE;
+ }
+
+ for (;;)
+ {
+ XLogSource oldSource = currentSource;
+ bool startWalReceiver = false;
+
+ /*
+ * First check if we failed to read from the current source, and
+ * advance the state machine if so. The failure to read might've
+ * happened outside this function, e.g when a CRC check fails on a
+ * record, or within this loop.
+ */
+ if (lastSourceFailed)
+ {
+ switch (currentSource)
+ {
+ case XLOG_FROM_ARCHIVE:
+ case XLOG_FROM_PG_WAL:
+
+ /*
+ * Check to see if the trigger file exists. Note that we
+ * do this only after failure, so when you create the
+ * trigger file, we still finish replaying as much as we
+ * can from archive and pg_wal before failover.
+ */
+ if (StandbyMode && CheckForStandbyTrigger())
+ {
+ XLogShutdownWalRcv();
+ return false;
+ }
+
+ /*
+ * Not in standby mode, and we've now tried the archive
+ * and pg_wal.
+ */
+ if (!StandbyMode)
+ return false;
+
+ /*
+ * Move to XLOG_FROM_STREAM state, and set to start a
+ * walreceiver if necessary.
+ */
+ currentSource = XLOG_FROM_STREAM;
+ startWalReceiver = true;
+ break;
+
+ case XLOG_FROM_STREAM:
+
+ /*
+ * Failure while streaming. Most likely, we got here
+ * because streaming replication was terminated, or
+ * promotion was triggered. But we also get here if we
+ * find an invalid record in the WAL streamed from the
+ * primary, in which case something is seriously wrong.
+ * There's little chance that the problem will just go
+ * away, but PANIC is not good for availability either,
+ * especially in hot standby mode. So, we treat that the
+ * same as disconnection, and retry from archive/pg_wal
+ * again. The WAL in the archive should be identical to
+ * what was streamed, so it's unlikely that it helps, but
+ * one can hope...
+ */
+
+ /*
+ * We should be able to move to XLOG_FROM_STREAM only in
+ * standby mode.
+ */
+ Assert(StandbyMode);
+
+ /*
+ * Before we leave XLOG_FROM_STREAM state, make sure that
+ * walreceiver is not active, so that it won't overwrite
+ * WAL that we restore from archive.
+ */
+ if (WalRcvStreaming())
+ XLogShutdownWalRcv();
+
+ /*
+ * Before we sleep, re-scan for possible new timelines if
+ * we were requested to recover to the latest timeline.
+ */
+ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+ {
+ if (rescanLatestTimeLine(replayTLI, replayLSN))
+ {
+ currentSource = XLOG_FROM_ARCHIVE;
+ break;
+ }
+ }
+
+ /*
+ * XLOG_FROM_STREAM is the last state in our state
+ * machine, so we've exhausted all the options for
+ * obtaining the requested WAL. We're going to loop back
+ * and retry from the archive, but if it hasn't been long
+ * since last attempt, sleep wal_retrieve_retry_interval
+ * milliseconds to avoid busy-waiting.
+ */
+ now = GetCurrentTimestamp();
+ if (!TimestampDifferenceExceeds(last_fail_time, now,
+ wal_retrieve_retry_interval))
+ {
+ long wait_time;
+
+ wait_time = wal_retrieve_retry_interval -
+ TimestampDifferenceMilliseconds(last_fail_time, now);
+
+ elog(LOG, "waiting for WAL to become available at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr));
+
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT |
+ WL_EXIT_ON_PM_DEATH,
+ wait_time,
+ WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ now = GetCurrentTimestamp();
+
+ /* Handle interrupt signals of startup process */
+ HandleStartupProcInterrupts();
+ }
+ last_fail_time = now;
+ currentSource = XLOG_FROM_ARCHIVE;
+ break;
+
+ default:
+ elog(ERROR, "unexpected WAL source %d", currentSource);
+ }
+ }
+ else if (currentSource == XLOG_FROM_PG_WAL)
+ {
+ /*
+ * We just successfully read a file in pg_wal. We prefer files in
+ * the archive over ones in pg_wal, so try the next file again
+ * from the archive first.
+ */
+ if (InArchiveRecovery)
+ currentSource = XLOG_FROM_ARCHIVE;
+ }
+
+ if (currentSource != oldSource)
+ elog(DEBUG2, "switched WAL source from %s to %s after %s",
+ xlogSourceNames[oldSource], xlogSourceNames[currentSource],
+ lastSourceFailed ? "failure" : "success");
+
+ /*
+ * We've now handled possible failure. Try to read from the chosen
+ * source.
+ */
+ lastSourceFailed = false;
+
+ switch (currentSource)
+ {
+ case XLOG_FROM_ARCHIVE:
+ case XLOG_FROM_PG_WAL:
+
+ /*
+ * WAL receiver must not be running when reading WAL from
+ * archive or pg_wal.
+ */
+ Assert(!WalRcvStreaming());
+
+ /* Close any old file we might have open. */
+ if (readFile >= 0)
+ {
+ close(readFile);
+ readFile = -1;
+ }
+ /* Reset curFileTLI if random fetch. */
+ if (randAccess)
+ curFileTLI = 0;
+
+ /*
+ * Try to restore the file from archive, or read an existing
+ * file from pg_wal.
+ */
+ readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
+ currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+ currentSource);
+ if (readFile >= 0)
+ return true; /* success! */
+
+ /*
+ * Nope, not found in archive or pg_wal.
+ */
+ lastSourceFailed = true;
+ break;
+
+ case XLOG_FROM_STREAM:
+ {
+ bool havedata;
+
+ /*
+ * We should be able to move to XLOG_FROM_STREAM only in
+ * standby mode.
+ */
+ Assert(StandbyMode);
+
+ /*
+ * First, shutdown walreceiver if its restart has been
+ * requested -- but no point if we're already slated for
+ * starting it.
+ */
+ if (pendingWalRcvRestart && !startWalReceiver)
+ {
+ XLogShutdownWalRcv();
+
+ /*
+ * Re-scan for possible new timelines if we were
+ * requested to recover to the latest timeline.
+ */
+ if (recoveryTargetTimeLineGoal ==
+ RECOVERY_TARGET_TIMELINE_LATEST)
+ rescanLatestTimeLine(replayTLI, replayLSN);
+
+ startWalReceiver = true;
+ }
+ pendingWalRcvRestart = false;
+
+ /*
+ * Launch walreceiver if needed.
+ *
+ * If fetching_ckpt is true, RecPtr points to the initial
+ * checkpoint location. In that case, we use RedoStartLSN
+ * as the streaming start position instead of RecPtr, so
+ * that when we later jump backwards to start redo at
+ * RedoStartLSN, we will have the logs streamed already.
+ */
+ if (startWalReceiver &&
+ PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
+ {
+ XLogRecPtr ptr;
+ TimeLineID tli;
+
+ if (fetching_ckpt)
+ {
+ ptr = RedoStartLSN;
+ tli = RedoStartTLI;
+ }
+ else
+ {
+ ptr = RecPtr;
+
+ /*
+ * Use the record begin position to determine the
+ * TLI, rather than the position we're reading.
+ */
+ tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
+
+ if (curFileTLI > 0 && tli < curFileTLI)
+ elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
+ LSN_FORMAT_ARGS(tliRecPtr),
+ tli, curFileTLI);
+ }
+ curFileTLI = tli;
+ SetInstallXLogFileSegmentActive();
+ RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
+ PrimarySlotName,
+ wal_receiver_create_temp_slot);
+ flushedUpto = 0;
+ }
+
+ /*
+ * Check if WAL receiver is active or wait to start up.
+ */
+ if (!WalRcvStreaming())
+ {
+ lastSourceFailed = true;
+ break;
+ }
+
+ /*
+ * Walreceiver is active, so see if new data has arrived.
+ *
+ * We only advance XLogReceiptTime when we obtain fresh
+ * WAL from walreceiver and observe that we had already
+ * processed everything before the most recent "chunk"
+ * that it flushed to disk. In steady state where we are
+ * keeping up with the incoming data, XLogReceiptTime will
+ * be updated on each cycle. When we are behind,
+ * XLogReceiptTime will not advance, so the grace time
+ * allotted to conflicting queries will decrease.
+ */
+ if (RecPtr < flushedUpto)
+ havedata = true;
+ else
+ {
+ XLogRecPtr latestChunkStart;
+
+ flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
+ if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
+ {
+ havedata = true;
+ if (latestChunkStart <= RecPtr)
+ {
+ XLogReceiptTime = GetCurrentTimestamp();
+ SetCurrentChunkStartTime(XLogReceiptTime);
+ }
+ }
+ else
+ havedata = false;
+ }
+ if (havedata)
+ {
+ /*
+ * Great, streamed far enough. Open the file if it's
+ * not open already. Also read the timeline history
+ * file if we haven't initialized timeline history
+ * yet; it should be streamed over and present in
+ * pg_wal by now. Use XLOG_FROM_STREAM so that source
+ * info is set correctly and XLogReceiptTime isn't
+ * changed.
+ *
+ * NB: We must set readTimeLineHistory based on
+ * recoveryTargetTLI, not receiveTLI. Normally they'll
+ * be the same, but if recovery_target_timeline is
+ * 'latest' and archiving is configured, then it's
+ * possible that we managed to retrieve one or more
+ * new timeline history files from the archive,
+ * updating recoveryTargetTLI.
+ */
+ if (readFile < 0)
+ {
+ if (!expectedTLEs)
+ expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
+ readFile = XLogFileRead(readSegNo, PANIC,
+ receiveTLI,
+ XLOG_FROM_STREAM, false);
+ Assert(readFile >= 0);
+ }
+ else
+ {
+ /* just make sure source info is correct... */
+ readSource = XLOG_FROM_STREAM;
+ XLogReceiptSource = XLOG_FROM_STREAM;
+ return true;
+ }
+ break;
+ }
+
+ /*
+ * Data not here yet. Check for trigger, then wait for
+ * walreceiver to wake us up when new WAL arrives.
+ */
+ if (CheckForStandbyTrigger())
+ {
+ /*
+ * Note that we don't "return false" immediately here.
+ * After being triggered, we still want to replay all
+ * the WAL that was already streamed. It's in pg_wal
+ * now, so we just treat this as a failure, and the
+ * state machine will move on to replay the streamed
+ * WAL from pg_wal, and then recheck the trigger and
+ * exit replay.
+ */
+ lastSourceFailed = true;
+ break;
+ }
+
+ /*
+ * Since we have replayed everything we have received so
+ * far and are about to start waiting for more WAL, let's
+ * tell the upstream server our replay location now so
+ * that pg_stat_replication doesn't show stale
+ * information.
+ */
+ if (!streaming_reply_sent)
+ {
+ WalRcvForceReply();
+ streaming_reply_sent = true;
+ }
+
+ /*
+ * Wait for more WAL to arrive. Time out after 5 seconds
+ * to react to a trigger file promptly and to check if the
+ * WAL receiver is still active.
+ */
+ (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
+ WL_LATCH_SET | WL_TIMEOUT |
+ WL_EXIT_ON_PM_DEATH,
+ 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
+ ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+ break;
+ }
+
+ default:
+ elog(ERROR, "unexpected WAL source %d", currentSource);
+ }
+
+ /*
+ * Check for recovery pause here so that we can confirm more quickly
+ * that a requested pause has actually taken effect.
+ */
+ if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
+ RECOVERY_NOT_PAUSED)
+ recoveryPausesHere(false);
+
+ /*
+ * This possibly-long loop needs to handle interrupts of startup
+ * process.
+ */
+ HandleStartupProcInterrupts();
+ }
+
+ return false; /* not reached */
+}
+
+
+/*
+ * Determine what log level should be used to report a corrupt WAL record
+ * in the current WAL page, previously read by XLogPageRead().
+ *
+ * 'emode' is the error mode that would be used to report a file-not-found
+ * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
+ * we're retrying the exact same record that we've tried previously, only
+ * complain the first time to keep the noise down. However, we only do when
+ * reading from pg_wal, because we don't expect any invalid records in archive
+ * or in records streamed from the primary. Files in the archive should be complete,
+ * and we should never hit the end of WAL because we stop and wait for more WAL
+ * to arrive before replaying it.
+ *
+ * NOTE: This function remembers the RecPtr value it was last called with,
+ * to suppress repeated messages about the same record. Only call this when
+ * you are about to ereport(), or you might cause a later message to be
+ * erroneously suppressed.
+ */
+static int
+emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
+{
+ static XLogRecPtr lastComplaint = 0;
+
+ if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
+ {
+ if (RecPtr == lastComplaint)
+ emode = DEBUG1;
+ else
+ lastComplaint = RecPtr;
+ }
+ return emode;
+}
+
+
+/*
+ * Subroutine to try to fetch and validate a prior checkpoint record.
+ *
+ * whichChkpt identifies the checkpoint (merely for reporting purposes).
+ * 1 for "primary", 0 for "other" (backup_label)
+ */
+static XLogRecord *
+ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
+ int whichChkpt, bool report, TimeLineID replayTLI)
+{
+ XLogRecord *record;
+ uint8 info;
+
+ Assert(xlogreader != NULL);
+
+ if (!XRecOffIsValid(RecPtr))
+ {
+ if (!report)
+ return NULL;
+
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid primary checkpoint link in control file")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid checkpoint link in backup_label file")));
+ break;
+ }
+ return NULL;
+ }
+
+ XLogBeginRead(xlogreader, RecPtr);
+ record = ReadRecord(xlogreader, LOG, true, replayTLI);
+
+ if (record == NULL)
+ {
+ if (!report)
+ return NULL;
+
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ if (record->xl_rmid != RM_XLOG_ID)
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ info = record->xl_info & ~XLR_INFO_MASK;
+ if (info != XLOG_CHECKPOINT_SHUTDOWN &&
+ info != XLOG_CHECKPOINT_ONLINE)
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid xl_info in primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid xl_info in checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
+ {
+ switch (whichChkpt)
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("invalid length of primary checkpoint record")));
+ break;
+ default:
+ ereport(LOG,
+ (errmsg("invalid length of checkpoint record")));
+ break;
+ }
+ return NULL;
+ }
+ return record;
+}
+
+/*
+ * Scan for new timelines that might have appeared in the archive since we
+ * started recovery.
+ *
+ * If there are any, the function changes recovery target TLI to the latest
+ * one and returns 'true'.
+ */
+static bool
+rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
+{
+ List *newExpectedTLEs;
+ bool found;
+ ListCell *cell;
+ TimeLineID newtarget;
+ TimeLineID oldtarget = recoveryTargetTLI;
+ TimeLineHistoryEntry *currentTle = NULL;
+
+ newtarget = findNewestTimeLine(recoveryTargetTLI);
+ if (newtarget == recoveryTargetTLI)
+ {
+ /* No new timelines found */
+ return false;
+ }
+
+ /*
+ * Determine the list of expected TLIs for the new TLI
+ */
+
+ newExpectedTLEs = readTimeLineHistory(newtarget);
+
+ /*
+ * If the current timeline is not part of the history of the new timeline,
+ * we cannot proceed to it.
+ */
+ found = false;
+ foreach(cell, newExpectedTLEs)
+ {
+ currentTle = (TimeLineHistoryEntry *) lfirst(cell);
+
+ if (currentTle->tli == recoveryTargetTLI)
+ {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ ereport(LOG,
+ (errmsg("new timeline %u is not a child of database system timeline %u",
+ newtarget,
+ replayTLI)));
+ return false;
+ }
+
+ /*
+ * The current timeline was found in the history file, but check that the
+ * next timeline was forked off from it *after* the current recovery
+ * location.
+ */
+ if (currentTle->end < replayLSN)
+ {
+ ereport(LOG,
+ (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
+ newtarget,
+ replayTLI,
+ LSN_FORMAT_ARGS(replayLSN))));
+ return false;
+ }
+
+ /* The new timeline history seems valid. Switch target */
+ recoveryTargetTLI = newtarget;
+ list_free_deep(expectedTLEs);
+ expectedTLEs = newExpectedTLEs;
+
+ /*
+ * As in StartupXLOG(), try to ensure we have all the history files
+ * between the old target and new target in pg_wal.
+ */
+ restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+
+ ereport(LOG,
+ (errmsg("new target timeline is %u",
+ recoveryTargetTLI)));
+
+ return true;
+}
+
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
+ * Otherwise, it's assumed to be already available in pg_wal.
+ */
+static int
+XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+ XLogSource source, bool notfoundOk)
+{
+ char xlogfname[MAXFNAMELEN];
+ char activitymsg[MAXFNAMELEN + 16];
+ char path[MAXPGPATH];
+ int fd;
+
+ XLogFileName(xlogfname, tli, segno, wal_segment_size);
+
+ switch (source)
+ {
+ case XLOG_FROM_ARCHIVE:
+ /* Report recovery progress in PS display */
+ snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+ xlogfname);
+ set_ps_display(activitymsg);
+
+ if (!RestoreArchivedFile(path, xlogfname,
+ "RECOVERYXLOG",
+ wal_segment_size,
+ InRedo))
+ return -1;
+ break;
+
+ case XLOG_FROM_PG_WAL:
+ case XLOG_FROM_STREAM:
+ XLogFilePath(path, tli, segno, wal_segment_size);
+ break;
+
+ default:
+ elog(ERROR, "invalid XLogFileRead source %d", source);
+ }
+
+ /*
+ * If the segment was fetched from archival storage, replace the existing
+ * xlog segment (if any) with the archival version.
+ */
+ if (source == XLOG_FROM_ARCHIVE)
+ {
+ Assert(!IsInstallXLogFileSegmentActive());
+ KeepFileRestoredFromArchive(path, xlogfname);
+
+ /*
+ * Set path to point at the new file in pg_wal.
+ */
+ snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
+ }
+
+ fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+ if (fd >= 0)
+ {
+ /* Success! */
+ curFileTLI = tli;
+
+ /* Report recovery progress in PS display */
+ snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
+ xlogfname);
+ set_ps_display(activitymsg);
+
+ /* Track source of data in assorted state variables */
+ readSource = source;
+ XLogReceiptSource = source;
+ /* In FROM_STREAM case, caller tracks receipt time, not me */
+ if (source != XLOG_FROM_STREAM)
+ XLogReceiptTime = GetCurrentTimestamp();
+
+ return fd;
+ }
+ if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return -1;
+}
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * This version searches for the segment with any TLI listed in expectedTLEs.
+ */
+static int
+XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
+{
+ char path[MAXPGPATH];
+ ListCell *cell;
+ int fd;
+ List *tles;
+
+ /*
+ * Loop looking for a suitable timeline ID: we might need to read any of
+ * the timelines listed in expectedTLEs.
+ *
+ * We expect curFileTLI on entry to be the TLI of the preceding file in
+ * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
+ * to go backwards; this prevents us from picking up the wrong file when a
+ * parent timeline extends to higher segment numbers than the child we
+ * want to read.
+ *
+ * If we haven't read the timeline history file yet, read it now, so that
+ * we know which TLIs to scan. We don't save the list in expectedTLEs,
+ * however, unless we actually find a valid segment. That way if there is
+ * neither a timeline history file nor a WAL segment in the archive, and
+ * streaming replication is set up, we'll read the timeline history file
+ * streamed from the primary when we start streaming, instead of
+ * recovering with a dummy history generated here.
+ */
+ if (expectedTLEs)
+ tles = expectedTLEs;
+ else
+ tles = readTimeLineHistory(recoveryTargetTLI);
+
+ foreach(cell, tles)
+ {
+ TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
+ TimeLineID tli = hent->tli;
+
+ if (tli < curFileTLI)
+ break; /* don't bother looking at too-old TLIs */
+
+ /*
+ * Skip scanning the timeline ID that the logfile segment to read
+ * doesn't belong to
+ */
+ if (hent->begin != InvalidXLogRecPtr)
+ {
+ XLogSegNo beginseg = 0;
+
+ XLByteToSeg(hent->begin, beginseg, wal_segment_size);
+
+ /*
+ * The logfile segment that doesn't belong to the timeline is
+ * older or newer than the segment that the timeline started or
+ * ended at, respectively. It's sufficient to check only the
+ * starting segment of the timeline here. Since the timelines are
+ * scanned in descending order in this loop, any segments newer
+ * than the ending segment should belong to newer timeline and
+ * have already been read before. So it's not necessary to check
+ * the ending segment of the timeline here.
+ */
+ if (segno < beginseg)
+ continue;
+ }
+
+ if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
+ {
+ fd = XLogFileRead(segno, emode, tli,
+ XLOG_FROM_ARCHIVE, true);
+ if (fd != -1)
+ {
+ elog(DEBUG1, "got WAL segment from archive");
+ if (!expectedTLEs)
+ expectedTLEs = tles;
+ return fd;
+ }
+ }
+
+ if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
+ {
+ fd = XLogFileRead(segno, emode, tli,
+ XLOG_FROM_PG_WAL, true);
+ if (fd != -1)
+ {
+ if (!expectedTLEs)
+ expectedTLEs = tles;
+ return fd;
+ }
+ }
+ }
+
+ /* Couldn't find it. For simplicity, complain about front timeline */
+ XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
+ errno = ENOENT;
+ ereport(emode,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return -1;
+}
+
+/*
+ * Set flag to signal the walreceiver to restart. (The startup process calls
+ * this on noticing a relevant configuration change.)
+ */
+void
+StartupRequestWalReceiverRestart(void)
+{
+ if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
+ {
+ ereport(LOG,
+ (errmsg("WAL receiver process shutdown requested")));
+
+ pendingWalRcvRestart = true;
+ }
+}
+
+
+/*
+ * Has a standby promotion already been triggered?
+ *
+ * Unlike CheckForStandbyTrigger(), this works in any process
+ * that's connected to shared memory.
+ */
+bool
+PromoteIsTriggered(void)
+{
+ /*
+ * We check shared state each time only until a standby promotion is
+ * triggered. We can't trigger a promotion again, so there's no need to
+ * keep checking after the shared variable has once been seen true.
+ */
+ if (LocalPromoteIsTriggered)
+ return true;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return LocalPromoteIsTriggered;
+}
+
+static void
+SetPromoteIsTriggered(void)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->SharedPromoteIsTriggered = true;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ /*
+ * Mark the recovery pause state as 'not paused' because the paused state
+ * ends and promotion continues if a promotion is triggered while recovery
+ * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
+ * return 'paused' while a promotion is ongoing.
+ */
+ SetRecoveryPause(false);
+
+ LocalPromoteIsTriggered = true;
+}
+
+/*
+ * Check to see whether the user-specified trigger file exists and whether a
+ * promote request has arrived. If either condition holds, return true.
+ */
+static bool
+CheckForStandbyTrigger(void)
+{
+ struct stat stat_buf;
+
+ if (LocalPromoteIsTriggered)
+ return true;
+
+ if (IsPromoteSignaled() && CheckPromoteSignal())
+ {
+ ereport(LOG, (errmsg("received promote request")));
+ RemovePromoteSignalFiles();
+ ResetPromoteSignaled();
+ SetPromoteIsTriggered();
+ return true;
+ }
+
+ if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
+ return false;
+
+ if (stat(PromoteTriggerFile, &stat_buf) == 0)
+ {
+ ereport(LOG,
+ (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
+ unlink(PromoteTriggerFile);
+ SetPromoteIsTriggered();
+ return true;
+ }
+ else if (errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not stat promote trigger file \"%s\": %m",
+ PromoteTriggerFile)));
+
+ return false;
+}
+
+/*
+ * Remove the files signaling a standby promotion request.
+ */
+void
+RemovePromoteSignalFiles(void)
+{
+ unlink(PROMOTE_SIGNAL_FILE);
+}
+
+/*
+ * Check to see if a promote request has arrived.
+ */
+bool
+CheckPromoteSignal(void)
+{
+ struct stat stat_buf;
+
+ if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * Wake up startup process to replay newly arrived WAL, or to notice that
+ * failover has been requested.
+ */
+void
+WakeupRecovery(void)
+{
+ SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Schedule a walreceiver wakeup in the main recovery loop.
+ */
+void
+XLogRequestWalReceiverReply(void)
+{
+ doRequestWalReceiverReply = true;
+}
+
+/*
+ * Is HotStandby active yet? This is only important in special backends
+ * since normal backends won't ever be able to connect until this returns
+ * true. Postmaster knows this by way of signal, not via shared memory.
+ *
+ * Unlike testing standbyState, this works in any process that's connected to
+ * shared memory. (And note that standbyState alone doesn't tell the truth
+ * anyway.)
+ */
+bool
+HotStandbyActive(void)
+{
+ /*
+ * We check shared state each time only until Hot Standby is active. We
+ * can't de-activate Hot Standby, so there's no need to keep checking
+ * after the shared variable has once been seen true.
+ */
+ if (LocalHotStandbyActive)
+ return true;
+ else
+ {
+ /* spinlock is essential on machines with weak memory ordering! */
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return LocalHotStandbyActive;
+ }
+}
+
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+static bool
+HotStandbyActiveInReplay(void)
+{
+ Assert(AmStartupProcess() || !IsPostmasterEnvironment);
+ return LocalHotStandbyActive;
+}
+
+/*
+ * Get latest redo apply position.
+ *
+ * Exported to allow WALReceiver to read the pointer directly.
+ */
+XLogRecPtr
+GetXLogReplayRecPtr(TimeLineID *replayTLI)
+{
+ XLogRecPtr recptr;
+ TimeLineID tli;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
+ tli = XLogRecoveryCtl->lastReplayedTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (replayTLI)
+ *replayTLI = tli;
+ return recptr;
+}
+
+
+/*
+ * Get position of last applied, or the record being applied.
+ *
+ * This is different from GetLogReplayRecPtr() in that if a WAL
+ * record is currently being applied, this includes that record.
+ */
+XLogRecPtr
+GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
+{
+ XLogRecPtr recptr;
+ TimeLineID tli;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ recptr = XLogRecoveryCtl->replayEndRecPtr;
+ tli = XLogRecoveryCtl->replayEndTLI;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ if (replayEndTLI)
+ *replayEndTLI = tli;
+ return recptr;
+}
+
+/*
+ * Save timestamp of latest processed commit/abort record.
+ *
+ * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
+ * seen by processes other than the startup process. Note in particular
+ * that CreateRestartPoint is executed in the checkpointer.
+ */
+static void
+SetLatestXTime(TimestampTz xtime)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->recoveryLastXTime = xtime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+/*
+ * Fetch timestamp of latest processed commit/abort record.
+ */
+TimestampTz
+GetLatestXTime(void)
+{
+ TimestampTz xtime;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ xtime = XLogRecoveryCtl->recoveryLastXTime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return xtime;
+}
+
+/*
+ * Save timestamp of the next chunk of WAL records to apply.
+ *
+ * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
+ * seen by all backends.
+ */
+static void
+SetCurrentChunkStartTime(TimestampTz xtime)
+{
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ XLogRecoveryCtl->currentChunkStartTime = xtime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+}
+
+/*
+ * Fetch timestamp of latest processed commit/abort record.
+ * Startup process maintains an accurate local copy in XLogReceiptTime
+ */
+TimestampTz
+GetCurrentChunkReplayStartTime(void)
+{
+ TimestampTz xtime;
+
+ SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+ xtime = XLogRecoveryCtl->currentChunkStartTime;
+ SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+ return xtime;
+}
+
+/*
+ * Returns time of receipt of current chunk of XLOG data, as well as
+ * whether it was received from streaming replication or from archives.
+ */
+void
+GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
+{
+ /*
+ * This must be executed in the startup process, since we don't export the
+ * relevant state to shared memory.
+ */
+ Assert(InRecovery);
+
+ *rtime = XLogReceiptTime;
+ *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
+}
+
+/*
+ * Note that text field supplied is a parameter name and does not require
+ * translation
+ */
+void
+RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
+{
+ if (currValue < minValue)
+ {
+ if (HotStandbyActiveInReplay())
+ {
+ bool warned_for_promote = false;
+
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("hot standby is not possible because of insufficient parameter settings"),
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue)));
+
+ SetRecoveryPause(true);
+
+ ereport(LOG,
+ (errmsg("recovery has paused"),
+ errdetail("If recovery is unpaused, the server will shut down."),
+ errhint("You can then restart the server after making the necessary configuration changes.")));
+
+ while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+ {
+ HandleStartupProcInterrupts();
+
+ if (CheckForStandbyTrigger())
+ {
+ if (!warned_for_promote)
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("promotion is not possible because of insufficient parameter settings"),
+
+ /*
+ * Repeat the detail from above so it's easy to find
+ * in the log.
+ */
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue),
+ errhint("Restart the server after making the necessary configuration changes.")));
+ warned_for_promote = true;
+ }
+
+ /*
+ * If recovery pause is requested then set it paused. While
+ * we are in the loop, user might resume and pause again so
+ * set this every time.
+ */
+ ConfirmRecoveryPaused();
+
+ /*
+ * We wait on a condition variable that will wake us as soon
+ * as the pause ends, but we use a timeout so we can check the
+ * above conditions periodically too.
+ */
+ ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
+ WAIT_EVENT_RECOVERY_PAUSE);
+ }
+ ConditionVariableCancelSleep();
+ }
+
+ ereport(FATAL,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("recovery aborted because of insufficient parameter settings"),
+ /* Repeat the detail from above so it's easy to find in the log. */
+ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+ param_name,
+ currValue,
+ minValue),
+ errhint("You can restart the server after making the necessary configuration changes.")));
+ }
+}