From: Robert Haas Date: Thu, 24 Mar 2022 18:50:06 +0000 (-0400) Subject: Fix possible recovery trouble if TRUNCATE overlaps a checkpoint. X-Git-Tag: REL_10_21~37 X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=57f618310f837101ba13757baf95bae37e3766ac;p=postgresql.git Fix possible recovery trouble if TRUNCATE overlaps a checkpoint. If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com --- diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index cdaf4993480..1e52972bbf6 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3069,8 +3069,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) * crash/basebackup, even though the state of the data directory would * require it. 
*/ - Assert(!MyPgXact->delayChkpt); - MyPgXact->delayChkpt = true; + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0); + MyPgXact->delayChkpt |= DELAY_CHKPT_START; /* WAL log truncation */ WriteMTruncateXlogRec(newOldestMultiDB, @@ -3096,7 +3096,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) /* Then offsets */ PerformOffsetsTruncation(oldestMulti, newOldestMulti); - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); LWLockRelease(MultiXactTruncationLock); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3eb33be69b8..c61b2736a1f 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -478,7 +478,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, } pgxact->xid = xid; pgxact->xmin = InvalidTransactionId; - pgxact->delayChkpt = false; + pgxact->delayChkpt = 0; pgxact->vacuumFlags = 0; proc->pid = 0; proc->databaseId = databaseid; @@ -1159,7 +1159,8 @@ EndPrepare(GlobalTransaction gxact) START_CRIT_SECTION(); - MyPgXact->delayChkpt = true; + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0); + MyPgXact->delayChkpt |= DELAY_CHKPT_START; XLogBeginInsert(); for (record = records.head; record != NULL; record = record->next) @@ -1191,7 +1192,7 @@ EndPrepare(GlobalTransaction gxact) * checkpoint starting after this will certainly see the gxact as a * candidate for fsyncing. */ - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt &= ~DELAY_CHKPT_START; /* * Remember that we have this GlobalTransaction entry locked for us. If @@ -2284,7 +2285,8 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - MyPgXact->delayChkpt = true; + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0); + MyPgXact->delayChkpt |= DELAY_CHKPT_START; /* * Emit the XLOG commit record. 
Note that we mark 2PC commits as @@ -2332,7 +2334,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 25a3a4f97e6..ccd99c38c27 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1247,8 +1247,9 @@ RecordTransactionCommit(void) * This makes checkpoint's determination of which xacts are delayChkpt * a bit fuzzy, but it doesn't matter. */ + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0); START_CRIT_SECTION(); - MyPgXact->delayChkpt = true; + MyPgXact->delayChkpt |= DELAY_CHKPT_START; SetCurrentTransactionStopTimestamp(); @@ -1349,7 +1350,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6772e248222..51efc106a2c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9022,18 +9022,30 @@ CreateCheckPoint(int flags) * and we will correctly flush the update below. So we cannot miss any * xacts we need to wait for. 
*/ - vxids = GetVirtualXIDsDelayingChkpt(&nvxids); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START); if (nvxids > 0) { do { pg_usleep(10000L); /* wait for 10 msec */ - } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_START)); } pfree(vxids); CheckPointGuts(checkPoint.redo, flags); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_COMPLETE)); + } + pfree(vxids); + /* * Take a snapshot of running transactions and write this to WAL. This * allows us to reconstruct the state of running transactions during diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 579d8de7759..6ff19814d49 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -899,7 +899,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * Ensure no checkpoint can change our view of RedoRecPtr. */ - Assert(MyPgXact->delayChkpt); + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) != 0); /* * Update RedoRecPtr so that we can make the right decision diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 9a5fde00ca1..729fb92c5fe 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -28,6 +28,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "storage/freespace.h" +#include "storage/proc.h" #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -249,6 +250,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (vm) visibilitymap_truncate(rel, nblocks); + /* + * Make sure that a concurrent checkpoint can't complete while truncation + * is in progress. 
+ * + * The truncation operation might drop buffers that the checkpoint + * otherwise would have flushed. If it does, then it's essential that + * the files actually get truncated on disk before the checkpoint record + * is written. Otherwise, if replay begins from that checkpoint, the + * to-be-truncated blocks might still exist on disk but have older + * contents than expected, which can cause replay to fail. It's OK for + * the blocks to not exist on disk at all, but not for them to have the + * wrong contents. + */ + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_COMPLETE) == 0); + MyPgXact->delayChkpt |= DELAY_CHKPT_COMPLETE; + /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay @@ -287,8 +304,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks) XLogFlush(lsn); } - /* Do the real work */ + /* + * This will first remove any buffers from the buffer pool that should no + * longer exist after truncation is complete, and then truncate the + * corresponding files on disk. + */ smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks); + + /* We've done all the critical work, so checkpoints are OK now. */ + MyPgXact->delayChkpt &= ~DELAY_CHKPT_COMPLETE; } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index bafe91ab0d6..0b7bdb8634f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3469,7 +3469,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * essential that CreateCheckpoint waits for virtual transactions * rather than full transactionids. 
*/ - MyPgXact->delayChkpt = delayChkpt = true; + Assert((MyPgXact->delayChkpt & DELAY_CHKPT_START) == 0); + MyPgXact->delayChkpt |= DELAY_CHKPT_START; + delayChkpt = true; lsn = XLogSaveBufferForHint(buffer, buffer_std); } @@ -3502,7 +3504,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) UnlockBufHdr(bufHdr, buf_state); if (delayChkpt) - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt &= ~DELAY_CHKPT_START; if (dirtied) { diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index d739812f230..134b63f28b8 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -433,7 +433,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + pgxact->delayChkpt = 0; + proc->recoveryConflictPending = false; Assert(pgxact->nxids == 0); @@ -455,7 +458,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + pgxact->delayChkpt = 0; + proc->recoveryConflictPending = false; /* Clear the subtransaction-XID cache too while holding the lock */ @@ -2259,7 +2265,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * delaying checkpoint because they have critical actions in progress. * * Constructs an array of VXIDs of transactions that are currently in commit - * critical sections, as shown by having delayChkpt set in their PGXACT. + * critical sections, as shown by having specified delayChkpt bits set in their + * PGXACT. * * Returns a palloc'd array that should be freed by the caller. 
* *nvxids is the number of valid entries. @@ -2273,13 +2280,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * for clearing of delayChkpt to propagate is unimportant for correctness. */ VirtualTransactionId * -GetVirtualXIDsDelayingChkpt(int *nvxids) +GetVirtualXIDsDelayingChkpt(int *nvxids, int type) { VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; int count = 0; int index; + Assert(type != 0); + /* allocate what's certainly enough result space */ vxids = (VirtualTransactionId *) palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); @@ -2292,7 +2301,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) volatile PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; - if (pgxact->delayChkpt) + if ((pgxact->delayChkpt & type) != 0) { VirtualTransactionId vxid; @@ -2318,12 +2327,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) * those numbers should be small enough for it not to be a problem. */ bool -HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) { bool result = false; ProcArrayStruct *arrayP = procArray; int index; + Assert(type != 0); + LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) @@ -2335,7 +2346,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) GET_VXID_FROM_PGPROC(vxid, *proc); - if (pgxact->delayChkpt && VirtualTransactionIdIsValid(vxid)) + if ((pgxact->delayChkpt & type) != 0 && + VirtualTransactionIdIsValid(vxid)) { int i; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 857dfdab09d..e5370df019a 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -377,7 +377,7 @@ InitProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt = 0; 
MyPgXact->vacuumFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) @@ -550,7 +550,7 @@ InitAuxiliaryProcess(void) MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyPgXact->delayChkpt = false; + MyPgXact->delayChkpt = 0; MyPgXact->vacuumFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 947f69d6346..d8dd7bf5e1c 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -75,6 +75,41 @@ struct XidCache */ #define INVALID_PGPROCNO PG_INT32_MAX +/* + * Flags for PGXACT.delayChkpt + * + * These flags can be used to delay the start or completion of a checkpoint + * for short periods. A flag is in effect if the corresponding bit is set in + * the PGXACT of any backend. + * + * For our purposes here, a checkpoint has three phases: (1) determine the + * location to which the redo pointer will be moved, (2) write all the + * data durably to disk, and (3) WAL-log the checkpoint. + * + * Setting DELAY_CHKPT_START prevents the system from moving from phase 1 + * to phase 2. This is useful when we are performing a WAL-logged modification + * of data that will be flushed to disk in phase 2. By setting this flag + * before writing WAL and clearing it after we've both written WAL and + * performed the corresponding modification, we ensure that if the WAL record + * is inserted prior to the new redo point, the corresponding data changes will + * also be flushed to disk before the checkpoint can complete. (In the + * extremely common case where the data being modified is in shared buffers + * and we acquire an exclusive content lock on the relevant buffers before + * writing WAL, this mechanism is not needed, because phase 2 will block + * until we release the content lock and then flush the modified data to + * disk.) 
+ * + * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2 + * to phase 3. This is useful if we are performing a WAL-logged operation that + * might invalidate buffers, such as relation truncation. In this case, we need + * to ensure that any buffers which were invalidated and thus not flushed by + * the checkpoint are actually destroyed on disk. Replay can cope with a file + * or block that doesn't exist, but not with a block that has the wrong + * contents. + */ +#define DELAY_CHKPT_START (1<<0) +#define DELAY_CHKPT_COMPLETE (1<<1) + /* * Each backend has a PGPROC struct in shared memory. There is also a list of * currently-unused PGPROC structs that will be reallocated to new backends. @@ -217,8 +252,7 @@ typedef struct PGXACT uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; - bool delayChkpt; /* true if this proc delays checkpoint start; - * previously called InCommit */ + int delayChkpt; /* for DELAY_CHKPT_* flags */ uint8 nxids; } PGXACT; diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 08b4b030bb8..2b60b276049 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -92,8 +92,9 @@ extern TransactionId GetOldestXmin(Relation rel, int flags); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); -extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); -extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); +extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type); +extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, + int nvxids, int type); extern PGPROC *BackendPidGetProc(int pid); extern PGPROC *BackendPidGetProcWithLock(int pid);