From: Amit Kapila Date: Thu, 27 Feb 2025 04:17:04 +0000 (+0530) Subject: Fix the race condition in ReplicationSlotAcquire(). X-Git-Tag: REL_18_BETA1~739 X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=8709dccc793da0c0c6619cafa182c8e67a871154;p=postgresql.git Fix the race condition in ReplicationSlotAcquire(). After commit f41d8468dd, a process could acquire and use a replication slot that had just been invalidated, leading to failures while accessing WAL. To ensure that we don't accidentally start using invalid slots, we must perform the invalidation check after acquiring the slot or under the spinlock where we associate the slot with a particular process. We choose the former method to keep the code simple. Reported-by: Hou Zhijie Author: Nisha Moond Reviewed-by: Hou Zhijie Reviewed-by: Amit Kapila Discussion: https://postgr.es/m/CABdArM7J-LbGoMPGUPiFiLOyB_TZ5+YaZb=HMES0mQqzVTn8Gg@mail.gmail.com --- diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index d089085b491..719e531eb90 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -580,19 +580,6 @@ retry: name))); } - /* Invalid slots can't be modified or used before accessing the WAL. */ - if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE) - { - LWLockRelease(ReplicationSlotControlLock); - - ereport(ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("can no longer access replication slot \"%s\"", - NameStr(s->data.name)), - errdetail("This replication slot has been invalidated due to \"%s\".", - GetSlotInvalidationCauseName(s->data.invalidated))); - } - /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. 
@@ -650,12 +637,25 @@ retry: else if (!nowait) ConditionVariableCancelSleep(); /* no sleep needed after all */ - /* Let everybody know we've modified this slot */ - ConditionVariableBroadcast(&s->active_cv); - /* We made this slot active, so it's ours now. */ MyReplicationSlot = s; + /* + * We need to check for invalidation after making the slot ours to avoid + * the possible race condition with the checkpointer that can otherwise + * invalidate the slot immediately after the check. + */ + if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("can no longer access replication slot \"%s\"", + NameStr(s->data.name)), + errdetail("This replication slot has been invalidated due to \"%s\".", + GetSlotInvalidationCauseName(s->data.invalidated))); + + /* Let everybody know we've modified this slot */ + ConditionVariableBroadcast(&s->active_cv); + /* * The call to pgstat_acquire_replslot() protects against stats for a * different slot, from before a restart or such, being present during