Fix the race condition in ReplicationSlotAcquire().
author: Amit Kapila <[email protected]>
Thu, 27 Feb 2025 04:17:04 +0000 (09:47 +0530)
committer: Amit Kapila <[email protected]>
Thu, 27 Feb 2025 04:17:04 +0000 (09:47 +0530)
After commit f41d8468dd, a process could acquire and use a replication
slot that had just been invalidated, leading to failures while accessing
WAL.

To ensure that we don't accidentally start using invalid slots, we must
perform the invalidation check after acquiring the slot or under the
spinlock where we associate the slot with a particular process. We choose
the former approach to keep the code simple.

Reported-by: Hou Zhijie <[email protected]>
Author: Nisha Moond <[email protected]>
Reviewed-by: Hou Zhijie <[email protected]>
Reviewed-by: Amit Kapila <[email protected]>
Discussion: https://postgr.es/m/CABdArM7J-LbGoMPGUPiFiLOyB_TZ5+YaZb=HMES0mQqzVTn8Gg@mail.gmail.com

src/backend/replication/slot.c

index d089085b491a60884e795e3fc419df1d1f1a0af4..719e531eb907cb3e0e8d66bf0957846d1c99b00a 100644 (file)
@@ -580,19 +580,6 @@ retry:
                        name)));
    }
 
-   /* Invalid slots can't be modified or used before accessing the WAL. */
-   if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE)
-   {
-       LWLockRelease(ReplicationSlotControlLock);
-
-       ereport(ERROR,
-               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-               errmsg("can no longer access replication slot \"%s\"",
-                      NameStr(s->data.name)),
-               errdetail("This replication slot has been invalidated due to \"%s\".",
-                         GetSlotInvalidationCauseName(s->data.invalidated)));
-   }
-
    /*
     * This is the slot we want; check if it's active under some other
     * process.  In single user mode, we don't need this check.
@@ -650,12 +637,25 @@ retry:
    else if (!nowait)
        ConditionVariableCancelSleep(); /* no sleep needed after all */
 
-   /* Let everybody know we've modified this slot */
-   ConditionVariableBroadcast(&s->active_cv);
-
    /* We made this slot active, so it's ours now. */
    MyReplicationSlot = s;
 
+   /*
+    * We need to check for invalidation after making the slot ours to avoid
+    * the possible race condition with the checkpointer that can otherwise
+    * invalidate the slot immediately after the check.
+    */
+   if (error_if_invalid && s->data.invalidated != RS_INVAL_NONE)
+       ereport(ERROR,
+               errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+               errmsg("can no longer access replication slot \"%s\"",
+                      NameStr(s->data.name)),
+               errdetail("This replication slot has been invalidated due to \"%s\".",
+                         GetSlotInvalidationCauseName(s->data.invalidated)));
+
+   /* Let everybody know we've modified this slot */
+   ConditionVariableBroadcast(&s->active_cv);
+
    /*
     * The call to pgstat_acquire_replslot() protects against stats for a
     * different slot, from before a restart or such, being present during