Fix false reports in pg_visibility
authorAlexander Korotkov <[email protected]>
Thu, 14 Mar 2024 11:08:53 +0000 (13:08 +0200)
committerAlexander Korotkov <[email protected]>
Thu, 14 Mar 2024 11:12:05 +0000 (13:12 +0200)
Currently, pg_visibility computes its xid horizon using the
GetOldestNonRemovableTransactionId().  The problem is that this horizon can
sometimes go backward.  That can lead to reporting false errors.

In order to fix that, this commit implements a new function
GetStrictOldestNonRemovableTransactionId().  This function computes the xid
horizon, which would be guaranteed to be newer or equal to any xid horizon
computed before.

We have to do the following to achieve this.

1. Ignore processes xmin's, because they consider connection to other databases
   that were ignored before.
2. Ignore KnownAssignedXids, because they are not database-aware. At the same
   time, the primary could compute its horizons database-aware.
3. Ignore walsender xmin, because it could go backward if some replication
   connections don't use replication slots.

As a result, we're using only currently running xids to compute the horizon.
Surely these would significantly sacrifice accuracy.  But we have to do so to
avoid reporting false errors.

Inspired by earlier patch by Daniel Shelepanov and the following discussion
with Robert Haas and Tom Lane.

Discussion: https://postgr.es/m/1649062270.289865713%40f403.i.mail.ru
Reviewed-by: Alexander Lakhin, Dmitry Koval
contrib/pg_visibility/Makefile
contrib/pg_visibility/meson.build
contrib/pg_visibility/pg_visibility.c
contrib/pg_visibility/t/001_concurrent_transaction.pl [new file with mode: 0644]
src/backend/storage/ipc/procarray.c
src/include/storage/standby.h

index b3b1a89e47d62119bd85481791132af7e4b5ebfe..d3cb411cc90d9eac9436b2464f30287c0002d686 100644 (file)
@@ -11,6 +11,7 @@ DATA = pg_visibility--1.1.sql pg_visibility--1.1--1.2.sql \
 PGFILEDESC = "pg_visibility - page visibility information"
 
 REGRESS = pg_visibility
+TAP_TESTS = 1
 
 ifdef USE_PGXS
 PG_CONFIG = pg_config
index 38e242d95c0937b03e70e09b3e8b5f1fff70c451..f932371f62d25643dff2fcfd50d107a3b5fa8225 100644 (file)
@@ -32,5 +32,9 @@ tests += {
     'sql': [
       'pg_visibility',
     ],
+  'tap': {
+    'tests': [
+      't/001_concurrent_transaction.pl',
+    ],
   },
 }
index 17c50a44722ffd57cc9834cc11134b118cf38c5a..1a1a4ff7be702b02e3fee9e9045e6e211d7efd0b 100644 (file)
@@ -19,6 +19,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/rel.h"
@@ -532,6 +533,63 @@ collect_visibility_data(Oid relid, bool include_pd)
    return info;
 }
 
+/*
+ * The "strict" version of GetOldestNonRemovableTransactionId().  The
+ * pg_visibility check can tolerate false positives (don't report some of the
+ * errors), but can't tolerate false negatives (report false errors). Normally,
+ * horizons move forwards, but there are cases when it could move backward
+ * (see comment for ComputeXidHorizons()).
+ *
+ * This is why we have to implement our own function for xid horizon, which
+ * would be guaranteed to be newer or equal to any xid horizon computed before.
+ * We have to do the following to achieve this.
+ *
+ * 1. Ignore processes xmin's, because they consider connection to other
+ *    databases that were ignored before.
+ * 2. Ignore KnownAssignedXids, because they are not database-aware. At the
+ *    same time, the primary could compute its horizons database-aware.
+ * 3. Ignore walsender xmin, because it could go backward if some replication
+ *    connections don't use replication slots.
+ *
+ * As a result, we're using only currently running xids to compute the horizon.
+ * Surely these would significantly sacrifice accuracy.  But we have to do so
+ * to avoid reporting false errors.
+ */
+static TransactionId
+GetStrictOldestNonRemovableTransactionId(Relation rel)
+{
+   RunningTransactions runningTransactions;
+
+   if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress())
+   {
+       /* Shared relation: take into account all running xids */
+       runningTransactions = GetRunningTransactionData();
+       LWLockRelease(ProcArrayLock);
+       LWLockRelease(XidGenLock);
+       return runningTransactions->oldestRunningXid;
+   }
+   else if (!RELATION_IS_LOCAL(rel))
+   {
+       /*
+        * Normal relation: take into account xids running within the current
+        * database
+        */
+       runningTransactions = GetRunningTransactionData();
+       LWLockRelease(ProcArrayLock);
+       LWLockRelease(XidGenLock);
+       return runningTransactions->oldestDatabaseRunningXid;
+   }
+   else
+   {
+       /*
+        * For temporary relations, ComputeXidHorizons() uses only
+        * TransamVariables->latestCompletedXid and MyProc->xid.  These two
+        * shouldn't go backwards.  So we're fine with this horizon.
+        */
+       return GetOldestNonRemovableTransactionId(rel);
+   }
+}
+
 /*
  * Returns a list of items whose visibility map information does not match
  * the status of the tuples on the page.
@@ -563,7 +621,7 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
    check_relation_relkind(rel);
 
    if (all_visible)
-       OldestXmin = GetOldestNonRemovableTransactionId(rel);
+       OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 
    nblocks = RelationGetNumberOfBlocks(rel);
 
@@ -671,11 +729,11 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
                 * retake ProcArrayLock here while we're holding the buffer
                 * exclusively locked, but it should be safe against
                 * deadlocks, because surely
-                * GetOldestNonRemovableTransactionId() should never take a
-                * buffer lock. And this shouldn't happen often, so it's worth
-                * being careful so as to avoid false positives.
+                * GetStrictOldestNonRemovableTransactionId() should never
+                * take a buffer lock. And this shouldn't happen often, so
+                * it's worth being careful so as to avoid false positives.
                 */
-               RecomputedOldestXmin = GetOldestNonRemovableTransactionId(rel);
+               RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 
                if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
                    record_corrupt_item(items, &tuple.t_self);
diff --git a/contrib/pg_visibility/t/001_concurrent_transaction.pl b/contrib/pg_visibility/t/001_concurrent_transaction.pl
new file mode 100644 (file)
index 0000000..c31d041
--- /dev/null
@@ -0,0 +1,47 @@
+
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Check that a concurrent transaction doesn't cause false negatives in
+# pg_check_visible() function
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+
+$node->init;
+$node->start;
+
+# Setup another database
+$node->safe_psql("postgres", "CREATE DATABASE other_database;\n");
+my $bsession = $node->background_psql('other_database');
+
+# Run a concurrent transaction
+$bsession->query_safe(
+   qq[
+   BEGIN;
+   SELECT txid_current();
+]);
+
+# Create a sample table and run vacuum
+$node->safe_psql("postgres",
+       "CREATE EXTENSION pg_visibility;\n"
+     . "CREATE TABLE vacuum_test AS SELECT 42 i;\n"
+     . "VACUUM (disable_page_skipping) vacuum_test;");
+
+# Run pg_check_visible()
+my $result = $node->safe_psql("postgres",
+   "SELECT * FROM pg_check_visible('vacuum_test');");
+
+# There should be no false negatives
+ok($result eq "", "pg_check_visible() detects no errors");
+
+# Shutdown
+$bsession->query_safe("COMMIT;");
+$bsession->quit;
+$node->stop;
+
+done_testing();
index 9eea1ed315a589f62da425379c0788e4461da7f1..b3cd248fb64111f4a418e00d32faa286e89664a3 100644 (file)
@@ -2688,6 +2688,7 @@ GetRunningTransactionData(void)
    RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
    TransactionId latestCompletedXid;
    TransactionId oldestRunningXid;
+   TransactionId oldestDatabaseRunningXid;
    TransactionId *xids;
    int         index;
    int         count;
@@ -2732,7 +2733,7 @@ GetRunningTransactionData(void)
 
    latestCompletedXid =
        XidFromFullTransactionId(TransamVariables->latestCompletedXid);
-   oldestRunningXid =
+   oldestDatabaseRunningXid = oldestRunningXid =
        XidFromFullTransactionId(TransamVariables->nextXid);
 
    /*
@@ -2740,6 +2741,8 @@ GetRunningTransactionData(void)
     */
    for (index = 0; index < arrayP->numProcs; index++)
    {
+       int         pgprocno = arrayP->pgprocnos[index];
+       PGPROC     *proc = &allProcs[pgprocno];
        TransactionId xid;
 
        /* Fetch xid just once - see GetNewTransactionId */
@@ -2760,6 +2763,13 @@ GetRunningTransactionData(void)
        if (TransactionIdPrecedes(xid, oldestRunningXid))
            oldestRunningXid = xid;
 
+       /*
+        * Also, update the oldest running xid within the current database.
+        */
+       if (proc->databaseId == MyDatabaseId &&
+           TransactionIdPrecedes(xid, oldestRunningXid))
+           oldestDatabaseRunningXid = xid;
+
        if (ProcGlobal->subxidStates[index].overflowed)
            suboverflowed = true;
 
@@ -2826,6 +2836,7 @@ GetRunningTransactionData(void)
    CurrentRunningXacts->subxid_overflow = suboverflowed;
    CurrentRunningXacts->nextXid = XidFromFullTransactionId(TransamVariables->nextXid);
    CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+   CurrentRunningXacts->oldestDatabaseRunningXid = oldestDatabaseRunningXid;
    CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
 
    Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
index 534d56fbc9fa53d217102362551132779efb503b..0fc0804e2660e5e92b7d2dfd1c33e3c3d8c2e9a1 100644 (file)
@@ -82,6 +82,8 @@ typedef struct RunningTransactionsData
    bool        subxid_overflow;    /* snapshot overflowed, subxids missing */
    TransactionId nextXid;      /* xid from TransamVariables->nextXid */
    TransactionId oldestRunningXid; /* *not* oldestXmin */
+   TransactionId oldestDatabaseRunningXid; /* same as above, but within the
+                                            * current database */
    TransactionId latestCompletedXid;   /* so we can set xmax */
 
    TransactionId *xids;        /* array of (sub)xids still running */