Add undo log manager.
author      Thomas Munro <[email protected]>
Wed, 6 Mar 2019 03:46:04 +0000 (16:46 +1300)
committer   Thomas Munro <[email protected]>
Wed, 17 Jul 2019 03:28:27 +0000 (15:28 +1200)
Add a new subsystem to manage undo logs.  Undo logs allow data to be appended
efficiently, like logs.  They also allow data to be discarded efficiently from
the other end, like a queue.  Thirdly, they allow efficient buffered random
access, like a relation.

Undo logs physically consist of a set of 1MB segment files under
$PGDATA/base/undo (or per-tablespace equivalent) that are created, deleted or
renamed as required, similarly to the way that WAL segments are managed.
Meta-data about the set of undo logs is stored in shared memory, and written
to per-checkpoint files under $PGDATA/pg_undo.
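
For example, given the naming scheme in UndoLogSegmentPath(), the segment
covering bytes 0x100000-0x1FFFFF of undo log 1 in the default tablespace
would live at

    base/undo/000001.0000100000

that is, the undo record pointer of the segment's first byte in hexadecimal,
with a period separating the log number from the offset.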

Provide access to the undo files managed by undolog.c through bufmgr.c:
a new SMGR implementation (undofile.c) allows bufmgr.c to read and write
the files created by undolog.c.
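
A minimal sketch of how a block of undo data can then be addressed through
bufmgr.c (hypothetical caller; the recovery code in this patch identifies
undo blocks by a RelFileNode whose dbNode is the reserved pseudo-database
UndoDbOid and whose relNode is the undo log number; the use of MAIN_FORKNUM
is an assumption):

    RelFileNode rnode;

    rnode.spcNode = tablespace;   /* the undo log's tablespace */
    rnode.dbNode = UndoDbOid;     /* pseudo-database reserved for undo */
    rnode.relNode = logno;        /* undo log number, in place of a relation */
    buffer = ReadBufferWithoutRelcache(rnode, MAIN_FORKNUM, blockno,
                                       RBM_NORMAL, NULL,
                                       RELPERSISTENCE_PERMANENT);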

Author: Thomas Munro, with contributions from Dilip Kumar, Rafia Sabih,
        Robert Haas and Amit Kapila
Reviewed-by:
Discussion: https://postgr.es/m/CAEepm%3D2EqROYJ_xYz4v5kfr4b0qw_Lq_6Pe8RTEC8rx3upWsSQ%40mail.gmail.com

53 files changed:
src/backend/access/Makefile
src/backend/access/rmgrdesc/Makefile
src/backend/access/rmgrdesc/undologdesc.c [new file with mode: 0644]
src/backend/access/transam/rmgr.c
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogutils.c
src/backend/access/undo/Makefile [new file with mode: 0644]
src/backend/access/undo/undolog.c [new file with mode: 0644]
src/backend/bootstrap/bootstrap.c
src/backend/catalog/system_views.sql
src/backend/commands/tablespace.c
src/backend/postmaster/pgstat.c
src/backend/replication/basebackup.c
src/backend/replication/logical/decode.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/file/fd.c
src/backend/storage/ipc/ipci.c
src/backend/storage/lmgr/lwlock.c
src/backend/storage/lmgr/lwlocknames.txt
src/backend/storage/smgr/Makefile
src/backend/storage/smgr/smgr.c
src/backend/storage/smgr/undofile.c [new file with mode: 0644]
src/backend/storage/sync/sync.c
src/backend/utils/init/postinit.c
src/backend/utils/misc/guc.c
src/bin/initdb/initdb.c
src/bin/pg_checksums/pg_checksums.c
src/bin/pg_resetwal/pg_resetwal.c
src/bin/pg_upgrade/Makefile
src/bin/pg_upgrade/check.c
src/bin/pg_upgrade/controldata.c
src/bin/pg_upgrade/exec.c
src/bin/pg_upgrade/pg_upgrade.c
src/bin/pg_upgrade/pg_upgrade.h
src/bin/pg_upgrade/undo.c [new file with mode: 0644]
src/bin/pg_waldump/rmgrdesc.c
src/include/access/rmgrlist.h
src/include/access/session.h
src/include/access/undolog.h [new file with mode: 0644]
src/include/access/undolog_xlog.h [new file with mode: 0644]
src/include/access/xlogutils.h
src/include/catalog/database_internal.h [new file with mode: 0644]
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/include/storage/bufmgr.h
src/include/storage/fd.h
src/include/storage/lwlock.h
src/include/storage/smgr.h
src/include/storage/sync.h
src/include/storage/undofile.h [new file with mode: 0644]
src/include/utils/guc.h
src/test/regress/expected/rules.out

index 0880e0a8bbb63901164aef4a2de577db59c98b25..bf6d3fa1bd05900ccabf71d11b133d26fbf63129 100644 (file)
@@ -9,6 +9,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 SUBDIRS            = brin common gin gist hash heap index nbtree rmgrdesc spgist \
-                         table tablesample transam
+                         table tablesample transam undo
 
 include $(top_srcdir)/src/backend/common.mk
index 5514db1dda6ceaf95d3ef0ef37e66bddc82af420..91ad1ef8a3da1b76bc70dbc64533a56f5ce1abd3 100644 (file)
@@ -11,6 +11,6 @@ include $(top_builddir)/src/Makefile.global
 OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o genericdesc.o \
           gindesc.o gistdesc.o hashdesc.o heapdesc.o logicalmsgdesc.o \
           mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o seqdesc.o \
-          smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o
+          smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o undologdesc.o xactdesc.o xlogdesc.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c
new file mode 100644 (file)
index 0000000..f89fcb3
--- /dev/null
@@ -0,0 +1,81 @@
+/*-------------------------------------------------------------------------
+ *
+ * undologdesc.c
+ *       rmgr descriptor routines for access/undo/undolog.c
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *       src/backend/access/rmgrdesc/undologdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/undolog.h"
+#include "access/undolog_xlog.h"
+
+void
+undolog_desc(StringInfo buf, XLogReaderState *record)
+{
+       char       *rec = XLogRecGetData(record);
+       uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+       if (info == XLOG_UNDOLOG_CREATE)
+       {
+               xl_undolog_create *xlrec = (xl_undolog_create *) rec;
+
+               appendStringInfo(buf, "logno %u", xlrec->logno);
+       }
+       else if (info == XLOG_UNDOLOG_EXTEND)
+       {
+               xl_undolog_extend *xlrec = (xl_undolog_extend *) rec;
+
+               appendStringInfo(buf, "logno %u end " UndoLogOffsetFormat,
+                                                xlrec->logno, xlrec->end);
+       }
+       else if (info == XLOG_UNDOLOG_DISCARD)
+       {
+               xl_undolog_discard *xlrec = (xl_undolog_discard *) rec;
+
+               appendStringInfo(buf, "logno %u discard " UndoLogOffsetFormat " end "
+                                                UndoLogOffsetFormat,
+                                                xlrec->logno, xlrec->discard, xlrec->end);
+       }
+       else if (info == XLOG_UNDOLOG_SWITCH)
+       {
+               xl_undolog_switch *xlrec = (xl_undolog_switch *) rec;
+
+               appendStringInfo(buf, "logno %u start " UndoLogOffsetFormat " last " UndoLogOffsetFormat,
+                                                xlrec->logno,
+                                                xlrec->prevlog_xact_start,
+                                                xlrec->prevlog_last_urp);
+       }
+}
+
+const char *
+undolog_identify(uint8 info)
+{
+       const char *id = NULL;
+
+       switch (info & ~XLR_INFO_MASK)
+       {
+               case XLOG_UNDOLOG_CREATE:
+                       id = "CREATE";
+                       break;
+               case XLOG_UNDOLOG_EXTEND:
+                       id = "EXTEND";
+                       break;
+               case XLOG_UNDOLOG_DISCARD:
+                       id = "DISCARD";
+                       break;
+               case XLOG_UNDOLOG_SWITCH:
+                       id = "SWITCH";
+                       break;
+       }
+
+       return id;
+}
index 9368b56c4ce5e8da39010287d6a39fad46a7aacf..8b0537405a9c3e7c7b937ff899d49f71dad5a026 100644 (file)
@@ -18,6 +18,7 @@
 #include "access/multixact.h"
 #include "access/nbtxlog.h"
 #include "access/spgxlog.h"
+#include "access/undolog_xlog.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
 #include "catalog/storage_xlog.h"
index b6c9353cbd298eb201cea1283c64736027c05a08..5dbe485af238b050300f1cadf430dbec82b4016d 100644 (file)
@@ -31,6 +31,7 @@
 #include "access/transam.h"
 #include "access/tuptoaster.h"
 #include "access/twophase.h"
+#include "access/undolog.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
 #include "access/xloginsert.h"
@@ -6710,6 +6711,9 @@ StartupXLOG(void)
         */
        restoreTwoPhaseData();
 
+       /* Recover undo log meta data corresponding to this checkpoint. */
+       StartupUndoLogs(ControlFile->checkPointCopy.redo);
+
        lastFullPageWrites = checkPoint.fullPageWrites;
 
        RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
@@ -8977,6 +8981,7 @@ static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
        CheckPointCLOG();
+       CheckPointUndoLogs(checkPointRedo, ControlFile->checkPointCopy.redo);
        CheckPointCommitTs();
        CheckPointSUBTRANS();
        CheckPointMultiXact();
index 10a663bae6292c8cd7664b7ed5b7f67f1f46f2f8..c227c03854059a88622d6901a02bf53ee89cd001 100644 (file)
@@ -293,6 +293,65 @@ XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
                                                                                 false, buf);
 }
 
+/*
+ * Find the block ID of the first block that matches the given rnode forknum
+ * and blockno.  If blockno is InvalidBlockNumber, then match any block
+ * number.  Return true if found.
+ */
+bool
+XLogFindBlockId(XLogReaderState *record,
+                               RelFileNode rnode,
+                               ForkNumber forknum,
+                               BlockNumber blockno,
+                               uint8 *block_id)
+{
+       uint8   i;
+
+       for (i = 0; i <= record->max_block_id; ++i)
+       {
+               DecodedBkpBlock *block = &record->blocks[i];
+
+               if (block->in_use &&
+                       RelFileNodeEquals(block->rnode, rnode) &&
+                       block->forknum == forknum &&
+                       (block->blkno == blockno || blockno == InvalidBlockNumber))
+               {
+                       *block_id = i;
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * If the caller doesn't know the block_id, but does know the RelFileNode,
+ * forknum and block number, then we try to find it.
+ */
+XLogRedoAction
+XLogReadBufferForRedoBlock(XLogReaderState *record,
+                                                  RelFileNode rnode,
+                                                  ForkNumber forknum,
+                                                  BlockNumber blockno,
+                                                  ReadBufferMode mode,
+                                                  bool get_cleanup_lock,
+                                                  Buffer *buf)
+{
+       uint8   block_id;
+
+       if (XLogFindBlockId(record, rnode, forknum, blockno, &block_id))
+               return XLogReadBufferForRedoExtended(record,
+                                                                                        block_id,
+                                                                                        mode,
+                                                                                        get_cleanup_lock,
+                                                                                        buf);
+
+       elog(ERROR, "failed to find block reference rel %u/%u/%u, forknum = %u, block = %u",
+                rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blockno);
+
+       return BLK_NOTFOUND;    /* not reached */
+}
+
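+/*
+ * For illustration only, a hypothetical redo routine that knows the rnode,
+ * fork and block number but not the block_id could use the above like so:
+ *
+ *     if (XLogReadBufferForRedoBlock(record, rnode, MAIN_FORKNUM, blkno,
+ *                                    RBM_NORMAL, false, &buf) == BLK_NEEDS_REDO)
+ *     {
+ *             ... apply the change to BufferGetPage(buf) ...
+ *             MarkBufferDirty(buf);
+ *     }
+ *     if (BufferIsValid(buf))
+ *             UnlockReleaseBuffer(buf);
+ */
+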
 /*
  * Pin and lock a buffer referenced by a WAL record, for the purpose of
  * re-initializing it.
@@ -346,7 +405,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
         * Make sure that if the block is marked with WILL_INIT, the caller is
         * going to initialize it. And vice versa.
         */
-       zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
+       zeromode = (mode == RBM_ZERO || mode == RBM_ZERO_AND_LOCK ||
+                               mode == RBM_ZERO_AND_CLEANUP_LOCK);
        willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0;
        if (willinit && !zeromode)
                elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
@@ -462,7 +522,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
        {
                /* page exists in file */
                buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
-                                                                                  mode, NULL);
+                                                                                  mode, NULL, RELPERSISTENCE_PERMANENT);
        }
        else
        {
@@ -487,7 +547,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                                ReleaseBuffer(buffer);
                        }
                        buffer = ReadBufferWithoutRelcache(rnode, forknum,
-                                                                                          P_NEW, mode, NULL);
+                                                                                          P_NEW, mode, NULL,
+                                                                                          RELPERSISTENCE_PERMANENT);
                }
                while (BufferGetBlockNumber(buffer) < blkno);
                /* Handle the corner case that P_NEW returns non-consecutive pages */
@@ -497,7 +558,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                        ReleaseBuffer(buffer);
                        buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
-                                                                                          mode, NULL);
+                                                                                          mode, NULL,
+                                                                                          RELPERSISTENCE_PERMANENT);
                }
        }
 
diff --git a/src/backend/access/undo/Makefile b/src/backend/access/undo/Makefile
new file mode 100644 (file)
index 0000000..219c696
--- /dev/null
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/undo
+#
+# IDENTIFICATION
+#    src/backend/access/undo/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/undo
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = undolog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/undo/undolog.c b/src/backend/access/undo/undolog.c
new file mode 100644 (file)
index 0000000..f2e0272
--- /dev/null
@@ -0,0 +1,2627 @@
+/*-------------------------------------------------------------------------
+ *
+ * undolog.c
+ *       management of undo logs
+ *
+ * PostgreSQL undo log manager.  This module is responsible for managing the
+ * lifecycle of undo logs and their segment files, associating undo logs with
+ * backends, and allocating space within undo logs.
+ *
+ * For the code that reads and writes blocks of data, see undofile.c.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/undo/undolog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/session.h"
+#include "access/transam.h"
+#include "access/undolog.h"
+#include "access/undolog_xlog.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlogreader.h"
+#include "access/xlogutils.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "pgstat.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/standby.h"
+#include "storage/sync.h"
+#include "storage/undofile.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/varlena.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+
+/*
+ * Main control structure for undo log management in shared memory.
+ * UndoLogSlot objects are arranged in a fixed-size array, with no particular
+ * ordering.
+ */
+typedef struct UndoLogSharedData
+{
+       UndoLogNumber   free_lists[UndoLogCategories];
+       UndoLogNumber   low_logno;
+       UndoLogNumber   next_logno;
+       UndoLogNumber   nslots;
+       UndoLogSlot             slots[FLEXIBLE_ARRAY_MEMBER];
+} UndoLogSharedData;
+
+/* The shared memory region that all backends are attached to. */
+UndoLogSharedData *UndoLogShared;
+
+undologtable_hash *undologtable_cache;
+
+/* GUC variables */
+char      *undo_tablespaces = NULL;
+
+static UndoLogSlot *find_undo_log_slot(UndoLogNumber logno, bool locked);
+static UndoLogSlot *allocate_undo_log_slot(void);
+static void free_undo_log_slot(UndoLogSlot *log);
+static void attach_undo_log(UndoLogCategory category, Oid tablespace);
+static void detach_current_undo_log(UndoLogCategory category, bool full);
+static void extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end);
+static void undo_log_before_exit(int code, Datum value);
+static void forget_undo_buffers(int logno, UndoLogOffset old_discard,
+                                                               UndoLogOffset new_discard,
+                                                               bool drop_tail);
+static bool choose_undo_tablespace(bool force_detach, Oid *oid);
+
+PG_FUNCTION_INFO_V1(pg_stat_get_undo_logs);
+
+/*
+ * How many undo logs can be active at a time?  This creates a theoretical
+ * maximum amount of undo data that can exist, but if we set it to a multiple
+ * of the maximum number of backends it will be a very high limit.
+ * Alternative designs involving demand paging or dynamic shared memory could
+ * remove this limit but would be complicated.
+ */
+static inline size_t
+UndoLogNumSlots(void)
+{
+       return MaxBackends * 4;
+}
+
+/*
+ * Return the amount of traditional shmem required for undo log management.
+ */
+Size
+UndoLogShmemSize(void)
+{
+       return sizeof(UndoLogSharedData) +
+               UndoLogNumSlots() * sizeof(UndoLogSlot);
+}
+
+/*
+ * Initialize the undo log subsystem.  Called in each backend.
+ */
+void
+UndoLogShmemInit(void)
+{
+       bool found;
+
+       UndoLogShared = (UndoLogSharedData *)
+               ShmemInitStruct("UndoLogShared", UndoLogShmemSize(), &found);
+
+       /* The postmaster initialized the shared memory state. */
+       if (!IsUnderPostmaster)
+       {
+               int             i;
+
+               Assert(!found);
+
+               /*
+                * We start with no active undo logs.  StartupUndoLogs() will recreate
+                * the undo logs that were known at the last checkpoint.
+                */
+               memset(UndoLogShared, 0, sizeof(*UndoLogShared));
+               UndoLogShared->nslots = UndoLogNumSlots();
+               for (i = 0; i < UndoLogCategories; ++i)
+                       UndoLogShared->free_lists[i] = InvalidUndoLogNumber;
+               for (i = 0; i < UndoLogShared->nslots; ++i)
+               {
+                       memset(&UndoLogShared->slots[i], 0, sizeof(UndoLogShared->slots[i]));
+                       UndoLogShared->slots[i].logno = InvalidUndoLogNumber;
+                       LWLockInitialize(&UndoLogShared->slots[i].mutex,
+                                                        LWTRANCHE_UNDOLOG);
+                       LWLockInitialize(&UndoLogShared->slots[i].discard_lock,
+                                                        LWTRANCHE_UNDODISCARD);
+               }
+       }
+       else
+               Assert(found);
+
+       /* All backends prepare their per-backend lookup table. */
+       undologtable_cache = undologtable_create(TopMemoryContext,
+                                                                                        UndoLogNumSlots(),
+                                                                                        NULL);
+}
+
+void
+UndoLogInit(void)
+{
+       before_shmem_exit(undo_log_before_exit, 0);
+}
+
+/*
+ * Figure out which directory holds an undo log based on tablespace.
+ */
+void
+UndoLogDirectory(Oid tablespace, char *dir)
+{
+       if (tablespace == DEFAULTTABLESPACE_OID ||
+               tablespace == InvalidOid)
+               snprintf(dir, MAXPGPATH, "base/undo");
+       else
+               snprintf(dir, MAXPGPATH, "pg_tblspc/%u/%s/undo",
+                                tablespace, TABLESPACE_VERSION_DIRECTORY);
+}
+
+/*
+ * Compute the pathname to use for an undo log segment file.
+ */
+void
+UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, char *path)
+{
+       char            dir[MAXPGPATH];
+
+       /* Figure out which directory holds the segment, based on tablespace. */
+       UndoLogDirectory(tablespace, dir);
+
+       /*
+        * Build the path from log number and offset.  The pathname is the
+        * UndoRecPtr of the first byte in the segment in hexadecimal, with a
+        * period inserted between the components.
+        */
+       snprintf(path, MAXPGPATH, "%s/%06X.%010zX", dir, logno,
+                        segno * UndoLogSegmentSize);
+}
+
+/*
+ * Iterate through the set of currently active logs.  Pass in NULL to get the
+ * first undo log.  NULL indicates the end of the set of logs.  The caller
+ * must lock the returned log before accessing its members, and must skip if
+ * logno is not valid.
+ */
+UndoLogSlot *
+UndoLogNextSlot(UndoLogSlot *slot)
+{
+       LWLockAcquire(UndoLogLock, LW_SHARED);
+       for (;;)
+       {
+               /* Advance to the next log. */
+               if (slot == NULL)
+               {
+                       /* Start at the beginning. */
+                       slot = &UndoLogShared->slots[0];
+               }
+               else if (++slot == &UndoLogShared->slots[UndoLogShared->nslots])
+               {
+                       /* Past the end. */
+                       slot = NULL;
+                       break;
+               }
+               /* Have we found a slot with a valid log? */
+               if (slot->logno != InvalidUndoLogNumber)
+                       break;
+       }
+       LWLockRelease(UndoLogLock);
+
+       /* XXX: erm, which lock should the caller hold!? */
+       return slot;
+}
+
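+/*
+ * An illustrative (hypothetical) caller, following the protocol described
+ * above: lock each returned slot, and skip it if its log has been recycled
+ * concurrently.
+ *
+ *     UndoLogSlot *slot = NULL;
+ *
+ *     while ((slot = UndoLogNextSlot(slot)) != NULL)
+ *     {
+ *             LWLockAcquire(&slot->mutex, LW_SHARED);
+ *             if (slot->logno != InvalidUndoLogNumber)
+ *                     ... inspect slot->meta ...
+ *             LWLockRelease(&slot->mutex);
+ *     }
+ */
+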
+/*
+ * Check if an undo log position has been discarded.  'pointer' must be an
+ * undo log pointer that was allocated at some point in the past, otherwise
+ * the result is undefined.
+ */
+bool
+UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer)
+{
+       UndoLogNumber logno = UndoRecPtrGetLogNo(pointer);
+       UndoLogSlot *slot;
+       UndoRecPtr discard;
+
+       slot = find_undo_log_slot(logno, false);
+
+       if (slot == NULL)
+       {
+               /*
+                * If we couldn't find the undo log number, then it must be entirely
+                * discarded.  Set this backend's recent_discard value to the highest
+                * possible value, so that all records appear to be discarded to the
+                * fast-path code.  Technically this value is too low by 1, but
+                * assuming only pointers to records are tested, and no record can
+                * have size 1, this value suffices.
+                */
+               discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1);
+       }
+       else
+       {
+               LWLockAcquire(&slot->mutex, LW_SHARED);
+               if (unlikely(logno != slot->logno))
+               {
+                       /*
+                        * The undo log has been entirely discarded since we looked it up
+                        * above, and the UndoLogSlot is now unused or being used for some
+                        * other undo log.  This is the same as not finding it.
+                        */
+                       discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1);
+               }
+               else
+                       discard = MakeUndoRecPtr(logno, slot->meta.discard);
+               LWLockRelease(&slot->mutex);
+       }
+
+       /*
+        * Remember this discard pointer in this backend so that future lookups
+        * via UndoLogRecPtrIsDiscarded() have a chance of avoiding the slow path.
+        */
+       UndoLogGetTableEntry(logno)->recent_discard = discard;
+
+       return pointer < discard;
+}
+
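+/*
+ * A sketch of the fast path that the slow path above backs up, assuming an
+ * inline wrapper along these lines exists in undolog.h (hypothetical; the
+ * real test may differ):
+ *
+ *     logno = UndoRecPtrGetLogNo(pointer);
+ *     if (pointer < UndoLogGetTableEntry(logno)->recent_discard)
+ *             return true;    (known discarded, no locking needed)
+ *     return UndoLogRecPtrIsDiscardedSlowPath(pointer);
+ */
+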
+/*
+ * Fetch the previous transaction's start undo record point.
+ */
+UndoRecPtr
+UndoLogGetLastXactStartPoint(UndoLogNumber logno)
+{
+       UndoLogSlot *slot = find_undo_log_slot(logno, false);
+       uint64 last_xact_start = 0;
+
+       if (unlikely(slot == NULL))
+               return InvalidUndoRecPtr;
+
+       LWLockAcquire(&slot->mutex, LW_SHARED);
+       /* TODO: review */
+       last_xact_start = slot->meta.unlogged.last_xact_start;
+       LWLockRelease(&slot->mutex);
+
+       if (last_xact_start == 0)
+               return InvalidUndoRecPtr;
+
+       return MakeUndoRecPtr(logno, last_xact_start);
+}
+
+/*
+ * Detach from the undo log we are currently attached to, returning it to the
+ * appropriate free list if it still has space.
+ */
+static void
+detach_current_undo_log(UndoLogCategory category, bool full)
+{
+       UndoLogSlot *slot;
+
+       slot = CurrentSession->attached_undo_slots[category];
+
+       Assert(slot != NULL);
+
+       CurrentSession->attached_undo_slots[category] = NULL;
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->pid = InvalidPid;
+       slot->meta.unlogged.xid = InvalidTransactionId;
+       if (full)
+               slot->meta.status = UNDO_LOG_STATUS_FULL;
+       LWLockRelease(&slot->mutex);
+
+       /* Push back onto the appropriate free list, unless it's full. */
+       if (!full)
+       {
+               LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+               slot->next_free = UndoLogShared->free_lists[category];
+               UndoLogShared->free_lists[category] = slot->logno;
+               LWLockRelease(UndoLogLock);
+       }
+}
+
+/*
+ * Exit handler, detaching from all undo logs.
+ */
+static void
+undo_log_before_exit(int code, Datum arg)
+{
+       int             i;
+
+       if (!CurrentSession)
+               return;
+
+       for (i = 0; i < UndoLogCategories; ++i)
+       {
+               if (CurrentSession->attached_undo_slots[i] != NULL)
+                       detach_current_undo_log(i, false);
+       }
+}
+
+/*
+ * Create a new empty segment file on disk for the bytes starting at 'end'.
+ */
+static void
+allocate_empty_undo_segment(UndoLogNumber logno, Oid tablespace,
+                                                       UndoLogOffset end)
+{
+       struct stat     stat_buffer;
+       off_t   size;
+       char    path[MAXPGPATH];
+       void   *zeroes;
+       size_t  nzeroes = 8192;
+       int             fd;
+
+       UndoLogSegmentPath(logno, end / UndoLogSegmentSize, tablespace, path);
+
+       /*
+        * Create and fully allocate a new file.  If we crashed and recovered
+        * then the file might already exist, so use flags that tolerate that.
+        * It's also possible that it exists but is too short, in which case
+        * we'll write the rest.  We don't really care what's in the file, we
+        * just want to make sure that the filesystem has allocated physical
+        * blocks for it, so that non-COW filesystems will report ENOSPC now
+        * rather than later when the space is needed and we'll avoid creating
+        * files with holes.
+        */
+       fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
+       if (fd < 0 && tablespace != 0)
+       {
+               char undo_path[MAXPGPATH];
+
+               /* Try creating the undo directory for this tablespace. */
+               UndoLogDirectory(tablespace, undo_path);
+               if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST)
+               {
+                       char       *parentdir;
+
+                       if (errno != ENOENT || !InRecovery)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create directory \"%s\": %m",
+                                                               undo_path)));
+
+                       /*
+                        * In recovery, it's possible that the tablespace directory
+                        * doesn't exist because a later WAL record removed the whole
+                        * tablespace.  In that case we create a regular directory to
+                        * stand in for it.  This is similar to the logic in
+                        * TablespaceCreateDbspace().
+                        */
+
+                       /* Create the grandparent directory if it doesn't exist. */
+                       parentdir = pstrdup(undo_path);
+                       get_parent_directory(parentdir);
+                       get_parent_directory(parentdir);
+                       /* Can't create parent and it doesn't already exist? */
+                       if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create directory \"%s\": %m",
+                                                               parentdir)));
+                       pfree(parentdir);
+
+                       /* Create the parent directory if it doesn't exist. */
+                       parentdir = pstrdup(undo_path);
+                       get_parent_directory(parentdir);
+                       /* Can't create parent and it doesn't already exist? */
+                       if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create directory \"%s\": %m",
+                                                               parentdir)));
+                       pfree(parentdir);
+
+                       if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST)
+                               ereport(ERROR,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not create directory \"%s\": %m",
+                                                               undo_path)));
+               }
+
+               fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
+       }
+       if (fd < 0)
+               elog(ERROR, "could not create new file \"%s\": %m", path);
+       if (fstat(fd, &stat_buffer) < 0)
+               elog(ERROR, "could not stat \"%s\": %m", path);
+       size = stat_buffer.st_size;
+
+       /* A buffer full of zeroes we'll use to fill up new segment files. */
+       zeroes = palloc0(nzeroes);
+
+       while (size < UndoLogSegmentSize)
+       {
+               ssize_t written;
+
+               written = write(fd, zeroes, Min(nzeroes, UndoLogSegmentSize - size));
+               if (written < 0)
+                       elog(ERROR, "cannot initialize undo log segment file \"%s\": %m",
+                                path);
+               size += written;
+       }
+
+       /* Flush the contents of the file to disk before the next checkpoint. */
+       undofile_request_sync(logno, end / UndoLogSegmentSize, tablespace);
+
+       CloseTransientFile(fd);
+
+       pfree(zeroes);
+
+       elog(DEBUG1, "created undo segment \"%s\"", path);
+}
+
+/*
+ * Create a new undo segment, when it is unexpectedly not present.
+ */
+void
+UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno)
+{
+       Assert(InRecovery);
+       allocate_empty_undo_segment(logno, tablespace, segno * UndoLogSegmentSize);
+}
+
+/*
+ * Create and zero-fill a new segment for a given undo log number.
+ */
+static void
+extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end)
+{
+       UndoLogSlot *slot;
+       size_t          end;
+
+       slot = find_undo_log_slot(logno, false);
+
+       /* TODO review interlocking */
+
+       Assert(slot != NULL);
+       Assert(slot->meta.end % UndoLogSegmentSize == 0);
+       Assert(new_end % UndoLogSegmentSize == 0);
+       Assert(InRecovery ||
+                  CurrentSession->attached_undo_slots[slot->meta.category] == slot);
+
+       /*
+        * Create all the segments needed to increase 'end' to the requested
+        * size.  This is quite expensive, so we will try to avoid it completely
+        * by renaming files into place in UndoLogDiscard() instead.
+        */
+       end = slot->meta.end;
+       while (end < new_end)
+       {
+               allocate_empty_undo_segment(logno, slot->meta.tablespace, end);
+               end += UndoLogSegmentSize;
+       }
+
+       /* Flush the directory entries before next checkpoint. */
+       undofile_request_sync_dir(slot->meta.tablespace);
+
+       /*
+        * If we're not in recovery, we need to WAL-log the creation of the new
+        * file(s).  We do that after the above filesystem modifications, in
+        * violation of the data-before-WAL rule as exempted by
+        * src/backend/access/transam/README.  This means that it's possible for
+        * us to crash having made some or all of the filesystem changes but
+        * before WAL logging, but in that case we'll eventually try to create the
+        * same segment(s) again, which is tolerated.
+        */
+       if (!InRecovery)
+       {
+               xl_undolog_extend xlrec;
+               XLogRecPtr      ptr;
+
+               xlrec.logno = logno;
+               xlrec.end = end;
+
+               XLogBeginInsert();
+               XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+               ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_EXTEND);
+               XLogFlush(ptr);
+       }
+
+       /*
+        * We didn't need to acquire the mutex to read 'end' above because only
+        * we write to it.  But we need the mutex to update it, because the
+        * checkpointer might read it concurrently.
+        *
+        * XXX It's possible for meta.end to be higher already during
+        * recovery, because of the timing of a checkpoint; in that case we did
+        * nothing above and we shouldn't update shmem here.  That interaction
+        * needs more analysis.
+        */
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       if (slot->meta.end < end)
+               slot->meta.end = end;
+       LWLockRelease(&slot->mutex);
+}
+
+/*
+ * This function must be called before all of the undo log activity that will
+ * be covered by a single WAL record.
+ */
+void
+UndoLogBeginInsert(UndoLogAllocContext *context,
+                                  UndoLogCategory category,
+                                  XLogReaderState *xlog_record)
+{
+       context->try_location = InvalidUndoRecPtr;
+       context->category = category;
+
+       /*
+        * Tell UndoLogAllocate() to capture undo log meta-data before-change
+        * images, so that UndoLogRegister() can find them and they can be written
+        * to the WAL once per checkpoint.
+        */
+       context->num_meta_data_images = 0;
+
+       /*
+        * Tell UndoLogAllocateInRecovery() that we don't know which undo log to
+        * allocate in yet, and to start its search for registered blocks at
+        * the lowest-numbered block_id.
+        */
+       context->xlog_record = xlog_record;
+       context->recovery_logno = InvalidUndoLogNumber;
+       context->recovery_block_id = 0;
+
+       /*
+        * For UNDO_SHARED, this always denotes the beginning of a new record set.
+        * For other categories, the boundaries are detected by transaction ID
+        * changes.
+        */
+       context->new_shared_record_set = category == UNDO_SHARED;
+}
+
+/*
+ * Get an insertion point that is guaranteed to be backed by enough space to
+ * hold 'size' bytes of data.  To actually write into the undo log, client
+ * code should call this first and then use bufmgr routines to access buffers
+ * and provide WAL logs and redo handlers.  In other words, while this module
+ * looks after making sure the undo log has sufficient space and the undo meta
+ * data is crash safe, the *contents* of the undo log and (indirectly) the
+ * insertion point are the responsibility of client code.
+ *
+ * A suggested insertion point can optionally be passed in as 'try_location',
+ * and will be returned if possible.  If not InvalidUndoRecPtr, it must fall
+ * within, or exactly one byte after, the most recent allocation for the same
+ * persistence level.  This interface allows for a series of allocations to be
+ * made without committing to using the space yet; call UndoLogAdvance() to
+ * actually advance the insert pointer.
+ *
+ * Return an undo log insertion point that can be converted to a buffer tag
+ * and an insertion point within a buffer page.
+ */
+UndoRecPtr
+UndoLogAllocate(UndoLogAllocContext *context,
+                               uint16 size,
+                               bool *need_xact_header,
+                               UndoRecPtr *last_xact_start,
+                               UndoRecPtr *prevlog_xact_start,
+                               UndoRecPtr *prevlog_insert_urp)
+{
+       Session *session = CurrentSession;
+       UndoLogSlot *slot;
+       UndoLogOffset new_insert;
+       TransactionId logxid;
+
+       slot = CurrentSession->attached_undo_slots[context->category];
+
+       /*
+        * We may need to attach to an undo log, either because this is the first
+        * time this backend has needed to write to an undo log at all or because
+        * the undo_tablespaces GUC was changed.  When doing that, we'll need
+        * interlocking against tablespaces being concurrently dropped.
+        */
+
+ retry:
+       /* See if we need to check the undo_tablespaces GUC. */
+       if (unlikely(session->need_to_choose_undo_tablespace || slot == NULL))
+       {
+               Oid             tablespace;
+               bool    need_to_unlock;
+
+               need_to_unlock =
+                       choose_undo_tablespace(session->need_to_choose_undo_tablespace,
+                                                                  &tablespace);
+               attach_undo_log(context->category, tablespace);
+               if (need_to_unlock)
+                       LWLockRelease(TablespaceCreateLock);
+               slot = CurrentSession->attached_undo_slots[context->category];
+               session->need_to_choose_undo_tablespace = false;
+       }
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       logxid = slot->meta.unlogged.xid;
+
+       if (logxid != GetTopTransactionId())
+       {
+               /*
+                * While we have the lock, check if we have been forcibly detached by
+                * DROP TABLESPACE.  That can only happen between transactions (see
+                * DropUndoLogsInTablespace()).
+                */
+               if (slot->pid == InvalidPid)
+               {
+                       LWLockRelease(&slot->mutex);
+                       slot = NULL;
+                       goto retry;
+               }
+               /* Record that we are attached to this log. */
+               slot->meta.unlogged.xid = GetTopTransactionId();
+               /*
+                * Maintain our tracking of the current and the previous transaction
+                * start locations.
+                */
+               if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert)
+               {
+                       slot->meta.unlogged.last_xact_start =
+                               slot->meta.unlogged.this_xact_start;
+                       slot->meta.unlogged.this_xact_start = slot->meta.unlogged.insert;
+               }
+       }
+       LWLockRelease(&slot->mutex);
+
+       /*
+        * 'size' is expressed in usable non-header bytes.  Figure out how far we
+        * have to move insert to create space for 'size' usable bytes, stepping
+        * over any intervening headers.
+        */
+       Assert(slot->meta.unlogged.insert % BLCKSZ >= UndoLogBlockHeaderSize);
+       if (context->try_location != InvalidUndoRecPtr)
+       {
+               /*
+                * The try location must be in the log we're attached to, at most one
+                * byte past the end of space backed by files.
+                */
+               UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location);
+
+               Assert(UndoRecPtrGetLogNo(context->try_location) == slot->logno);
+               Assert(try_offset <= slot->meta.end);
+               new_insert = UndoLogOffsetPlusUsableBytes(try_offset, size);
+       }
+       else
+       {
+               new_insert = UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert,
+                                                                                                 size);
+       }
+       Assert(new_insert % BLCKSZ >= UndoLogBlockHeaderSize);
+
+       /*
+        * We don't need to acquire slot->mutex to read slot->meta.insert and
+        * slot->meta.end, because this backend is the only one that can
+        * modify them.
+        */
+       if (unlikely(new_insert > slot->meta.end))
+       {
+               if (new_insert > UndoLogMaxSize)
+               {
+                       /* This undo log is entirely full.  Get a new one. */
+                       if (logxid == GetTopTransactionId())
+                       {
+                               /*
+                                * If the same transaction is split over two undo logs then
+                                * store the previous log number in new log.  See detailed
+                                * comments in undorecord.c file header.
+                                */
+                               *prevlog_xact_start =
+                                       MakeUndoRecPtr(slot->logno,
+                                                                  slot->meta.unlogged.this_xact_start);
+                               *prevlog_insert_urp =
+                                       MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert);
+                       }
+                       elog(DEBUG1, "undo log %u is full, switching to a new one", slot->logno);
+                       slot = NULL;
+                       detach_current_undo_log(context->category, true);
+                       context->try_location = InvalidUndoRecPtr;
+                       goto retry;
+               }
+               /*
+                * Extend the end of this undo log to cover new_insert (in other words
+                * round up to the segment size).
+                */
+               extend_undo_log(slot->logno,
+                                               new_insert + UndoLogSegmentSize -
+                                               new_insert % UndoLogSegmentSize);
+               Assert(new_insert <= slot->meta.end);
+       }
+
+       /*
+        * Create a back-up image of the unlogged part of the undo log's
+        * meta-data, if we haven't already done so since UndoLogBeginInsert() (ie
+        * for the WAL record that this undo allocation will be replayed by).
+        */
+       if (context->num_meta_data_images == 0 ||
+               context->meta_data_images[context->num_meta_data_images - 1].logno != slot->logno)
+       {
+               if (context->num_meta_data_images >= MAX_META_DATA_IMAGES)
+                       elog(ERROR, "too many undo log meta data images");
+               context->meta_data_images[context->num_meta_data_images].logno = slot->logno;
+               context->meta_data_images[context->num_meta_data_images++].data = slot->meta.unlogged;
+       }
+
+       /*
+        * If no try_location was passed in, or if we switched logs, then we'll
+        * return the current insertion point.
+        */
+       if (context->try_location == InvalidUndoRecPtr)
+               context->try_location = MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert);
+
+       /*
+        * Is this location the first in this undo log for a transaction or a
+        * shared record set?
+        */
+       if (context->new_shared_record_set)
+       {
+               context->new_shared_record_set = false;
+               *need_xact_header = true;
+       }
+       else
+       {
+               *need_xact_header =
+                       UndoRecPtrGetOffset(context->try_location) ==
+                       slot->meta.unlogged.this_xact_start;
+       }
+       *last_xact_start =
+               MakeUndoRecPtr(slot->logno, slot->meta.unlogged.last_xact_start);
+
+       return context->try_location;
+}
+
+void
+UndoLogRegister(UndoLogAllocContext *context, uint8 block_id, UndoLogNumber logno)
+{
+       int             i;
+
+       for (i = 0; i < context->num_meta_data_images; ++i)
+       {
+               if (context->meta_data_images[i].logno == logno)
+               {
+                       XLogRegisterBufData(block_id,
+                                                               (char *) &context->meta_data_images[i].data,
+                                                               sizeof(context->meta_data_images[i].data));
+                       return;
+               }
+       }
+}
+
+/*
+ * In recovery, we expect exactly the same sequence of allocation sizes, but
+ * we also need the WAL record that is being replayed so we can figure out
+ * where the undo space was allocated.
+ */
+UndoRecPtr
+UndoLogAllocateInRecovery(UndoLogAllocContext *context,
+                                                 TransactionId xid,
+                                                 uint16 size,
+                                                 bool *need_xact_header,
+                                                 UndoRecPtr *last_xact_start,
+                                                 UndoRecPtr *prevlog_xact_start,
+                                                 UndoRecPtr *prevlog_last_urp)
+{
+       UndoLogSlot *slot;
+
+       Assert(InRecovery);
+
+       /*
+        * Just as in UndoLogAllocate(), the caller may be extending an existing
+        * allocation before committing with UndoLogAdvance().
+        */
+       if (context->try_location != InvalidUndoRecPtr)
+       {
+               /*
+                * The try location must be in the log we're attached to, at most one
+                * byte past the end of space backed by files.
+                */
+               UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location);
+               UndoLogNumber logno = UndoRecPtrGetLogNo(context->try_location);
+
+               /*
+                * You can only have a try_location on your second or later allocation
+                * for a given WAL record.  It had better be in the same log as the
+                * previous allocation for this WAL record (though it may not turn out
+                * to have enough space, below).
+                */
+               Assert(logno == context->recovery_logno);
+
+               /*
+                * Any log extension triggered by UndoLogAllocate() must have been
+                * replayed by now, so we can just check if this log has enough space,
+                * and if so, return.
+                */
+               slot = find_undo_log_slot(logno, false);
+               if (UndoLogOffsetPlusUsableBytes(try_offset, size) <= slot->meta.end)
+               {
+                       *need_xact_header = false;
+                       return context->try_location;
+               }
+
+               /* Full.  Ignore try_location and find the next log that was used. */
+               Assert(slot->meta.status == UNDO_LOG_STATUS_FULL);
+       }
+       else
+       {
+               /*
+                * For now we only support one allocation per WAL record that doesn't
+                * have a try_location (ie the first one).  We'll have to find out
+                * which log was used first.
+                */
+               Assert(context->recovery_logno == InvalidUndoLogNumber);
+       }
+
+       /*
+        * In order to find the undo log that was used by UndoLogAllocate(), we
+        * consult the list of registered blocks to figure out which undo logs
+        * should be written to by this WAL record.
+        */
+       while (context->recovery_block_id <= context->xlog_record->max_block_id)
+       {
+               DecodedBkpBlock *block;
+
+               /* We're looking for the first block referencing a new undo log. */
+               block = &context->xlog_record->blocks[context->recovery_block_id];
+               if (block->rnode.dbNode == UndoDbOid &&
+                       block->rnode.relNode != context->recovery_logno)
+               {
+                       UndoLogNumber logno = block->rnode.relNode;
+                       const void *backup;
+                       size_t backup_size;
+
+                       /* We found a reference to a different (or first) undo log. */
+                       slot = find_undo_log_slot(logno, false);
+
+                       /*
+                        * Since on-line checkpoints capture an inconsistent snapshot of
+                        * undo log meta-data, we'll restore the unlogged part of the
+                        * meta-data image if one was attached to the WAL record (that is,
+                        * the members that don't have WAL records for every change
+                        * already).
+                        */
+                       backup =
+                               XLogRecGetBlockData(context->xlog_record,
+                                                                       context->recovery_block_id,
+                                                                       &backup_size);
+                       if (unlikely(backup))
+                       {
+                               Assert(backup_size == sizeof(UndoLogUnloggedMetaData));
+
+                               /* Restore the unlogged members from the backup image. */
+                               LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+                               memcpy(&slot->meta.unlogged, backup, sizeof(UndoLogUnloggedMetaData));
+                               LWLockRelease(&slot->mutex);
+                       }
+                       else
+                       {
+                               /*
+                                * Otherwise we need to do our own transaction tracking
+                                * whenever we see a new xid, to match the logic in
+                                * UndoLogAllocate().
+                                */
+                               if (xid != slot->meta.unlogged.xid)
+                               {
+                                       slot->meta.unlogged.xid = xid;
+                                       if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert)
+                                               slot->meta.unlogged.last_xact_start =
+                                                       slot->meta.unlogged.this_xact_start;
+                                       slot->meta.unlogged.this_xact_start =
+                                               slot->meta.unlogged.insert;
+                               }
+                       }
+
+                       /* TODO: check locking against undo log slot recycling? */
+
+                       /*
+                        * At this stage we should have an undo log that can handle this
+                        * allocation.  If we don't, something is screwed up.
+                        */
+                       if (UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size) > slot->meta.end)
+                               elog(ERROR,
+                                        "cannot allocate %d bytes in undo log %d",
+                                        (int) size, slot->logno);
+
+                       *need_xact_header =
+                               context->try_location == InvalidUndoRecPtr &&
+                               slot->meta.unlogged.insert == slot->meta.unlogged.this_xact_start;
+                       *last_xact_start =
+                               MakeUndoRecPtr(slot->logno,
+                                                          slot->meta.unlogged.last_xact_start);
+                       context->recovery_logno = slot->logno;
+
+                       /* Read log switch information from meta and reset it. */
+                       *prevlog_xact_start = slot->meta.unlogged.prevlog_xact_start;
+                       *prevlog_last_urp = slot->meta.unlogged.prevlog_last_urp;
+
+                       slot->meta.unlogged.prevlog_xact_start = InvalidUndoRecPtr;
+                       slot->meta.unlogged.prevlog_last_urp = InvalidUndoRecPtr;
+
+                       return MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert);
+               }
+               ++context->recovery_block_id;
+       }
+
+       /*
+        * If we've run out of blocks to inspect, then we must have replayed a
+        * different sequence of allocation sizes, or screwed up the
+        * XLOG_UNDOLOG_EXTEND records, indicating a bug somewhere.
+        */
+       elog(ERROR, "cannot determine undo log to allocate from");
+
+       return 0;               /* not reached */
+}
+
+/*
+ * Advance the insertion pointer in this context by 'size' usable (non-header)
+ * bytes.  This is the next place we'll try to allocate a record, if it fits.
+ * This is not committed to shared memory until after we've WAL-logged the
+ * record and UndoLogAdvanceFinal() is called.
+ */
+void
+UndoLogAdvance(UndoLogAllocContext *context, size_t size)
+{
+       context->try_location = UndoLogOffsetPlusUsableBytes(context->try_location,
+                                                                                                                size);
+}
+
+/*
+ * Advance the insertion pointer to 'size' usable (non-header) bytes past
+ * insertion_point.
+ */
+void
+UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size)
+{
+       UndoLogSlot *slot = NULL;
+       UndoLogNumber   logno = UndoRecPtrGetLogNo(insertion_point);
+
+       slot = find_undo_log_slot(logno, false);
+
+       /*
+        * Either we're in recovery, or this is a log we are currently attached
+        * to, or one we recently detached from because it was full.
+        */
+       Assert(InRecovery ||
+                  AmAttachedToUndoLogSlot(slot) ||
+                  slot->meta.status == UNDO_LOG_STATUS_FULL);
+
+       /*
+        * The caller has the current insertion point, as returned by
+        * UndoLogAllocate[InRecovery]().
+        */
+       Assert(UndoRecPtrGetOffset(insertion_point) == slot->meta.unlogged.insert);
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->meta.unlogged.insert =
+               UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size);
+       LWLockRelease(&slot->mutex);
+}
+
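+/*
+ * Putting the allocation interfaces together, a client's insertion protocol
+ * might look like this sketch (hypothetical caller; the category value and
+ * the surrounding buffer/WAL work are assumptions):
+ *
+ *     UndoLogAllocContext ctx;
+ *     UndoRecPtr ptr;
+ *
+ *     UndoLogBeginInsert(&ctx, category, NULL);
+ *     ptr = UndoLogAllocate(&ctx, size, &need_xact_header, &last_xact_start,
+ *                           &prevlog_xact_start, &prevlog_insert_urp);
+ *     ... write the record into buffers at ptr, register them for WAL ...
+ *     UndoLogAdvance(&ctx, size);
+ *     ... XLogInsert() the WAL record ...
+ *     UndoLogAdvanceFinal(ptr, size);
+ */
+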
+/*
+ * Advance the discard pointer in one undo log, discarding all undo data
+ * relating to one or more whole transactions.  The passed-in undo pointer is
+ * the address of the oldest data that the caller would like to keep, and the
+ * affected undo log is implied by this pointer, ie
+ * UndoRecPtrGetLogNo(discard_pointer).
+ *
+ * The caller asserts that there will be no attempts to access the undo log
+ * region being discarded after this moment.  This operation will cause the
+ * relevant buffers to be dropped immediately, without writing any data out to
+ * disk.  Any attempt to read the buffers (except a partial buffer at the end
+ * of this range which will remain) may result in IO errors, because the
+ * underlying segment file may have been physically removed.
+ *
+ * Return true if the discard point was updated, and false if nothing was done
+ * because the log preceding the given point was already discarded.
+ *
+ * TODO: The return value is not yet reliable and the code still doesn't work
+ * correctly if called for the same undo log in two backends; more
+ * interlocking work required here.
+ */
+bool
+UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid)
+{
+       UndoLogNumber logno = UndoRecPtrGetLogNo(discard_point);
+       UndoLogOffset discard = UndoRecPtrGetOffset(discard_point);
+       UndoLogOffset old_discard;
+       UndoLogOffset end;
+       UndoLogSlot *slot;
+       int                     segno;
+       int                     new_segno;
+       bool            need_to_flush_wal = false;
+       bool            entirely_discarded = false;
+
+       slot = find_undo_log_slot(logno, false);
+       if (unlikely(slot == NULL))
+       {
+               /* Already discarded (entirely). */
+               return false;
+       }
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       if (unlikely(slot->logno != logno || discard <= slot->meta.discard))
+       {
+               /*
+                * Already discarded, either entirely (in which case the slot has
+                * been recycled) or at least up to this point.
+                */
+               LWLockRelease(&slot->mutex);
+               return false;
+       }
+       if (discard > slot->meta.unlogged.insert)
+               elog(ERROR, "cannot move discard point past insert point");
+       old_discard = slot->meta.discard;
+       end = slot->meta.end;
+       /* Are we discarding the last remaining data in a log marked as full? */
+       if (slot->meta.status == UNDO_LOG_STATUS_FULL &&
+               discard == slot->meta.unlogged.insert)
+       {
+               /*
+                * Adjust the discard and insert pointers so that the final segment is
+                * deleted from disk, and remember not to recycle it.
+                */
+               entirely_discarded = true;
+               /* TODO: Check if the following line is replayed correctly */
+               slot->meta.unlogged.insert = slot->meta.end;
+               discard = slot->meta.end;
+       }
+       LWLockRelease(&slot->mutex);
+
+       /*
+        * TODO: I think we need a new lock just for this phase, so that buffer
+        * dropping and IO are done by only one backend if a superuser command and
+        * a discard worker both run this!
+        */
+
+       /*
+        * Drop all buffers holding this undo data out of the buffer pool (except
+        * the last one, if the new location is in the middle of it somewhere), so
+        * that the contained data doesn't ever touch the disk.  The caller
+        * promises that this data will not be needed again.  We have to drop the
+        * buffers from the buffer pool before removing files, otherwise a
+        * concurrent session might try to write the block to evict the buffer.
+        */
+       forget_undo_buffers(logno, old_discard, discard, entirely_discarded);
+
+       /*
+        * Check if we crossed a segment boundary and need to do some synchronous
+        * filesystem operations.
+        */
+       segno = old_discard / UndoLogSegmentSize;
+       new_segno = discard / UndoLogSegmentSize;
+       if (segno < new_segno)
+       {
+               int             recycle;
+               UndoLogOffset pointer;
+
+               /*
+                * We always WAL-log discards, but we only need to flush the WAL if we
+                * have performed a filesystem operation.
+                */
+               need_to_flush_wal = true;
+
+               /*
+                * XXX When we rename or unlink a file, it's possible that some
+                * backend still has it open because it has recently read a page from
+                * it.  smgr/undofile.c in any such backend will eventually close it,
+                * because it considers that fd to belong to the file with the name
+                * that we're unlinking or renaming and it doesn't like to keep more
+                * than one open at a time.  No backend should ever try to read from
+                * such a file descriptor; that is what it means when we say that the
+                * caller of UndoLogDiscard() asserts that there will be no attempts
+                * to access the discarded range of undo log.  In the case of a
+                * rename, if a backend were to attempt to read undo data in the range
+                * being discarded, it would read entirely the wrong data.
+                */
+
+               /*
+                * How many segments should we recycle (= rename from tail position to
+                * head position)?  For now it's always 1 unless there is already a
+                * spare one, but we could have an adaptive algorithm that recycles
+                * multiple segments at a time and pays just one fsync().
+                */
+               LWLockAcquire(&slot->mutex, LW_SHARED);
+               if ((slot->meta.end - slot->meta.unlogged.insert) < UndoLogSegmentSize &&
+                       slot->meta.status == UNDO_LOG_STATUS_ACTIVE)
+                       recycle = 1;
+               else
+                       recycle = 0;
+               LWLockRelease(&slot->mutex);
+
+               /* Rewind to the start of the segment. */
+               pointer = segno * UndoLogSegmentSize;
+
+               while (pointer < new_segno * UndoLogSegmentSize)
+               {
+                       char    discard_path[MAXPGPATH];
+
+                       /* Tell the checkpointer that the file is going away. */
+                       undofile_forget_sync(logno, pointer / UndoLogSegmentSize,
+                                                                slot->meta.tablespace);
+
+                       UndoLogSegmentPath(logno, pointer / UndoLogSegmentSize,
+                                                          slot->meta.tablespace, discard_path);
+
+                       /* Can we recycle the oldest segment? */
+                       if (recycle > 0)
+                       {
+                               char    recycle_path[MAXPGPATH];
+
+                               /*
+                                * End points one byte past the end of the current undo space,
+                                * ie to the first byte of the segment file we want to create.
+                                */
+                               UndoLogSegmentPath(logno, end / UndoLogSegmentSize,
+                                                                  slot->meta.tablespace, recycle_path);
+                               if (rename(discard_path, recycle_path) == 0)
+                               {
+                                       elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"",
+                                                discard_path, recycle_path);
+                                       end += UndoLogSegmentSize;
+                                       --recycle;
+                               }
+                               else
+                               {
+                                       elog(ERROR, "could not rename \"%s\" to \"%s\": %m",
+                                                discard_path, recycle_path);
+                               }
+                       }
+                       else
+                       {
+                               if (unlink(discard_path) == 0)
+                                       elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path);
+                               else
+                                       elog(ERROR, "could not unlink \"%s\": %m", discard_path);
+                       }
+                       pointer += UndoLogSegmentSize;
+               }
+       }
+
+       /* WAL log the discard. */
+       {
+               xl_undolog_discard xlrec;
+               XLogRecPtr ptr;
+
+               xlrec.logno = logno;
+               xlrec.discard = discard;
+               xlrec.end = end;
+               xlrec.latestxid = xid;
+               xlrec.entirely_discarded = entirely_discarded;
+
+               XLogBeginInsert();
+               XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+               ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_DISCARD);
+
+               if (need_to_flush_wal)
+                       XLogFlush(ptr);
+       }
+
+       /* Update shmem to show the new discard and end pointers. */
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->meta.discard = discard;
+       slot->meta.end = end;
+       LWLockRelease(&slot->mutex);
+
+       /* If we discarded everything, the slot can be given up. */
+       if (entirely_discarded)
+               free_undo_log_slot(slot);
+
+       return true;
+}
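+
+/*
+ * A minimal usage sketch for UndoLogDiscard() (illustrative; the variables
+ * are hypothetical).  The caller must guarantee that the discarded range
+ * will never be read again:
+ *
+ *     UndoRecPtr oldest_data_to_keep = ...;
+ *     TransactionId latest_xid = ...;    latest xid whose undo is discarded
+ *
+ *     if (UndoLogDiscard(oldest_data_to_keep, latest_xid))
+ *         elog(DEBUG1, "discard point advanced");
+ */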
+
+/*
+ * Return an UndoRecPtr to the oldest valid data in an undo log, or
+ * InvalidUndoRecPtr if it is empty.
+ */
+UndoRecPtr
+UndoLogGetOldestRecord(UndoLogNumber logno, bool *full)
+{
+       UndoLogSlot *slot;
+       UndoRecPtr      result;
+
+       /* Try to find the slot for this undo log number. */
+       slot = find_undo_log_slot(logno, false);
+       if (slot == NULL)
+       {
+               /* It's unknown to us, so we assume it's been entirely discarded. */
+               if (full)
+                       *full = true;
+               return InvalidUndoRecPtr;
+       }
+
+       LWLockAcquire(&slot->mutex, LW_SHARED);
+       if (slot->logno != logno)
+       {
+               /* It's been recycled, so it must have been entirely discarded. */
+               result = InvalidUndoRecPtr;
+               if (full)
+                       *full = true;
+       }
+       else if (slot->meta.discard == slot->meta.unlogged.insert)
+       {
+               /* It's empty, so there is no oldest record pointer to return. */
+               result = InvalidUndoRecPtr;
+               if (full)
+                       *full = slot->meta.status == UNDO_LOG_STATUS_FULL;
+       }
+       else
+       {
+               /* There is a record here! */
+               result = MakeUndoRecPtr(slot->logno, slot->meta.discard);
+               if (full)
+                       *full = slot->meta.status == UNDO_LOG_STATUS_FULL;
+       }
+       LWLockRelease(&slot->mutex);
+
+       return result;
+}
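+
+/*
+ * Example use (illustrative):
+ *
+ *     bool       full;
+ *     UndoRecPtr oldest = UndoLogGetOldestRecord(logno, &full);
+ *
+ *     if (oldest != InvalidUndoRecPtr)
+ *         ... start reading undo records at 'oldest' ...
+ *     else if (full)
+ *         ... the log is empty and marked full, so it will never grow ...
+ */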
+
+/*
+ * UndoLogSwitchSetPrevLogInfo - Store the previous undo log's info at log
+ * switch time, and WAL-log the same.
+ */
+void
+UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno, UndoRecPtr prevlog_xact_start,
+                                                       UndoRecPtr prevlog_last_urp)
+{
+       UndoLogSlot *slot;
+
+       slot = find_undo_log_slot(logno, false);
+
+       /*
+        * This is called only while attached to the undo log, so the slot
+        * cannot be recycled underneath us.
+        */
+       Assert(AmAttachedToUndoLogSlot(slot));
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->meta.unlogged.prevlog_xact_start = prevlog_xact_start;
+       slot->meta.unlogged.prevlog_last_urp = prevlog_last_urp;
+       LWLockRelease(&slot->mutex);
+
+       /* WAL-log the log switch. */
+       {
+               xl_undolog_switch xlrec;
+
+               xlrec.logno = logno;
+               xlrec.prevlog_xact_start = prevlog_xact_start;
+               xlrec.prevlog_last_urp = prevlog_last_urp;
+
+               XLogBeginInsert();
+               XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+               XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_SWITCH);
+       }
+}
+
+/*
+ * Return the next insert location.
+ */
+UndoRecPtr
+UndoLogGetNextInsertPtr(UndoLogNumber logno)
+{
+       UndoLogSlot *slot = find_undo_log_slot(logno, false);
+       UndoRecPtr      insert;
+
+       LWLockAcquire(&slot->mutex, LW_SHARED);
+       /* TODO: what if the slot has been recycled? */
+       insert = slot->meta.unlogged.insert;
+       LWLockRelease(&slot->mutex);
+
+       return MakeUndoRecPtr(logno, insert);
+}
+
+/*
+ * Delete unreachable files under pg_undo.  Any files corresponding to LSN
+ * positions before the previous checkpoint are no longer needed.
+ */
+static void
+CleanUpUndoCheckPointFiles(XLogRecPtr checkPointRedo)
+{
+       DIR        *dir;
+       struct dirent *de;
+       char    path[MAXPGPATH];
+       char    oldest_path[MAXPGPATH];
+
+       /*
+        * If a base backup is in progress, we can't delete any checkpoint
+        * snapshot files because one of them corresponds to the backup label but
+        * there could be any number of checkpoints during the backup.
+        */
+       if (BackupInProgress())
+               return;
+
+       /* Otherwise keep only those >= the previous checkpoint's redo point. */
+       snprintf(oldest_path, MAXPGPATH, "%016" INT64_MODIFIER "X",
+                        checkPointRedo);
+       dir = AllocateDir("pg_undo");
+       while ((de = ReadDir(dir, "pg_undo")) != NULL)
+       {
+               /*
+                * Assume that fixed width uppercase hex strings sort the same way as
+                * the values they represent, so we can use strcmp to identify undo
+                * log snapshot files corresponding to checkpoints that we don't need
+                * anymore.  This assumption holds for ASCII.
+                */
+               if (strlen(de->d_name) != UNDO_CHECKPOINT_FILENAME_LENGTH)
+                       continue;
+
+               if (UndoCheckPointFilenamePrecedes(de->d_name, oldest_path))
+               {
+                       snprintf(path, MAXPGPATH, "pg_undo/%s", de->d_name);
+                       if (unlink(path) != 0)
+                               elog(ERROR, "could not unlink file \"%s\": %m", path);
+               }
+       }
+       FreeDir(dir);
+}
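+
+/*
+ * For example (hypothetical values): if the previous checkpoint's redo point
+ * was 0x0000000001000028, oldest_path is "0000000001000028"; a leftover file
+ * named "0000000000FFFF00" compares lower with strcmp() and is unlinked,
+ * while "0000000001000028" itself and anything later are kept.
+ */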
+
+/*
+ * Write out the undo log meta data to the pg_undo directory.  The actual
+ * contents of undo logs is in shared buffers and therefore handled by
+ * CheckPointBuffers(), but here we record the table of undo logs and their
+ * properties.
+ */
+void
+CheckPointUndoLogs(XLogRecPtr checkPointRedo, XLogRecPtr priorCheckPointRedo)
+{
+       UndoLogMetaData *serialized = NULL;
+       size_t  serialized_size = 0;
+       char   *data;
+       char    path[MAXPGPATH];
+       UndoLogNumber num_logs;
+       int             fd;
+       int             i;
+       pg_crc32c crc;
+
+       /*
+        * We acquire UndoLogLock to prevent any undo logs from being created or
+        * discarded while we build a snapshot of them.  This isn't expected to
+        * take long on a healthy system because the number of active logs should
+        * be around the number of backends.  Holding this lock won't prevent
+        * concurrent access to the undo log, except when segments need to be
+        * added or removed.
+        */
+       LWLockAcquire(UndoLogLock, LW_SHARED);
+
+       /*
+        * Rather than doing the file IO while we hold locks, we'll copy the
+        * meta-data into a palloc'd buffer.
+        */
+       serialized_size = sizeof(UndoLogMetaData) * UndoLogNumSlots();
+       serialized = (UndoLogMetaData *) palloc0(serialized_size);
+
+       /* Scan through all slots looking for non-empty ones. */
+       num_logs = 0;
+       for (i = 0; i < UndoLogNumSlots(); ++i)
+       {
+               UndoLogSlot *slot = &UndoLogShared->slots[i];
+
+               /* Skip empty slots. */
+               if (slot->logno == InvalidUndoLogNumber)
+                       continue;
+
+               /* Capture snapshot while holding each mutex. */
+               LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+               serialized[num_logs++] = slot->meta;
+               LWLockRelease(&slot->mutex);
+       }
+
+       LWLockRelease(UndoLogLock);
+
+       /* Dump into a file under pg_undo. */
+       snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X",
+                        checkPointRedo);
+       pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_WRITE);
+       fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
+       if (fd < 0)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not create file \"%s\": %m", path)));
+
+       /* Compute header checksum. */
+       INIT_CRC32C(crc);
+       COMP_CRC32C(crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno));
+       COMP_CRC32C(crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno));
+       COMP_CRC32C(crc, &num_logs, sizeof(num_logs));
+       FIN_CRC32C(crc);
+
+       /* Write out the number of active logs + crc. */
+       if ((write(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)) != sizeof(UndoLogShared->low_logno)) ||
+               (write(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)) != sizeof(UndoLogShared->next_logno)) ||
+               (write(fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) ||
+               (write(fd, &crc, sizeof(crc)) != sizeof(crc)))
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not write to file \"%s\": %m", path)));
+
+       /* Write out the meta data for all active undo logs. */
+       data = (char *) serialized;
+       INIT_CRC32C(crc);
+       serialized_size = num_logs * sizeof(UndoLogMetaData);
+       while (serialized_size > 0)
+       {
+               ssize_t written;
+
+               written = write(fd, data, serialized_size);
+               if (written < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not write to file \"%s\": %m", path)));
+               COMP_CRC32C(crc, data, written);
+               serialized_size -= written;
+               data += written;
+       }
+       FIN_CRC32C(crc);
+
+       if (write(fd, &crc, sizeof(crc)) != sizeof(crc))
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not write to file \"%s\": %m", path)));
+
+       /* Flush file and directory entry. */
+       pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_SYNC);
+       if (pg_fsync(fd) != 0)
+               ereport(data_sync_elevel(ERROR),
+                               (errcode_for_file_access(),
+                                errmsg("could not fsync file \"%s\": %m", path)));
+       if (CloseTransientFile(fd) < 0)
+               ereport(data_sync_elevel(ERROR),
+                               (errcode_for_file_access(),
+                                errmsg("could not close file \"%s\": %m", path)));
+       fsync_fname("pg_undo", true);
+       pgstat_report_wait_end();
+
+       if (serialized)
+               pfree(serialized);
+
+       CleanUpUndoCheckPointFiles(priorCheckPointRedo);
+}
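+
+/*
+ * The resulting pg_undo snapshot file layout, as written above:
+ *
+ *     low_logno | next_logno | num_logs | CRC     (fixed-size header)
+ *     UndoLogMetaData x num_logs                  (body)
+ *     CRC                                         (body checksum)
+ */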
+
+void
+StartupUndoLogs(XLogRecPtr checkPointRedo)
+{
+       char    path[MAXPGPATH];
+       int             i;
+       int             fd;
+       int             nlogs;
+       pg_crc32c crc;
+       pg_crc32c new_crc;
+
+       /* If initdb is calling, there is no file to read yet. */
+       if (IsBootstrapProcessingMode())
+               return;
+
+       /* Open the pg_undo file corresponding to the given checkpoint. */
+       snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X",
+                        checkPointRedo);
+       pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_READ);
+       fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+       if (fd < 0)
+               elog(ERROR, "could not open undo checkpoint snapshot \"%s\": %m", path);
+
+       /* Read the active log number range. */
+       if ((read(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno))
+                != sizeof(UndoLogShared->low_logno)) ||
+               (read(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno))
+                != sizeof(UndoLogShared->next_logno)) ||
+               (read(fd, &nlogs, sizeof(nlogs)) != sizeof(nlogs)) ||
+               (read(fd, &crc, sizeof(crc)) != sizeof(crc)))
+               elog(ERROR, "pg_undo file \"%s\" is corrupted", path);
+
+       /* Verify the header checksum. */
+       INIT_CRC32C(new_crc);
+       COMP_CRC32C(new_crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno));
+       COMP_CRC32C(new_crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno));
+       COMP_CRC32C(new_crc, &nlogs, sizeof(nlogs));
+       FIN_CRC32C(new_crc);
+
+       if (crc != new_crc)
+               elog(ERROR,
+                        "pg_undo file \"%s\" has incorrect checksum", path);
+
+       /*
+        * We'll acquire UndoLogLock just because allocate_undo_log() asserts we
+        * hold it (we don't actually expect concurrent access yet).
+        */
+       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+
+       /* Initialize all the logs and set up the freelist. */
+       INIT_CRC32C(new_crc);
+       for (i = 0; i < nlogs; ++i)
+       {
+               ssize_t size;
+               UndoLogSlot *slot;
+
+               /*
+                * Get a new UndoLogSlot.  If this checkpoint was created on a system
+                * with a higher max_connections setting, it's theoretically possible
+                * that we don't have enough space and cannot start up.
+                */
+               slot = allocate_undo_log_slot();
+               if (!slot)
+                       ereport(ERROR,
+                                       (errmsg("not enough undo log slots to recover from checkpoint: need at least %d, have %zu",
+                                                       nlogs, UndoLogNumSlots()),
+                                        errhint("Consider increasing max_connections.")));
+
+               /* Read in the meta data for this undo log. */
+               if ((size = read(fd, &slot->meta, sizeof(slot->meta))) != sizeof(slot->meta))
+                       elog(ERROR, "short read of pg_undo meta data in file \"%s\": %m (got %zd, wanted %zu)",
+                                path, size, sizeof(slot->meta));
+               COMP_CRC32C(new_crc, &slot->meta, sizeof(slot->meta));
+
+               /*
+                * At normal start-up, or during recovery, all active undo logs start
+                * out on the appropriate free list.
+                */
+               slot->logno = slot->meta.logno;
+               slot->pid = InvalidPid;
+               slot->oldest_data = MakeUndoRecPtr(slot->logno, slot->meta.discard);
+               if (slot->meta.status == UNDO_LOG_STATUS_ACTIVE)
+               {
+                       slot->next_free = UndoLogShared->free_lists[slot->meta.category];
+                       UndoLogShared->free_lists[slot->meta.category] = slot->logno;
+               }
+       }
+       FIN_CRC32C(new_crc);
+
+       LWLockRelease(UndoLogLock);
+
+       /* Verify body checksum. */
+       if (read(fd, &crc, sizeof(crc)) != sizeof(crc))
+               elog(ERROR, "pg_undo file \"%s\" is corrupted", path);
+       if (crc != new_crc)
+               elog(ERROR,
+                        "pg_undo file \"%s\" has incorrect checksum", path);
+
+       CloseTransientFile(fd);
+       pgstat_report_wait_end();
+}
+
+/*
+ * Allocate a new UndoLogSlot object.
+ */
+static UndoLogSlot *
+allocate_undo_log_slot(void)
+{
+       UndoLogSlot *slot;
+       UndoLogNumber i;
+
+       Assert(LWLockHeldByMeInMode(UndoLogLock, LW_EXCLUSIVE));
+
+       for (i = 0; i < UndoLogNumSlots(); ++i)
+       {
+               slot = &UndoLogShared->slots[i];
+               if (slot->logno == InvalidUndoLogNumber)
+               {
+                       memset(&slot->meta, 0, sizeof(slot->meta));
+                       slot->pid = InvalidPid;
+                       slot->wait_fxmin = InvalidFullTransactionId;
+                       slot->oldest_data = 0;
+                       slot->next_free = InvalidUndoLogNumber;
+                       slot->logno = InvalidUndoLogNumber;
+                       return slot;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+ * Free an UndoLogSlot object in shared memory, so that it can be reused.
+ * This is a rare event, and has complications for all code paths that access
+ * slots.  Unless the current session is attached to the slot, it must be
+ * prepared for it to be freed and then potentially recycled for use by
+ * another log.  See UndoLogGetSlot().
+ */
+static void
+free_undo_log_slot(UndoLogSlot *slot)
+{
+       /*
+        * When removing an undo log from a slot in shared memory, we acquire
+        * UndoLogLock, log->mutex and log->discard_lock, so that other code can
+        * hold any one of those locks to prevent the slot from being recycled.
+        */
+       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       Assert(slot->logno != InvalidUndoLogNumber);
+       slot->logno = InvalidUndoLogNumber;
+       memset(&slot->meta, 0, sizeof(slot->meta));
+       LWLockRelease(&slot->mutex);
+       LWLockRelease(UndoLogLock);
+}
+
+/*
+ * Find the UndoLogSlot object for a given log number.
+ *
+ * The caller may or may not already hold UndoLogLock, and should indicate
+ * this by passing 'locked'.  We'll acquire it in the slow path if necessary.
+ * If it is not held by the caller, the caller must deal with the possibility
+ * that the returned UndoLogSlot no longer contains the requested logno by the
+ * time it is accessed.
+ *
+ * To do that, one of the following approaches must be taken by the calling
+ * code:
+ *
+ * 1.  If the calling code knows that it is attached to this undo log or is
+ * the recovery process, then there is no way for the slot to be recycled, so
+ * it's not necessary to check that the log number hasn't changed.  The slot
+ * cannot be recycled while a backend is attached.  It should probably assert
+ * that it is attached, however.
+ *
+ * 2.  All other code should acquire log->mutex before accessing any members,
+ * and after doing so, check that the logno hasn't changed.  If it has, the
+ * entire undo log must be assumed to be discarded (as if this function
+ * returned NULL) and the caller must behave accordingly; a sketch of this
+ * pattern appears after the function body below.
+ *
+ * Return NULL if the undo log has been entirely discarded.  It is an error to
+ * ask for undo logs that have never been created.
+ */
+static UndoLogSlot *
+find_undo_log_slot(UndoLogNumber logno, bool locked)
+{
+       UndoLogSlot *result = NULL;
+       UndoLogTableEntry *entry;
+       bool       found;
+
+       Assert(locked == LWLockHeldByMe(UndoLogLock));
+
+       /* First see if we already have it in our cache. */
+       entry = undologtable_lookup(undologtable_cache, logno);
+       if (likely(entry))
+               result = entry->slot;
+       else
+       {
+               UndoLogNumber i;
+
+               /* Nope.  Linear search for the slot in shared memory. */
+               if (!locked)
+                       LWLockAcquire(UndoLogLock, LW_SHARED);
+               for (i = 0; i < UndoLogNumSlots(); ++i)
+               {
+                       if (UndoLogShared->slots[i].logno == logno)
+                       {
+                               /* Found it. */
+
+                               /*
+                                * TODO: Should this function be usable in a critical section?
+                                * Would it make sense to detect that we are in a critical
+                                * section and just return the pointer to the log without
+                                * updating the cache, to avoid any chance of allocating
+                                * memory?
+                                */
+
+                               entry = undologtable_insert(undologtable_cache, logno, &found);
+                               entry->number = logno;
+                               entry->slot = &UndoLogShared->slots[i];
+                               entry->tablespace = entry->slot->meta.tablespace;
+                               entry->category = entry->slot->meta.category;
+                               entry->recent_discard =
+                                       MakeUndoRecPtr(logno, entry->slot->meta.discard);
+                               result = entry->slot;
+                               break;
+                       }
+               }
+
+               /*
+                * If we didn't find it, then it must already have been entirely
+                * discarded.  We create a negative cache entry so that we can answer
+                * this question quickly next time.
+                *
+                * TODO: We could track the lowest known undo log number, to reduce
+                * the negative cache entry bloat.
+                */
+               if (result == NULL)
+               {
+                       /*
+                        * Sanity check: the caller should not be asking about undo logs
+                        * that have never existed.
+                        */
+                       if (logno >= UndoLogShared->next_logno)
+                               elog(ERROR, "undo log %u hasn't been created yet", logno);
+                       entry = undologtable_insert(undologtable_cache, logno, &found);
+                       entry->number = logno;
+                       entry->slot = NULL;
+                       entry->tablespace = 0;
+               }
+               if (!locked)
+                       LWLockRelease(UndoLogLock);
+       }
+
+       return result;
+}
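+
+/*
+ * A sketch of approach 2 above (illustrative only):
+ *
+ *     UndoLogSlot *slot = find_undo_log_slot(logno, false);
+ *
+ *     if (slot != NULL)
+ *     {
+ *         LWLockAcquire(&slot->mutex, LW_SHARED);
+ *         if (slot->logno == logno)
+ *             ... safe to read slot->meta here ...
+ *         else
+ *             ... slot was recycled; treat the log as entirely discarded ...
+ *         LWLockRelease(&slot->mutex);
+ *     }
+ */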
+
+/*
+ * Get a pointer to an UndoLogSlot object corresponding to a given logno.
+ *
+ * In general, the caller must acquire the UndoLogSlot's mutex to access
+ * the contents, and at that time must consider that the logno might have
+ * changed because the undo log it contained has been entirely discarded.
+ *
+ * If the calling backend is currently attached to the undo log, that is not
+ * possible, because logs can only reach UNDO_LOG_STATUS_DISCARDED after first
+ * reaching UNDO_LOG_STATUS_FULL, and that only happens while detaching.
+ */
+UndoLogSlot *
+UndoLogGetSlot(UndoLogNumber logno, bool missing_ok)
+{
+       UndoLogSlot *slot = find_undo_log_slot(logno, false);
+
+       if (slot == NULL && !missing_ok)
+               elog(ERROR, "unknown undo log number %d", logno);
+
+       return slot;
+}
+
+/*
+ * Attach to a free undo log, creating a new one if required.
+ */
+static void
+attach_undo_log(UndoLogCategory category, Oid tablespace)
+{
+       UndoLogSlot *slot = NULL;
+       UndoLogNumber logno;
+       UndoLogNumber *place;
+
+       Assert(!InRecovery);
+       Assert(CurrentSession->attached_undo_slots[category] == NULL);
+
+       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+
+       /*
+        * For now we have a simple linked list of unattached undo logs for each
+        * persistence level.  We'll grovel through it to find something for the
+        * tablespace you asked for.  If you're not using multiple tablespaces
+        * it'll be able to pop one off the front.  We might need a hash table
+        * keyed by tablespace if this simple scheme turns out to be too slow when
+        * using many tablespaces and many undo logs, but that seems like an
+        * unusual use case not worth optimizing for.
+        */
+       place = &UndoLogShared->free_lists[category];
+       while (*place != InvalidUndoLogNumber)
+       {
+               UndoLogSlot *candidate = find_undo_log_slot(*place, true);
+
+               /*
+                * There should never be an undo log on the freelist that has been
+                * entirely discarded, or hasn't been created yet.  The persistence
+                * level should match the freelist.
+                */
+               if (unlikely(candidate == NULL))
+                       elog(ERROR,
+                                "corrupted undo log freelist, no such undo log %u", *place);
+               if (unlikely(candidate->meta.category != category))
+                       elog(ERROR,
+                                "corrupted undo log freelist, undo log %u with persistence %d found on freelist %d",
+                                *place, candidate->meta.category, category);
+
+               if (candidate->meta.tablespace == tablespace)
+               {
+                       logno = *place;
+                       slot = candidate;
+                       *place = candidate->next_free;
+                       break;
+               }
+               place = &candidate->next_free;
+       }
+
+       /*
+        * If all existing undo logs for this tablespace and persistence level are
+        * busy, we'll have to create a new one.
+        */
+       if (slot == NULL)
+       {
+               if (UndoLogShared->next_logno > MaxUndoLogNumber)
+               {
+                       /*
+                        * You've used up all 16 exabytes of undo log addressing space.
+                        * This is a difficult state to reach using only 16 exabytes of
+                        * WAL.
+                        */
+                       elog(ERROR, "undo log address space exhausted");
+               }
+
+               /* Allocate a slot from the UndoLogSlot pool. */
+               slot = allocate_undo_log_slot();
+               if (unlikely(!slot))
+                       ereport(ERROR,
+                                       (errmsg("could not create new undo log"),
+                                        errdetail("The maximum number of active undo logs is %zu.",
+                                                          UndoLogNumSlots()),
+                                        errhint("Consider increasing max_connections.")));
+               slot->logno = logno = UndoLogShared->next_logno;
+
+               /*
+                * The insert and discard pointers start after the first block's
+                * header.  XXX That means that insert is > end for a short time in a
+                * newly created undo log.  Is there any problem with that?
+                */
+               slot->meta.unlogged.insert = UndoLogBlockHeaderSize;
+               slot->meta.discard = UndoLogBlockHeaderSize;
+
+               slot->meta.logno = logno;
+               slot->meta.tablespace = tablespace;
+               slot->meta.category = category;
+               slot->meta.status = UNDO_LOG_STATUS_ACTIVE;
+
+               /* Move the high log number pointer past this one. */
+               ++UndoLogShared->next_logno;
+
+               /* WAL-log the creation of this new undo log. */
+               {
+                       xl_undolog_create xlrec;
+
+                       xlrec.logno = logno;
+                       xlrec.tablespace = slot->meta.tablespace;
+                       xlrec.category = slot->meta.category;
+
+                       XLogBeginInsert();
+                       XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+                       XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_CREATE);
+               }
+
+               /*
+                * This undo log has no segments.  UndoLogAllocate will create the
+                * first one on demand.
+                */
+       }
+       LWLockRelease(UndoLogLock);
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->pid = MyProcPid;
+       LWLockRelease(&slot->mutex);
+
+       CurrentSession->attached_undo_slots[category] = slot;
+}
+
+/* check_hook: validate new undo_tablespaces */
+bool
+check_undo_tablespaces(char **newval, void **extra, GucSource source)
+{
+       char       *rawname;
+       List       *namelist;
+
+       /* Need a modifiable copy of string */
+       rawname = pstrdup(*newval);
+
+       /*
+        * Parse string into list of identifiers, just to check for
+        * well-formedness (unfortunately we can't validate the names in the
+        * catalog yet).
+        */
+       if (!SplitIdentifierString(rawname, ',', &namelist))
+       {
+               /* syntax error in name list */
+               GUC_check_errdetail("List syntax is invalid.");
+               pfree(rawname);
+               list_free(namelist);
+               return false;
+       }
+
+       /*
+        * Make sure we aren't already in a transaction that has been assigned an
+        * XID.  This ensures we don't detach from an undo log that we might have
+        * started writing undo data into for this transaction.
+        */
+       if (GetTopTransactionIdIfAny() != InvalidTransactionId)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                (errmsg("undo_tablespaces cannot be changed while a transaction is in progress"))));
+       list_free(namelist);
+
+       return true;
+}
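+
+/*
+ * Example of a setting this hook accepts (illustrative):
+ *
+ *     SET undo_tablespaces = 'undo_ts1, undo_ts2';
+ *
+ * Name resolution against the catalogs happens later, in
+ * choose_undo_tablespace().
+ */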
+
+/* assign_hook: do extra actions as needed */
+void
+assign_undo_tablespaces(const char *newval, void *extra)
+{
+       /*
+        * This is normally called only when GetTopTransactionIdIfAny() ==
+        * InvalidTransactionId (because you can't change undo_tablespaces in the
+        * middle of a transaction that's been assigned an xid), but we can't
+        * assert that because it's also called at the end of a transaction that's
+        * rolling back, to reset the GUC if it was set inside the transaction.
+        */
+
+       /* Tell UndoLogAllocate() to reexamine undo_tablespaces. */
+       if (CurrentSession)
+               CurrentSession->need_to_choose_undo_tablespace = true;
+}
+
+static bool
+choose_undo_tablespace(bool force_detach, Oid *tablespace)
+{
+       char   *rawname;
+       List   *namelist;
+       bool    need_to_unlock;
+       int             length;
+       int             i;
+
+       /* We need a modifiable copy of string. */
+       rawname = pstrdup(undo_tablespaces);
+
+       /* Break string into list of identifiers. */
+       if (!SplitIdentifierString(rawname, ',', &namelist))
+               elog(ERROR, "undo_tablespaces is unexpectedly malformed");
+
+       length = list_length(namelist);
+       if (length == 0 ||
+               (length == 1 && ((char *) linitial(namelist))[0] == '\0'))
+       {
+               /*
+                * If it's an empty string, then we'll use the default tablespace.  No
+                * locking is required because it can't be dropped.
+                */
+               *tablespace = DEFAULTTABLESPACE_OID;
+               need_to_unlock = false;
+       }
+       else
+       {
+               /*
+                * Choose an OID using our pid, so that if several backends have the
+                * same multi-tablespace setting they'll spread out.  We could easily
+                * do better than this if more serious load balancing is judged
+                * useful.
+                */
+               int             index = MyProcPid % length;
+               int             first_index = index;
+               Oid             oid = InvalidOid;
+
+               /*
+                * Take the tablespace create/drop lock while we look the name up.
+                * This prevents the tablespace from being dropped while we're trying
+                * to resolve the name, or while the caller is trying to create an
+                * undo log in it.  The caller will have to release this lock.
+                */
+               LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
+               for (;;)
+               {
+                       const char *name = list_nth(namelist, index);
+
+                       oid = get_tablespace_oid(name, true);
+                       if (oid == InvalidOid)
+                       {
+                               /* Unknown tablespace, try the next one. */
+                               index = (index + 1) % length;
+                               /*
+                                * But if we've tried them all, it's time to complain.  We'll
+                                * arbitrarily complain about the last one we tried in the
+                                * error message.
+                                */
+                               if (index == first_index)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_UNDEFINED_OBJECT),
+                                                        errmsg("tablespace \"%s\" does not exist", name),
+                                                        errhint("Create the tablespace or set undo_tablespaces to a valid or empty list.")));
+                               continue;
+                       }
+                       if (oid == GLOBALTABLESPACE_OID)
+                               ereport(ERROR,
+                                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                errmsg("undo logs cannot be placed in pg_global tablespace")));
+                       /* If we got here we succeeded in finding one. */
+                       break;
+               }
+
+               Assert(oid != InvalidOid);
+               *tablespace = oid;
+               need_to_unlock = true;
+       }
+
+       /*
+        * If we came here because the user changed undo_tablespaces, then detach
+        * from any undo logs we happen to be attached to.
+        */
+       if (force_detach)
+       {
+               for (i = 0; i < UndoLogCategories; ++i)
+               {
+                       UndoLogSlot *slot = CurrentSession->attached_undo_slots[i];
+
+                       if (slot != NULL)
+                       {
+                               LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+                               slot->pid = InvalidPid;
+                               slot->meta.unlogged.xid = InvalidTransactionId;
+                               LWLockRelease(&slot->mutex);
+
+                               LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+                               slot->next_free = UndoLogShared->free_lists[i];
+                               UndoLogShared->free_lists[i] = slot->logno;
+                               LWLockRelease(UndoLogLock);
+
+                               CurrentSession->attached_undo_slots[i] = NULL;
+                       }
+               }
+       }
+
+       return need_to_unlock;
+}
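+
+/*
+ * For example (hypothetical): with undo_tablespaces = 'ts1, ts2, ts3' the
+ * list length is 3, so a backend with pid 12345 starts probing at index
+ * 12345 % 3 = 0, trying ts1, then ts2, then ts3, until one resolves.
+ */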
+
+bool
+DropUndoLogsInTablespace(Oid tablespace)
+{
+       DIR *dir;
+       char undo_path[MAXPGPATH];
+       UndoLogSlot *slot = NULL;
+       int             i;
+
+       Assert(LWLockHeldByMe(TablespaceCreateLock));
+       Assert(tablespace != DEFAULTTABLESPACE_OID);
+
+       /* First, try to kick everyone off any undo logs in this tablespace. */
+       while ((slot = UndoLogNextSlot(slot)))
+       {
+               bool ok;
+               bool return_to_freelist = false;
+
+               /* Skip undo logs in other tablespaces. */
+               if (slot->meta.tablespace != tablespace)
+                       continue;
+
+               /* Check if this undo log can be forcibly detached. */
+               LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+               if (slot->meta.discard == slot->meta.unlogged.insert &&
+                       (slot->meta.unlogged.xid == InvalidTransactionId ||
+                        !TransactionIdIsInProgress(slot->meta.unlogged.xid)))
+               {
+                       slot->meta.unlogged.xid = InvalidTransactionId;
+                       if (slot->pid != InvalidPid)
+                       {
+                               slot->pid = InvalidPid;
+                               return_to_freelist = true;
+                       }
+                       ok = true;
+               }
+               else
+               {
+                       /*
+                        * There is data we need in this undo log.  We can't force it to
+                        * be detached.
+                        */
+                       ok = false;
+               }
+               LWLockRelease(&slot->mutex);
+
+               /* If we failed, then give up now and report failure. */
+               if (!ok)
+                       return false;
+
+               /*
+                * Put this undo log back on the appropriate free-list.  No one can
+                * attach to it while we hold TablespaceCreateLock, but if we return
+                * early in a later iteration of this loop (because some other undo
+                * log can't be detached), this undo log needs to remain usable.
+                * We'll remove all appropriate logs from the free-lists in a
+                * separate step below.
+                */
+               if (return_to_freelist)
+               {
+                       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+                       slot->next_free = UndoLogShared->free_lists[slot->meta.category];
+                       UndoLogShared->free_lists[slot->meta.category] = slot->logno;
+                       LWLockRelease(UndoLogLock);
+               }
+       }
+
+       /*
+        * We detached all backends from undo logs in this tablespace, and no one
+        * can attach to any non-default-tablespace undo logs while we hold
+        * TablespaceCreateLock.  We can now drop the undo logs.
+        */
+       slot = NULL;
+       while ((slot = UndoLogNextSlot(slot)))
+       {
+               /* Skip undo logs in other tablespaces. */
+               if (slot->meta.tablespace != tablespace)
+                       continue;
+
+               /*
+                * Make sure no buffers remain.  When that is done by
+                * UndoLogDiscard(), the final page is left in shared_buffers because
+                * it may contain data, or at least be needed again very soon.  Here
+                * we need to drop even that page from the buffer pool.
+                */
+               forget_undo_buffers(slot->logno, slot->meta.discard, slot->meta.discard, true);
+
+               /*
+                * TODO: For now we drop the undo log, meaning that it will never be
+                * used again.  That wastes the rest of its address space.  Instead,
+                * we should put it onto a special list of 'offline' undo logs, ready
+                * to be reactivated in some other tablespace.  Then we can keep the
+                * unused portion of its address space.
+                */
+               LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+               slot->meta.status = UNDO_LOG_STATUS_DISCARDED;
+               LWLockRelease(&slot->mutex);
+       }
+
+       /* Forget about all sync requests relating to this tablespace. */
+       undofile_forget_sync_tablespace(tablespace);
+
+       /* Unlink all undo segment files in this tablespace. */
+       UndoLogDirectory(tablespace, undo_path);
+
+       dir = AllocateDir(undo_path);
+       if (dir != NULL)
+       {
+               struct dirent *de;
+
+               while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL)
+               {
+                       char segment_path[MAXPGPATH];
+
+                       if (strcmp(de->d_name, ".") == 0 ||
+                               strcmp(de->d_name, "..") == 0)
+                               continue;
+                       snprintf(segment_path, sizeof(segment_path), "%s/%s",
+                                        undo_path, de->d_name);
+                       if (unlink(segment_path) < 0)
+                               elog(LOG, "could not unlink file \"%s\": %m", segment_path);
+               }
+               FreeDir(dir);
+       }
+
+       /* Remove all dropped undo logs from the free-lists. */
+       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+       for (i = 0; i < UndoLogCategories; ++i)
+       {
+               UndoLogSlot *slot;
+               UndoLogNumber *place;
+
+               place = &UndoLogShared->free_lists[i];
+               while (*place != InvalidUndoLogNumber)
+               {
+                       slot = find_undo_log_slot(*place, true);
+                       if (!slot)
+                               elog(ERROR,
+                                        "corrupted undo log freelist, unknown log %u", *place);
+                       if (slot->meta.status == UNDO_LOG_STATUS_DISCARDED)
+                               *place = slot->next_free;
+                       else
+                               place = &slot->next_free;
+               }
+       }
+       LWLockRelease(UndoLogLock);
+
+       return true;
+}
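+
+/*
+ * Illustrative caller's view (a sketch; the real call site in
+ * commands/tablespace.c may differ):
+ *
+ *     LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
+ *     if (!DropUndoLogsInTablespace(tablespaceoid))
+ *         ereport(ERROR, ... tablespace still contains undo data ...);
+ */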
+
+void
+ResetUndoLogs(UndoLogCategory category)
+{
+       UndoLogSlot *slot = NULL;
+
+       while ((slot = UndoLogNextSlot(slot)))
+       {
+               DIR        *dir;
+               struct dirent *de;
+               char    undo_path[MAXPGPATH];
+               char    segment_prefix[MAXPGPATH];
+               size_t  segment_prefix_size;
+
+               if (slot->meta.category != category)
+                       continue;
+
+               /* Scan the directory for files belonging to this undo log. */
+               snprintf(segment_prefix, sizeof(segment_prefix), "%06X.", slot->logno);
+               segment_prefix_size = strlen(segment_prefix);
+               UndoLogDirectory(slot->meta.tablespace, undo_path);
+               dir = AllocateDir(undo_path);
+               if (dir == NULL)
+                       continue;
+               while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL)
+               {
+                       char segment_path[MAXPGPATH];
+
+                       if (strncmp(de->d_name, segment_prefix, segment_prefix_size) != 0)
+                               continue;
+                       snprintf(segment_path, sizeof(segment_path), "%s/%s",
+                                        undo_path, de->d_name);
+                       if (unlink(segment_path) < 0)
+                               elog(LOG, "could not unlink file \"%s\": %m", segment_path);
+                       else
+                               elog(DEBUG1, "unlinked undo segment \"%s\"", segment_path);
+               }
+               FreeDir(dir);
+
+               /*
+                * We have no segment files.  Set the pointers to indicate that there
+                * is no data.  The discard and insert pointers point to the first
+                * usable byte in the segment we will create when we next try to
+                * allocate.  This is a bit strange, because it means that they are
+                * past the end pointer.  That's the same as when new undo logs are
+                * created.
+                *
+                * TODO: Should we rewind to zero instead, so we can reuse that (now)
+                * unreferenced address space?
+                */
+               slot->meta.unlogged.insert = slot->meta.discard = slot->meta.end +
+                       UndoLogBlockHeaderSize;
+       }
+}
+
+Datum
+pg_stat_get_undo_logs(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_UNDO_LOGS_COLS 9
+       ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+       TupleDesc       tupdesc;
+       Tuplestorestate *tupstore;
+       MemoryContext per_query_ctx;
+       MemoryContext oldcontext;
+       char *tablespace_name = NULL;
+       Oid last_tablespace = InvalidOid;
+       int                     i;
+
+       /* check to see if caller supports us returning a tuplestore */
+       if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("set-valued function called in context that cannot accept a set")));
+       if (!(rsinfo->allowedModes & SFRM_Materialize))
+               ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("materialize mode required, but it is not " \
+                                               "allowed in this context")));
+
+       /* Build a tuple descriptor for our result type */
+       if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
+
+       per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+       oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+       tupstore = tuplestore_begin_heap(true, false, work_mem);
+       rsinfo->returnMode = SFRM_Materialize;
+       rsinfo->setResult = tupstore;
+       rsinfo->setDesc = tupdesc;
+
+       MemoryContextSwitchTo(oldcontext);
+
+       /* Scan all undo logs to build the results. */
+       for (i = 0; i < UndoLogShared->nslots; ++i)
+       {
+               UndoLogSlot *slot = &UndoLogShared->slots[i];
+               char buffer[17];
+               Datum values[PG_STAT_GET_UNDO_LOGS_COLS];
+               bool nulls[PG_STAT_GET_UNDO_LOGS_COLS] = { false };
+               Oid tablespace;
+
+               /*
+                * This won't be a consistent result overall, but the values for each
+                * log will be consistent because we'll take the per-log lock while
+                * copying them.
+                */
+               LWLockAcquire(&slot->mutex, LW_SHARED);
+
+               /* Skip unused slots and entirely discarded undo logs. */
+               if (slot->logno == InvalidUndoLogNumber ||
+                       slot->meta.status == UNDO_LOG_STATUS_DISCARDED)
+               {
+                       LWLockRelease(&slot->mutex);
+                       continue;
+               }
+
+               values[0] = ObjectIdGetDatum((Oid) slot->logno);
+               values[1] = CStringGetTextDatum(
+                       slot->meta.category == UNDO_PERMANENT ? "permanent" :
+                       slot->meta.category == UNDO_UNLOGGED ? "unlogged" :
+                       slot->meta.category == UNDO_TEMP ? "temporary" :
+                       slot->meta.category == UNDO_SHARED ? "shared" : "<unknown>");
+               tablespace = slot->meta.tablespace;
+
+               snprintf(buffer, sizeof(buffer), UndoRecPtrFormat,
+                                MakeUndoRecPtr(slot->logno, slot->meta.discard));
+               values[3] = CStringGetTextDatum(buffer);
+               snprintf(buffer, sizeof(buffer), UndoRecPtrFormat,
+                                MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert));
+               values[4] = CStringGetTextDatum(buffer);
+               snprintf(buffer, sizeof(buffer), UndoRecPtrFormat,
+                                MakeUndoRecPtr(slot->logno, slot->meta.end));
+               values[5] = CStringGetTextDatum(buffer);
+               if (slot->meta.unlogged.xid == InvalidTransactionId)
+                       nulls[6] = true;
+               else
+                       values[6] = TransactionIdGetDatum(slot->meta.unlogged.xid);
+               if (slot->pid == InvalidPid)
+                       nulls[7] = true;
+               else
+                       values[7] = Int32GetDatum((int32) slot->pid);
+               switch (slot->meta.status)
+               {
+               case UNDO_LOG_STATUS_ACTIVE:
+                       values[8] = CStringGetTextDatum("ACTIVE"); break;
+               case UNDO_LOG_STATUS_FULL:
+                       values[8] = CStringGetTextDatum("FULL"); break;
+               default:
+                       nulls[8] = true;
+               }
+               LWLockRelease(&slot->mutex);
+
+               /*
+                * Deal with potentially slow tablespace name lookup without the lock.
+                * Avoid making multiple calls to that expensive function for the
+                * common case of repeating tablespace.
+                */
+               if (tablespace != last_tablespace)
+               {
+                       if (tablespace_name)
+                               pfree(tablespace_name);
+                       tablespace_name = get_tablespace_name(tablespace);
+                       last_tablespace = tablespace;
+               }
+               if (tablespace_name)
+               {
+                       values[2] = CStringGetTextDatum(tablespace_name);
+                       nulls[2] = false;
+               }
+               else
+                       nulls[2] = true;
+
+               tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+       }
+
+       if (tablespace_name)
+               pfree(tablespace_name);
+       tuplestore_donestoring(tupstore);
+
+       return (Datum) 0;
+}
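+
+/*
+ * Typical use is via SQL, through the view defined over this function in
+ * system_views.sql (view name assumed here):
+ *
+ *     SELECT * FROM pg_stat_undo_logs;
+ */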
+
+/*
+ * replay the creation of a new undo log
+ */
+static void
+undolog_xlog_create(XLogReaderState *record)
+{
+       xl_undolog_create *xlrec = (xl_undolog_create *) XLogRecGetData(record);
+       UndoLogSlot *slot;
+
+       /* Create meta-data space in shared memory. */
+       LWLockAcquire(UndoLogLock, LW_EXCLUSIVE);
+
+       /* TODO: assert that it doesn't exist already? */
+
+       slot = allocate_undo_log_slot();
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->logno = xlrec->logno;
+       slot->meta.logno = xlrec->logno;
+       slot->meta.status = UNDO_LOG_STATUS_ACTIVE;
+       slot->meta.category = xlrec->category;
+       slot->meta.tablespace = xlrec->tablespace;
+       slot->meta.unlogged.insert = UndoLogBlockHeaderSize;
+       slot->meta.discard = UndoLogBlockHeaderSize;
+       UndoLogShared->next_logno = Max(xlrec->logno + 1, UndoLogShared->next_logno);
+       LWLockRelease(&slot->mutex);
+       LWLockRelease(UndoLogLock);
+}
+
+/*
+ * replay the addition of a new segment to an undo log
+ */
+static void
+undolog_xlog_extend(XLogReaderState *record)
+{
+       xl_undolog_extend *xlrec = (xl_undolog_extend *) XLogRecGetData(record);
+
+       /* Extend exactly as we would during DO phase. */
+       extend_undo_log(xlrec->logno, xlrec->end);
+}
+
+/*
+ * Drop all buffers for the given undo log, from old_discard up to
+ * new_discard.  If drop_tail is true, also drop the buffer that holds
+ * new_discard; this is used when discarding undo logs completely, for example
+ * via DROP TABLESPACE.  If it is false, the final buffer is not dropped,
+ * because it may still contain data.
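+ *
+ * For example (illustrative numbers, assuming BLCKSZ = 8192): discarding
+ * from old_discard = 20000 to new_discard = 50000 forgets blocks 2-5, and
+ * also block 6 (which holds new_discard) if drop_tail is true.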
+ */
+static void
+forget_undo_buffers(int logno, UndoLogOffset old_discard,
+                                       UndoLogOffset new_discard, bool drop_tail)
+{
+       BlockNumber old_blockno;
+       BlockNumber new_blockno;
+       RelFileNode     rnode;
+
+       UndoRecPtrAssignRelFileNode(rnode, MakeUndoRecPtr(logno, old_discard));
+       old_blockno = old_discard / BLCKSZ;
+       new_blockno = new_discard / BLCKSZ;
+       if (drop_tail)
+               ++new_blockno;
+       while (old_blockno < new_blockno)
+       {
+               ForgetBuffer(rnode, UndoLogForkNum, old_blockno);
+               ForgetLocalBuffer(rnode, UndoLogForkNum, old_blockno++);
+       }
+}
+
+/*
+ * replay an undo segment discard record
+ */
+static void
+undolog_xlog_discard(XLogReaderState *record)
+{
+       xl_undolog_discard *xlrec = (xl_undolog_discard *) XLogRecGetData(record);
+       UndoLogSlot *slot;
+       UndoLogOffset discard;
+       UndoLogOffset end;
+       UndoLogOffset old_segment_begin;
+       UndoLogOffset new_segment_begin;
+       RelFileNode rnode = {0};
+
+       slot = find_undo_log_slot(xlrec->logno, false);
+       if (slot == NULL)
+               elog(ERROR, "unknown undo log %d", xlrec->logno);
+
+       /*
+        * We're about to discard undo logs.  In Hot Standby mode, ensure that
+        * there are no queries running that might need to fetch tuples from the
+        * undo data being discarded.
+        *
+        * XXX We pass an empty rnode to the conflict function so that it checks
+        * for conflicts in all backends, regardless of which database each
+        * backend is connected to.
+        */
+       if (InHotStandby && TransactionIdIsValid(xlrec->latestxid))
+               ResolveRecoveryConflictWithSnapshot(xlrec->latestxid, rnode);
+
+       /*
+        * See if we need to unlink or rename any files, but don't consider it an
+        * error if we find that files are missing.  Since UndoLogDiscard()
+        * performs filesystem operations before WAL logging or updating shmem
+        * which could be checkpointed, a crash could have left files already
+        * deleted, but we could replay WAL that expects the files to be there.
+        */
+
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       Assert(slot->logno == xlrec->logno);
+       discard = slot->meta.discard;
+       end = slot->meta.end;
+       LWLockRelease(&slot->mutex);
+
+       /* Drop buffers before we remove/recycle any files. */
+       forget_undo_buffers(xlrec->logno, discard, xlrec->discard,
+                                               xlrec->entirely_discarded);
+
+       /* Rewind to the start of the segment. */
+       old_segment_begin = discard - discard % UndoLogSegmentSize;
+       new_segment_begin = xlrec->discard - xlrec->discard % UndoLogSegmentSize;
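+
+       /*
+        * (Illustrative, with 1MB segments: a discard pointer at byte 2621440
+        * rewinds to the segment boundary at byte 2097152.)
+        */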
+
+       /* Unlink or rename segments that are no longer in range. */
+       while (old_segment_begin < new_segment_begin)
+       {
+               char    discard_path[MAXPGPATH];
+
+               /* Tell the checkpointer that the file is going away. */
+               undofile_forget_sync(slot->logno,
+                                                        old_segment_begin / UndoLogSegmentSize,
+                                                        slot->meta.tablespace);
+
+               UndoLogSegmentPath(xlrec->logno, old_segment_begin / UndoLogSegmentSize,
+                                                  slot->meta.tablespace, discard_path);
+
+               /* Can we recycle the oldest segment? */
+               if (end < xlrec->end)
+               {
+                       char    recycle_path[MAXPGPATH];
+
+                       UndoLogSegmentPath(xlrec->logno, end / UndoLogSegmentSize,
+                                                          slot->meta.tablespace, recycle_path);
+                       if (rename(discard_path, recycle_path) == 0)
+                       {
+                               elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"",
+                                        discard_path, recycle_path);
+                               end += UndoLogSegmentSize;
+                       }
+                       else
+                       {
+                               elog(LOG, "could not rename \"%s\" to \"%s\": %m",
+                                        discard_path, recycle_path);
+                       }
+               }
+               else
+               {
+                       if (unlink(discard_path) == 0)
+                               elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path);
+                       else
+                               elog(LOG, "could not unlink \"%s\": %m", discard_path);
+               }
+               old_segment_begin += UndoLogSegmentSize;
+       }
+
+       /* Create any further new segments that are needed, the slow way. */
+       while (end < xlrec->end)
+       {
+               allocate_empty_undo_segment(xlrec->logno, slot->meta.tablespace, end);
+               end += UndoLogSegmentSize;
+       }
+
+       /* Flush the directory entries before next checkpoint. */
+       undofile_request_sync_dir(slot->meta.tablespace);
+
+       /* Update shmem. */
+       LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
+       slot->meta.discard = xlrec->discard;
+       slot->meta.end = end;
+       LWLockRelease(&slot->mutex);
+
+       /* If we discarded everything, the slot can be given up. */
+       if (xlrec->entirely_discarded)
+               free_undo_log_slot(slot);
+}
+
+/*
+ * replay the switch of an undo log
+ */
+static void
+undolog_xlog_switch(XLogReaderState *record)
+{
+       xl_undolog_switch *xlrec = (xl_undolog_switch *) XLogRecGetData(record);
+       UndoLogSlot *slot;
+
+       slot = find_undo_log_slot(xlrec->logno, false);
+
+       /*
+        * Restore the log switch information in the undo log slot; it will be
+        * reset by a following UndoLogAllocateDuringRecovery.
+        */
+       slot->meta.unlogged.prevlog_xact_start = xlrec->prevlog_xact_start;
+       slot->meta.unlogged.prevlog_last_urp = xlrec->prevlog_last_urp;
+}
+
+void
+undolog_redo(XLogReaderState *record)
+{
+       uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+       switch (info)
+       {
+               case XLOG_UNDOLOG_CREATE:
+                       undolog_xlog_create(record);
+                       break;
+               case XLOG_UNDOLOG_EXTEND:
+                       undolog_xlog_extend(record);
+                       break;
+               case XLOG_UNDOLOG_DISCARD:
+                       undolog_xlog_discard(record);
+                       break;
+               case XLOG_UNDOLOG_SWITCH:
+                       undolog_xlog_switch(record);
+                       break;
+               default:
+                       elog(PANIC, "undolog_redo: unknown op code %u", info);
+}
+
+/*
+ * For assertions only.
+ */
+bool
+AmAttachedToUndoLogSlot(UndoLogSlot *slot)
+{
+       /*
+        * In general, we can't access a slot's members without locking.  But this
+        * function is intended only for asserting that you are attached, and
+        * while you're attached the slot can't be recycled, so don't bother
+        * locking.
+        */
+       return CurrentSession->attached_undo_slots[slot->meta.category] == slot;
+}
+
+/*
+ * For testing use only.  This function is only used by the test_undo module.
+ */
+void
+UndoLogDetachFull(void)
+{
+       int             i;
+
+       for (i = 0; i < UndoLogCategories; ++i)
+               if (CurrentSession->attached_undo_slots[i])
+                       detach_current_undo_log(i, true);
+}
index 9238fbe98d7d1dfece034be04f09fa3da93a7d57..4671838dd2c1a4793d99b19dfad4f760d101c0e9 100644 (file)
@@ -20,6 +20,7 @@
 #include "access/genam.h"
 #include "access/heapam.h"
 #include "access/htup_details.h"
+#include "access/session.h"
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
@@ -519,6 +520,8 @@ BootstrapModeMain(void)
 
        InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
 
+       InitializeSession();
+
        /* Initialize stuff for bootstrap-file processing */
        for (i = 0; i < MAXATTR; i++)
        {
index ea4c85e3959a403db045ee461cc2e4a91dad5260..cbe04e4dccd5dd3efaadf5402d132d09409c968b 100644 (file)
@@ -1062,6 +1062,10 @@ GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublicatio
     ON pg_subscription TO public;
 
 
+CREATE VIEW pg_stat_undo_logs AS
+    SELECT *
+    FROM pg_stat_get_undo_logs();
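+-- For example, pg_upgrade's pre-flight check looks for live undo data with
+-- a query along these lines:
+--   SELECT * FROM pg_stat_undo_logs WHERE discard != insert;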
+
 --
 -- We have a few function definitions in here, too.
 -- At some point there might be enough to justify breaking them out into
index 84efb414d8c5b8f7de379d49ccbba3ac1150a5ba..17c28b2c66c7e1757d81592cbf5cacafca60f437 100644 (file)
@@ -55,6 +55,7 @@
 #include "access/htup_details.h"
 #include "access/sysattr.h"
 #include "access/tableam.h"
+#include "access/undolog.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
@@ -496,6 +497,20 @@ DropTableSpace(DropTableSpaceStmt *stmt)
         */
        LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
 
+       /*
+        * Drop the undo logs in this tablespace.  This will fail (without
+        * dropping anything) if there are undo logs that we can't afford to drop
+        * because they contain non-discarded data or a transaction is in
+        * progress.  Since we hold TablespaceCreateLock, no other session will be
+        * able to attach to an undo log in this tablespace (or any tablespace
+        * except default) concurrently.
+        */
+       if (!DropUndoLogsInTablespace(tablespaceoid))
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("tablespace \"%s\" cannot be dropped because it contains non-empty undo logs",
+                                               tablespacename)));
+
        /*
         * Try to remove the physical infrastructure.
         */
@@ -1517,6 +1532,14 @@ tblspc_redo(XLogReaderState *record)
        {
                xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
 
+               /* This shouldn't be able to fail in recovery. */
+               LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
+               if (!DropUndoLogsInTablespace(xlrec->ts_id))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                        errmsg("tablespace cannot be dropped because it contains non-empty undo logs")));
+               LWLockRelease(TablespaceCreateLock);
+
                /*
                 * If we issued a WAL record for a drop tablespace it implies that
                 * there were no files in it at all when the DROP was done. That means
index b4f2b28b517fbe58410d215398b938ae7fc7bb55..c742861dae8949b62cd7c96db7ba4e43750fb84c 100644 (file)
@@ -4070,6 +4070,27 @@ pgstat_get_wait_io(WaitEventIO w)
                case WAIT_EVENT_TWOPHASE_FILE_WRITE:
                        event_name = "TwophaseFileWrite";
                        break;
+               case WAIT_EVENT_UNDO_CHECKPOINT_READ:
+                       event_name = "UndoCheckpointRead";
+                       break;
+               case WAIT_EVENT_UNDO_CHECKPOINT_WRITE:
+                       event_name = "UndoCheckpointWrite";
+                       break;
+               case WAIT_EVENT_UNDO_CHECKPOINT_SYNC:
+                       event_name = "UndoCheckpointSync";
+                       break;
+               case WAIT_EVENT_UNDO_FILE_READ:
+                       event_name = "UndoFileRead";
+                       break;
+               case WAIT_EVENT_UNDO_FILE_WRITE:
+                       event_name = "UndoFileWrite";
+                       break;
+               case WAIT_EVENT_UNDO_FILE_FLUSH:
+                       event_name = "UndoFileFlush";
+                       break;
+               case WAIT_EVENT_UNDO_FILE_SYNC:
+                       event_name = "UndoFileSync";
+                       break;
                case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
                        event_name = "WALSenderTimelineHistoryRead";
                        break;
index d5f9b617c84c5f49b8ee2eb3f276cf1fbbf7d78b..abcb5e5ad21f8f44e3c34ac82b4aa40e95180286 100644 (file)
@@ -1368,7 +1368,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
        char       *page;
        size_t          pad;
        PageHeader      phdr;
-       int                     segmentno = 0;
+       BlockNumber     first_blkno = 0;
        char       *segmentpath;
        bool            verify_checksum = false;
 
@@ -1406,12 +1406,18 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
                        segmentpath = strstr(filename, ".");
                        if (segmentpath != NULL)
                        {
-                               segmentno = atoi(segmentpath + 1);
-                               if (segmentno == 0)
+                               char       *end;
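+
+                               /*
+                                * Undo segment suffixes encode the file's starting byte
+                                * offset in hex; relation segment suffixes are decimal
+                                * segment numbers.
+                                */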
+                               if (strstr(readfilename, "undo"))
+                                       first_blkno = strtol(segmentpath + 1, &end, 16) / BLCKSZ;
+                               else
+                                       first_blkno = strtol(segmentpath + 1, &end, 10) * RELSEG_SIZE;
+                               if (*end != '\0')
                                        ereport(ERROR,
-                                                       (errmsg("invalid segment number %d in file \"%s\"",
-                                                                       segmentno, filename)));
+                                                       (errmsg("invalid segment number in file \"%s\"",
+                                                                       filename)));
                        }
+                       else
+                               first_blkno = 0;
                }
        }
 
@@ -1451,7 +1457,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
                                 */
                                if (!PageIsNew(page) && PageGetLSN(page) < startptr)
                                {
-                                       checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
+                                       checksum = pg_checksum_page((char *) page, blkno + first_blkno);
                                        phdr = (PageHeader) page;
                                        if (phdr->pd_checksum != checksum)
                                        {
index 151c3ef882582515bf29762425ef299d35884a37..d3a9c4d64c6650b5e3c0ffc51efbf37559a239f3 100644 (file)
@@ -154,6 +154,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor
                case RM_COMMIT_TS_ID:
                case RM_REPLORIGIN_ID:
                case RM_GENERIC_ID:
+               case RM_UNDOLOG_ID:
                        /* just deal with xid, and done */
                        ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record),
                                                                        buf.origptr);
index 6f3a4028547c658701ab620b9093e339aa8ac8c1..2b1d60680e8622c8c93a9e8787df18d2d3784799 100644 (file)
@@ -177,6 +177,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
 static inline int32 GetPrivateRefCount(Buffer buffer);
 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+static void InvalidateBuffer(BufferDesc *buf);
 
 /*
  * Ensure that the PrivateRefCountArray has sufficient space to store one more
@@ -620,10 +621,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * valid, the page is zeroed instead of throwing an error. This is intended
  * for non-critical data, where the caller is prepared to repair errors.
  *
- * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * In RBM_ZERO mode, if the page isn't in buffer cache already, it's
  * filled with zeros instead of reading it from disk.  Useful when the caller
  * is going to fill the page from scratch, since this saves I/O and avoids
  * unnecessary failure if the page-on-disk has corrupt page headers.
+ *
+ * In RBM_ZERO_AND_LOCK mode, the page is zeroed and also locked.
  * The page is returned locked to ensure that the caller has a chance to
  * initialize the page before it's made visible to others.
  * Caution: do not use this mode to read a page that is beyond the relation's
@@ -674,24 +677,20 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
 /*
  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
  *             a relcache entry for the relation.
- *
- * NB: At present, this function may only be used on permanent relations, which
- * is OK, because we only use it during XLOG replay.  If in the future we
- * want to use it on temporary or unlogged relations, we could pass additional
- * parameters.
  */
 Buffer
 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
                                                  BlockNumber blockNum, ReadBufferMode mode,
-                                                 BufferAccessStrategy strategy)
+                                                 BufferAccessStrategy strategy,
+                                                 char relpersistence)
 {
        bool            hit;
 
-       SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
-
-       Assert(InRecovery);
+       SMgrRelation smgr = smgropen(rnode,
+                                                                relpersistence == RELPERSISTENCE_TEMP
+                                                                ? MyBackendId : InvalidBackendId);
 
-       return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
+       return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum,
                                                         mode, strategy, &hit);
 }
 
@@ -885,7 +884,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                 * Read in the page, unless the caller intends to overwrite it and
                 * just wants us to allocate a buffer.
                 */
-               if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+               if (mode == RBM_ZERO ||
+                       mode == RBM_ZERO_AND_LOCK ||
+                       mode == RBM_ZERO_AND_CLEANUP_LOCK)
                        MemSet((char *) bufBlock, 0, BLCKSZ);
                else
                {
@@ -1339,6 +1340,61 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
        return buf;
 }
 
+/*
+ * ForgetBuffer -- drop a buffer from shared buffers
+ *
+ * If the buffer isn't present in shared buffers, nothing happens.  If it is
+ * present, it is discarded without making any attempt to write it back out to
+ * the operating system.  The caller must therefore somehow be sure that the
+ * data won't be needed for anything now or in the future.  This function
+ * assumes that there is no concurrent access to the block, except that a
+ * write of it may still be in progress.
+ */
+void
+ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum)
+{
+       SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+       BufferTag       tag;                    /* identity of target block */
+       uint32          hash;                   /* hash value for tag */
+       LWLock     *partitionLock;      /* buffer partition lock for it */
+       int                     buf_id;
+       BufferDesc *bufHdr;
+       uint32          buf_state;
+
+       /* create a tag so we can lookup the buffer */
+       INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+       /* determine its hash code and partition lock ID */
+       hash = BufTableHashCode(&tag);
+       partitionLock = BufMappingPartitionLock(hash);
+
+       /* see if the block is in the buffer pool */
+       LWLockAcquire(partitionLock, LW_SHARED);
+       buf_id = BufTableLookup(&tag, hash);
+       LWLockRelease(partitionLock);
+
+       /* didn't find it, so nothing to do */
+       if (buf_id < 0)
+               return;
+
+       /* take the buffer header lock */
+       bufHdr = GetBufferDescriptor(buf_id);
+       buf_state = LockBufHdr(bufHdr);
+
+       /*
+        * The buffer might have been evicted after we released the partition
+        * lock and before we acquired the buffer header lock.  If so, the buffer
+        * we've locked might contain some other data which we shouldn't touch.
+        * If the buffer hasn't been recycled, we proceed to invalidate it.
+        */
+       if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+               bufHdr->tag.blockNum == blockNum &&
+               bufHdr->tag.forkNum == forkNum)
+               InvalidateBuffer(bufHdr);               /* releases spinlock */
+       else
+               UnlockBufHdr(bufHdr, buf_state);
+}
+
 /*
  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
  * freelist.
index f5f6a29222b7bf5582ee815e9ca916be33bf30c2..a6fe3dbf8290d9356b917d565723b43fb948e1ac 100644 (file)
@@ -272,6 +272,49 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
        return bufHdr;
 }
 
+/*
+ * ForgetLocalBuffer - drop a buffer from local buffers
+ *
+ * This is similar to bufmgr.c's ForgetBuffer, except that we do not need
+ * to do any locking since this is all local.  As with that function, this
+ * must be used very carefully, since we'll cheerfully throw away dirty
+ * buffers without any attempt to write them.
+ */
+void
+ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum)
+{
+       SMgrRelation smgr = smgropen(rnode, BackendIdForTempRelations());
+       BufferTag       tag;                                    /* identity of target block */
+       LocalBufferLookupEnt *hresult;
+       BufferDesc *bufHdr;
+       uint32          buf_state;
+
+       /*
+        * If somehow this is the first request in the session, there's nothing to
+        * do.  (This probably shouldn't happen, though.)
+        */
+       if (LocalBufHash == NULL)
+               return;
+
+       /* create a tag so we can lookup the buffer */
+       INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum);
+
+       /* see if the block is in the local buffer pool */
+       hresult = (LocalBufferLookupEnt *)
+               hash_search(LocalBufHash, (void *) &tag, HASH_REMOVE, NULL);
+
+       /* didn't find it, so nothing to do */
+       if (!hresult)
+               return;
+
+       /* mark buffer invalid */
+       bufHdr = GetLocalBufferDescriptor(hresult->id);
+       CLEAR_BUFFERTAG(bufHdr->tag);
+       buf_state = pg_atomic_read_u32(&bufHdr->state);
+       buf_state &= ~(BM_VALID | BM_TAG_VALID | BM_DIRTY);
+       pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+}
+
 /*
  * MarkLocalBufferDirty -
  *       mark a local buffer dirty
index 315c74c745688868ae3623b2d728954b63d419b4..3c4f0537d412122682a53314fcc9e67be0892628 100644 (file)
@@ -322,7 +322,6 @@ static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
 
-static int     fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
 static int     fsync_parent_path(const char *fname, int elevel);
 
 
@@ -3345,7 +3344,7 @@ unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
  *
  * Returns 0 if the operation succeeded, -1 otherwise.
  */
-static int
+int
 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
 {
        int                     fd;
index d7d733530ff740216ab9c27f50c5a218b61d4536..12c324925ca90a1524a8f14a3109c3eeb04ee186 100644 (file)
@@ -21,6 +21,7 @@
 #include "access/nbtree.h"
 #include "access/subtrans.h"
 #include "access/twophase.h"
+#include "access/undolog.h"
 #include "commands/async.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -125,6 +126,7 @@ CreateSharedMemoryAndSemaphores(int port)
                size = add_size(size, ProcGlobalShmemSize());
                size = add_size(size, XLOGShmemSize());
                size = add_size(size, CLOGShmemSize());
+               size = add_size(size, UndoLogShmemSize());
                size = add_size(size, CommitTsShmemSize());
                size = add_size(size, SUBTRANSShmemSize());
                size = add_size(size, TwoPhaseShmemSize());
@@ -213,6 +215,7 @@ CreateSharedMemoryAndSemaphores(int port)
         */
        XLOGShmemInit();
        CLOGShmemInit();
+       UndoLogShmemInit();
        CommitTsShmemInit();
        SUBTRANSShmemInit();
        MultiXactShmemInit();
index bc1aa88322b0c8fbf40676a8e4ce27d304ac0f02..5df658d3494e7ca748d5f0dc38a64a348ca08387 100644 (file)
@@ -522,6 +522,8 @@ RegisterLWLockTranches(void)
        LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append");
        LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join");
        LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact");
+       LWLockRegisterTranche(LWTRANCHE_UNDOLOG, "undo_log");
+       LWLockRegisterTranche(LWTRANCHE_UNDODISCARD, "undo_discard");
 
        /* Register named tranches. */
        for (i = 0; i < NamedLWLockTrancheRequests; i++)
index db478432291b6fd4f65092c29a202bb5f1199f11..4b42a1cf0ba16e75d1ad189433ee9679d809ca51 100644 (file)
@@ -49,3 +49,4 @@ MultiXactTruncationLock                               41
 OldSnapshotTimeMapLock                         42
 LogicalRepWorkerLock                           43
 CLogTruncationLock                                     44
+UndoLogLock                                      45
index e486b7c0d1c2c97ecda21c02f96533397d4f2c97..ff2e5e2db49cd47e477e43a44bd473d9abd57185 100644 (file)
@@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = md.o smgr.o
+OBJS = md.o smgr.o undofile.o
 
 include $(top_srcdir)/src/backend/common.mk
index d00b275e468a6d91a5ec98d2d97ea3307e1d3f17..93df85cba99e6b45f90a76a45c6ad0ea957ee36d 100644 (file)
  */
 #include "postgres.h"
 
+#include "catalog/database_internal.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
+#include "storage/undofile.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
@@ -81,6 +83,24 @@ static const f_smgr smgrsw[] = {
                .smgr_nblocks = mdnblocks,
                .smgr_truncate = mdtruncate,
                .smgr_immedsync = mdimmedsync,
+       },
+       /* undo logs */
+       {
+               .smgr_init = undofile_init,
+               .smgr_shutdown = undofile_shutdown,
+               .smgr_open = undofile_open,
+               .smgr_close = undofile_close,
+               .smgr_create = undofile_create,
+               .smgr_exists = undofile_exists,
+               .smgr_unlink = undofile_unlink,
+               .smgr_extend = undofile_extend,
+               .smgr_prefetch = undofile_prefetch,
+               .smgr_read = undofile_read,
+               .smgr_write = undofile_write,
+               .smgr_writeback = undofile_writeback,
+               .smgr_nblocks = undofile_nblocks,
+               .smgr_truncate = undofile_truncate,
+               .smgr_immedsync = undofile_immedsync,
        }
 };
 
@@ -105,6 +125,8 @@ smgrwhich(RelFileNode rnode)
 {
        switch (rnode.dbNode)
        {
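+               /*
+                * Undo log files live under the reserved pseudo-database OID
+                * UndoDbOid (see catalog/database_internal.h).
+                */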
+               case UndoDbOid:
+                       return 1;                       /* undofile.c */
                default:
                        return 0;                       /* md.c */
        }
@@ -189,6 +211,7 @@ smgropen(RelFileNode rnode, BackendId backend)
                reln->smgr_fsm_nblocks = InvalidBlockNumber;
                reln->smgr_vm_nblocks = InvalidBlockNumber;
                reln->smgr_which = smgrwhich(rnode);
+               reln->private_data = NULL;
 
                /* implementation-specific initialization */
                smgrsw[reln->smgr_which].smgr_open(reln);
diff --git a/src/backend/storage/smgr/undofile.c b/src/backend/storage/smgr/undofile.c
new file mode 100644 (file)
index 0000000..04d4514
--- /dev/null
@@ -0,0 +1,422 @@
+/*
+ * undofile.c
+ *
+ * PostgreSQL undo file manager.  This module provides an SMGR-compatible
+ * interface to the files that back undo logs on the filesystem, so that undo
+ * log data can use the shared buffer pool.  Other aspects of undo log
+ * management are provided by undolog.c, so the SMGR interfaces not directly
+ * concerned with reading, writing and flushing data are unimplemented.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/smgr/undofile.c
+ */
+
+#include "postgres.h"
+
+#include "access/undolog.h"
+#include "access/xlog.h"
+#include "catalog/database_internal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "storage/fd.h"
+#include "storage/smgr.h"
+#include "storage/undofile.h"
+#include "utils/memutils.h"
+
+/* Populate a file tag describing an undo segment file. */
+#define INIT_UNDOFILETAG(a,xx_logno,xx_tbspc,xx_segno) \
+( \
+       memset(&(a), 0, sizeof(FileTag)), \
+       (a).handler = SYNC_HANDLER_UNDO, \
+       (a).rnode.dbNode = UndoDbOid, \
+       (a).rnode.spcNode = (xx_tbspc), \
+       (a).rnode.relNode = (xx_logno), \
+       (a).segno = (xx_segno) \
+)
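+
+/*
+ * For example (illustrative values), INIT_UNDOFILETAG(tag, 7, 1663, 2)
+ * describes segment 2 of undo log 7 in the default tablespace: rnode is
+ * {1663, UndoDbOid, 7} and segno is 2.
+ */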
+
+/*
+ * While md.c expects random access and has a small number of huge
+ * segments, undofile.c manages a potentially very large number of smaller
+ * segments and has a less random access pattern.  Therefore, instead of
+ * keeping a potentially huge array of vfds we'll just keep the most
+ * recently accessed N.
+ *
+ * For now, N == 1, so we just need to hold onto one 'File' handle.
+ */
+typedef struct UndoFileState
+{
+       int             mru_segno;
+       File    mru_file;
+} UndoFileState;
+
+static MemoryContext UndoFileCxt;
+
+static File undofile_open_segment_file(Oid relNode, Oid spcNode,
+                                                                          BlockNumber segno, bool missing_ok);
+static File undofile_get_segment_file(SMgrRelation reln, BlockNumber segno);
+
+void
+undofile_init(void)
+{
+       UndoFileCxt = AllocSetContextCreate(TopMemoryContext,
+                                                                               "UndoFileSmgr",
+                                                                               ALLOCSET_DEFAULT_SIZES);
+}
+
+void
+undofile_shutdown(void)
+{
+}
+
+void
+undofile_open(SMgrRelation reln)
+{
+       UndoFileState *state;
+
+       state = MemoryContextAllocZero(UndoFileCxt, sizeof(UndoFileState));
+       reln->private_data = state;
+}
+
+void
+undofile_close(SMgrRelation reln, ForkNumber forknum)
+{
+}
+
+void
+undofile_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+       /*
+        * File creation is managed by undolog.c, but xlogutils.c likes to call
+        * this just in case.  Ignore.
+        */
+}
+
+bool
+undofile_exists(SMgrRelation reln, ForkNumber forknum)
+{
+       elog(ERROR, "undofile_exists is not supported");
+
+       return false;           /* not reached */
+}
+
+void
+undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
+{
+       elog(ERROR, "undofile_unlink is not supported");
+}
+
+void
+undofile_extend(SMgrRelation reln, ForkNumber forknum,
+                               BlockNumber blocknum, char *buffer,
+                               bool skipFsync)
+{
+       elog(ERROR, "undofile_extend is not supported");
+}
+
+void
+undofile_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+       elog(ERROR, "undofile_prefetch is not supported");
+}
+
+void
+undofile_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+                         char *buffer)
+{
+       File            file;
+       off_t           seekpos;
+       int                     nbytes;
+
+       Assert(forknum == MAIN_FORKNUM);
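+
+       /*
+        * Illustrative mapping, assuming BLCKSZ = 8192 and 1MB segments
+        * (UNDOSEG_SIZE = 128): block 300 falls in segment file 2, at byte
+        * offset 44 * 8192 within that file.
+        */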
+       file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE);
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE));
+       Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE);
+       nbytes = FileRead(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_READ);
+       if (nbytes != BLCKSZ)
+       {
+               if (nbytes < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not read block %u in file \"%s\": %m",
+                                                       blocknum, FilePathName(file))));
+               ereport(ERROR,
+                               (errcode(ERRCODE_DATA_CORRUPTED),
+                                errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
+                                               blocknum, FilePathName(file),
+                                               nbytes, BLCKSZ)));
+       }
+}
+
+void
+undofile_write(SMgrRelation reln, ForkNumber forknum,
+                          BlockNumber blocknum, char *buffer,
+                          bool skipFsync)
+{
+       File            file;
+       off_t           seekpos;
+       int                     nbytes;
+
+       Assert(forknum == MAIN_FORKNUM);
+       file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE);
+       seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE));
+       Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE);
+       nbytes = FileWrite(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_WRITE);
+       if (nbytes != BLCKSZ)
+       {
+               if (nbytes < 0)
+                       ereport(ERROR,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not write block %u in file \"%s\": %m",
+                                                       blocknum, FilePathName(file))));
+               /*
+                * short write: unexpected, because this should be overwriting an
+                * entirely pre-allocated segment file
+                */
+               ereport(ERROR,
+                               (errcode(ERRCODE_DISK_FULL),
+                                errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
+                                               blocknum, FilePathName(file),
+                                               nbytes, BLCKSZ)));
+       }
+
+       /* Tell checkpointer this file is dirty. */
+       if (!skipFsync && !SmgrIsTemp(reln))
+       {
+               undofile_request_sync(reln->smgr_rnode.node.relNode,
+                                                         blocknum / UNDOSEG_SIZE,
+                                                         reln->smgr_rnode.node.spcNode);
+       }
+}
+
+void
+undofile_writeback(SMgrRelation reln, ForkNumber forknum,
+                                  BlockNumber blocknum, BlockNumber nblocks)
+{
+       while (nblocks > 0)
+       {
+               File    file;
+               int             nflush;
+
+               file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE);
+
+               /* compute number of desired writes within the current segment */
+               nflush = Min(nblocks,
+                                        UNDOSEG_SIZE - (blocknum % UNDOSEG_SIZE));
+
+               FileWriteback(file,
+                                         (blocknum % UNDOSEG_SIZE) * BLCKSZ,
+                                         nflush * BLCKSZ, WAIT_EVENT_UNDO_FILE_FLUSH);
+
+               nblocks -= nflush;
+               blocknum += nflush;
+       }
+}
+
+BlockNumber
+undofile_nblocks(SMgrRelation reln, ForkNumber forknum)
+{
+       /*
+        * xlogutils.c likes to call this to decide whether to read or extend;
+        * for now we lie and say the relation is as big as possible.
+        */
+       return UndoLogMaxSize / BLCKSZ;
+}
+
+void
+undofile_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+       elog(ERROR, "undofile_truncate is not supported");
+}
+
+void
+undofile_immedsync(SMgrRelation reln, ForkNumber forknum)
+{
+       elog(ERROR, "undofile_immedsync is not supported");
+}
+
+static File
+undofile_open_segment_file(Oid relNode, Oid spcNode, BlockNumber segno,
+                                                  bool missing_ok)
+{
+       File            file;
+       char            path[MAXPGPATH];
+
+       UndoLogSegmentPath(relNode, segno, spcNode, path);
+       file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+
+       if (file <= 0 && (!missing_ok || errno != ENOENT))
+               elog(ERROR, "could not open undo segment file \"%s\": %m", path);
+
+       return file;
+}
+
+/*
+ * Get a File for a particular segment of a SMgrRelation representing an undo
+ * log.
+ */
+static File
+undofile_get_segment_file(SMgrRelation reln, BlockNumber segno)
+{
+       UndoFileState *state = (UndoFileState *) reln->private_data;
+
+       /* If we have a file open already, check if we need to close it. */
+       if (state->mru_file > 0 && state->mru_segno != segno)
+       {
+               /* These are not the blocks we're looking for. */
+               FileClose(state->mru_file);
+               state->mru_file = 0;
+       }
+
+       /* Check if we need to open a new file. */
+       if (state->mru_file <= 0)
+       {
+               state->mru_file =
+                       undofile_open_segment_file(reln->smgr_rnode.node.relNode,
+                                                                          reln->smgr_rnode.node.spcNode,
+                                                                          segno, InRecovery);
+               if (InRecovery && state->mru_file <= 0)
+               {
+                       /*
+                        * If in recovery, we may be trying to access a file that will
+                        * later be unlinked.  Tolerate missing files, creating a new
+                        * zero-filled file as required.
+                        */
+                       UndoLogNewSegment(reln->smgr_rnode.node.relNode,
+                                                         reln->smgr_rnode.node.spcNode,
+                                                         segno);
+                       state->mru_file =
+                               undofile_open_segment_file(reln->smgr_rnode.node.relNode,
+                                                                                  reln->smgr_rnode.node.spcNode,
+                                                                                  segno, false);
+                       Assert(state->mru_file > 0);
+               }
+               state->mru_segno = segno;
+       }
+
+       return state->mru_file;
+}
+
+/*
+ * Callback to handle a queued sync request.
+ */
+int
+undofile_syncfiletag(const FileTag *tag, char *path)
+{
+       SMgrRelation reln = smgropen(tag->rnode, InvalidBackendId);
+       File            file;
+
+       if (tag->rnode.relNode == (Oid) InvalidUndoLogNumber)
+       {
+               /* Sync parent directory for this tablespace. */
+               UndoLogDirectory(tag->rnode.spcNode, path);
+
+               /* The caller (sync.c) will do appropriate error reporting. */
+               return fsync_fname_ext(path, true, false, WARNING);
+       }
+       else
+       {
+               /* Sync a segment file. */
+               UndoLogSegmentPath(tag->rnode.relNode, tag->segno, tag->rnode.spcNode,
+                                                  path);
+
+               file = undofile_get_segment_file(reln, tag->segno);
+               if (file <= 0)
+               {
+                       /* errno set by undofile_get_segment_file() */
+                       return -1;
+               }
+
+               return FileSync(file, WAIT_EVENT_UNDO_FILE_SYNC);
+       }
+}
+
+/*
+ * Filtering callback used by SYNC_FILTER_REQUEST to forget some requests.
+ */
+bool
+undofile_filetagmatches(const FileTag *tag, const FileTag *candidate)
+{
+       /*
+        * We use SYNC_FILTER_REQUEST to forget requests for a given tablespace,
+        * before removing all undo files in the tablespace.
+        */
+       return tag->rnode.spcNode == candidate->rnode.spcNode;
+}
+
+/*
+ * Tell the checkpointer to sync a segment file.
+ */
+void
+undofile_request_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace)
+{
+       char            path[MAXPGPATH];
+       FileTag         tag;
+
+       INIT_UNDOFILETAG(tag, logno, tablespace, segno);
+
+       /* Try to send to the checkpointer, but if out of space, do it here. */
+       if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+       {
+               if (undofile_syncfiletag(&tag, path) < 0)
+                       ereport(data_sync_elevel(ERROR),
+                                       (errmsg("could not fsync file \"%s\": %m", path)));
+       }
+}
+
+/*
+ * Tell the checkpointer to forget about any sync requests for a given segment
+ * file, because it's about to go away.
+ */
+void
+undofile_forget_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace)
+{
+       FileTag         tag;
+
+       INIT_UNDOFILETAG(tag, logno, tablespace, segno);
+
+       /* Send, and keep retrying if out of space. */
+       (void) RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+}
+
+/*
+ * Tell the checkpointer to fsync the undo directory in a given tablespace,
+ * because we have created or renamed files inside it.
+ */
+void
+undofile_request_sync_dir(Oid tablespace)
+{
+       char            path[MAXPGPATH];
+       FileTag         tag;
+
+       /* We use a special logno and segno to mean "the directory". */
+       INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace,
+                                        InvalidBlockNumber);
+
+       /* Try to send to the checkpointer, but if out of space, do it here. */
+       if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+       {
+               if (undofile_syncfiletag(&tag, path) < 0)
+                       ereport(data_sync_elevel(ERROR),
+                                       (errmsg("could not fsync directory \"%s\": %m", path)));
+       }
+}
+
+/*
+ * Tell the checkpointer to forget about all sync requests for a given
+ * tablespace, because it's about to go away.
+ */
+void
+undofile_forget_sync_tablespace(Oid tablespace)
+{
+       FileTag         tag;
+
+       INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace,
+                                        InvalidBlockNumber);
+
+       /*
+        * Tell the checkpointer to forget about any requests for this
+        * tablespace, and keep retrying if there is not enough space.
+        */
+       (void) RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true);
+}
index f329c3fd667cace1ad8f0b597f58024ca3090078..db7b417d837ba6fb1b57e87e6b62b2f025eac445 100644 (file)
@@ -28,6 +28,7 @@
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
+#include "storage/undofile.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/inval.h"
@@ -96,6 +97,11 @@ static const SyncOps syncsw[] = {
                .sync_syncfiletag = mdsyncfiletag,
                .sync_unlinkfiletag = mdunlinkfiletag,
                .sync_filetagmatches = mdfiletagmatches
+       },
+       /* undo log segment files */
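+       /*
+        * (This entry's position is assumed to correspond to SYNC_HANDLER_UNDO,
+        * the handler value that undofile.c stamps on undo file tags.)
+        */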
+       {
+               .sync_syncfiletag = undofile_syncfiletag,
+               .sync_filetagmatches = undofile_filetagmatches
        }
 };
 
index 43b9f17f722698937e7569d5dde0d37fe30c1991..5d9af8903675d3eebd285c240f82d2e569f08348 100644 (file)
@@ -25,6 +25,7 @@
 #include "access/session.h"
 #include "access/sysattr.h"
 #include "access/tableam.h"
+#include "access/undolog.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "catalog/catalog.h"
@@ -561,6 +562,7 @@ BaseInit(void)
        InitSync();
        smgrinit();
        InitBufferPoolAccess();
+       UndoLogInit();
 }
 
 
index fc463601ff3c47c2549181b71961b07d78427a33..296fb771668e5ece82827c6d869e8857d6eab115 100644 (file)
@@ -121,6 +121,7 @@ extern int  CommitDelay;
 extern int     CommitSiblings;
 extern char *default_tablespace;
 extern char *temp_tablespaces;
+extern char *undo_tablespaces;
 extern bool ignore_checksum_failure;
 extern bool synchronize_seqscans;
 
@@ -3667,6 +3668,17 @@ static struct config_string ConfigureNamesString[] =
                check_temp_tablespaces, assign_temp_tablespaces, NULL
        },
 
+       {
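+               /*
+                * Example (hypothetical tablespace name): "SET undo_tablespaces =
+                * 'undo_ts1'" causes undo logs created by this session to be placed
+                * in tablespace undo_ts1.
+                */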
+               {"undo_tablespaces", PGC_USERSET, CLIENT_CONN_STATEMENT,
+                       gettext_noop("Sets the tablespace(s) to use for undo logs."),
+                       NULL,
+                       GUC_LIST_INPUT | GUC_LIST_QUOTE
+               },
+               &undo_tablespaces,
+               "",
+               check_undo_tablespaces, assign_undo_tablespaces, NULL
+       },
+
        {
                {"dynamic_library_path", PGC_SUSET, CLIENT_CONN_OTHER,
                        gettext_noop("Sets the path for dynamically loadable modules."),
index 04d77ad70069a76b5b7a697a3d9c0594f1740902..bbcbdfdb87af85d0764aa51e784b65ce0f2788a1 100644 (file)
@@ -210,11 +210,13 @@ static const char *const subdirs[] = {
        "pg_snapshots",
        "pg_subtrans",
        "pg_twophase",
+       "pg_undo",
        "pg_multixact",
        "pg_multixact/members",
        "pg_multixact/offsets",
        "base",
        "base/1",
+       "base/undo",
        "pg_replslot",
        "pg_tblspc",
        "pg_stat",
index 8c00ec9a3b1a27177cf1db8de96e4354d0452585..8e83be9d523965c4ebf7f222c540ed7e36aba577 100644 (file)
@@ -167,7 +167,7 @@ skipfile(const char *fn)
 }
 
 static void
-scan_file(const char *fn, BlockNumber segmentno)
+scan_file(const char *fn, BlockNumber first_blkno)
 {
        PGAlignedBlock buf;
        PageHeader      header = (PageHeader) buf.data;
@@ -208,7 +208,7 @@ scan_file(const char *fn, BlockNumber segmentno)
                if (PageIsNew(header))
                        continue;
 
-               csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
+               csum = pg_checksum_page(buf.data, blockno + first_blkno);
                current_size += r;
                if (mode == PG_MODE_CHECK)
                {
@@ -310,7 +310,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly)
                        char            fnonly[MAXPGPATH];
                        char       *forkpath,
                                           *segmentpath;
-                       BlockNumber segmentno = 0;
+                       BlockNumber first_blkno;
 
                        if (skipfile(de->d_name))
                                continue;
@@ -325,15 +325,22 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly)
                        segmentpath = strchr(fnonly, '.');
                        if (segmentpath != NULL)
                        {
+                               char       *end;
+
                                *segmentpath++ = '\0';
-                               segmentno = atoi(segmentpath);
-                               if (segmentno == 0)
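+                               /*
+                                * Undo segment suffixes encode the starting byte offset in
+                                * hex; relation segment suffixes are decimal segment
+                                * numbers.
+                                */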
+                               if (strstr(fn, "undo"))
+                                       first_blkno = strtol(segmentpath, &end, 16) / BLCKSZ;
+                               else
+                                       first_blkno = strtol(segmentpath, &end, 10) * RELSEG_SIZE;
+                               if (*end != '\0')
                                {
-                                       pg_log_error("invalid segment number %d in file name \"%s\"",
-                                                                segmentno, fn);
+                                       pg_log_error("invalid segment number in file name \"%s\"",
+                                                                fn);
                                        exit(1);
                                }
                        }
+                       else
+                               first_blkno = 0;
 
                        forkpath = strchr(fnonly, '_');
                        if (forkpath != NULL)
@@ -350,7 +357,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly)
                         * the items in the data folder.
                         */
                        if (!sizeonly)
-                               scan_file(fn, segmentno);
+                               scan_file(fn, first_blkno);
                }
 #ifndef WIN32
                else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
index ff0f8ea5e7714cd071f0098c1aaa005a0c7ea051..ad96fe75af543981e47acff3fecce9cfbb48f90a 100644 (file)
@@ -80,6 +80,7 @@ static bool ReadControlFile(void);
 static void GuessControlValues(void);
 static void PrintControlValues(bool guessed);
 static void PrintNewControlValues(void);
+static void AdjustRedoLocation(const char *DataDir);
 static void RewriteControlFile(void);
 static void FindEndOfXLOG(void);
 static void KillExistingXLOG(void);
@@ -510,6 +511,7 @@ main(int argc, char *argv[])
        /*
         * Else, do the dirty deed.
         */
+       AdjustRedoLocation(DataDir);
        RewriteControlFile();
        KillExistingXLOG();
        KillExistingArchiveStatus();
@@ -889,6 +891,80 @@ PrintNewControlValues(void)
 }
 
 
+/*
+ * Compute the new redo location, and copy the pg_undo file to match if
+ * necessary.  Rather than renaming the old file, we create a new copy, so
+ * that a failure occurring before the control file is rewritten won't be
+ * fatal.
+ */
+static void
+AdjustRedoLocation(const char *DataDir)
+{
+       uint64          old_redo = ControlFile.checkPointCopy.redo;
+       char            old_pg_undo_path[MAXPGPATH];
+       char            new_pg_undo_path[MAXPGPATH];
+       int                     old_fd;
+       int                     new_fd;
+       ssize_t         nread;
+       ssize_t         nwritten;
+       char            buffer[1024];
+
+       /*
+        * Adjust fields as needed to force an empty XLOG starting at
+        * newXlogSegNo.
+        */
+       XLogSegNoOffsetToRecPtr(newXlogSegNo, SizeOfXLogLongPHD, WalSegSz,
+                                                       ControlFile.checkPointCopy.redo);
+
+       /* If the redo location didn't move, we don't need to do anything. */
+       if (old_redo == ControlFile.checkPointCopy.redo)
+               return;
+
+       /*
+        * Otherwise we copy the pg_undo file, because its name must match the redo
+        * location.
+        */
+       snprintf(old_pg_undo_path,
+                        sizeof(old_pg_undo_path),
+                        "pg_undo/%016" INT64_MODIFIER "X",
+                        old_redo);
+       snprintf(new_pg_undo_path,
+                        sizeof(new_pg_undo_path),
+                        "pg_undo/%016" INT64_MODIFIER "X",
+                        ControlFile.checkPointCopy.redo);
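+
+       /*
+        * (Illustrative: a new redo location of 0/16B3B60 yields the file name
+        * "pg_undo/00000000016B3B60".)
+        */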
+       old_fd = open(old_pg_undo_path, O_RDONLY, 0);
+       if (old_fd < 0)
+       {
+               pg_log_error("could not open \"%s\": %m", old_pg_undo_path);
+               exit(1);
+       }
+       new_fd = open(new_pg_undo_path, O_RDWR | O_CREAT, 0644);
+       if (new_fd < 0)
+       {
+               pg_log_error("could not create \"%s\": %m", new_pg_undo_path);
+               exit(1);
+       }
+       while ((nread = read(old_fd, buffer, sizeof(buffer))) > 0)
+       {
+               char       *p = buffer;
+
+               do
+               {
+                       nwritten = write(new_fd, p, nread);
+                       if (nwritten < 0)
+                       {
+                               pg_log_error("could not write to \"%s\": %m", new_pg_undo_path);
+                               exit(1);
+                       }
+                       /* Advance past the bytes written, in case of a short write. */
+                       p += nwritten;
+                       nread -= nwritten;
+               } while (nread > 0);
+       }
+       if (nread < 0)
+       {
+               pg_log_error("could not read from \"%s\": %m", old_pg_undo_path);
+               exit(1);
+       }
+       close(old_fd);
+       close(new_fd);
+}
+
 /*
  * Write out the new pg_control file.
  */
index 2cb829acd59309d225ea00f010dec20555f3ea0c..82c80336c5e3c871bdefe691b140039eb6e07176 100644 (file)
@@ -9,7 +9,7 @@ include $(top_builddir)/src/Makefile.global
 
 OBJS = check.o controldata.o dump.o exec.o file.o function.o info.o \
        option.o parallel.o pg_upgrade.o relfilenode.o server.o \
-       tablespace.o util.o version.o $(WIN32RES)
+       tablespace.o undo.o util.o version.o $(WIN32RES)
 
 override CPPFLAGS := -DDLSUFFIX=\"$(DLSUFFIX)\" -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS)
 LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
index 617270f101d4a62f6dd4f188c826a5509ceb7dc7..4a431dce83a24f7ee8620ab77b0b6c6908cb2389 100644 (file)
@@ -21,6 +21,7 @@ static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb);
 static bool equivalent_locale(int category, const char *loca, const char *locb);
 static void check_is_install_user(ClusterInfo *cluster);
 static void check_proper_datallowconn(ClusterInfo *cluster);
+static void check_for_undo_data(ClusterInfo *cluster);
 static void check_for_prepared_transactions(ClusterInfo *cluster);
 static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
 static void check_for_tables_with_oids(ClusterInfo *cluster);
@@ -97,6 +98,7 @@ check_and_dump_old_cluster(bool live_check)
         */
        check_is_install_user(&old_cluster);
        check_proper_datallowconn(&old_cluster);
+       check_for_undo_data(&old_cluster);
        check_for_prepared_transactions(&old_cluster);
        check_for_reg_data_type_usage(&old_cluster);
        check_for_isn_and_int8_passing_mismatch(&old_cluster);
@@ -171,6 +173,7 @@ check_new_cluster(void)
 
        check_is_install_user(&new_cluster);
 
+       check_for_undo_data(&new_cluster);
        check_for_prepared_transactions(&new_cluster);
 }
 
@@ -800,6 +803,46 @@ check_for_prepared_transactions(ClusterInfo *cluster)
 }
 
 
+/*
+ *     check_for_undo_data()
+ *
+ *     Make sure there are no live undo records (aborted transactions that have
+ *     not been rolled back, or committed transactions whose undo data has not
+ *     yet been discarded).
+ */
+static void
+check_for_undo_data(ClusterInfo *cluster)
+{
+       PGresult   *res;
+       PGconn     *conn;
+
+       if (GET_MAJOR_VERSION(cluster->major_version) < 1300)
+               return;
+
+       conn = connectToServer(cluster, "template1");
+       prep_status("Checking for undo data");
+
+       res = executeQueryOrDie(conn,
+                                                       "SELECT * "
+                                                       "FROM pg_catalog.pg_stat_undo_logs "
+                                                       "WHERE discard != insert");
+
+       if (PQntuples(res) != 0)
+       {
+               if (cluster == &old_cluster)
+                       pg_fatal("The source cluster contains live undo data\n");
+               else
+                       pg_fatal("The target cluster contains live undo data\n");
+       }
+
+       PQclear(res);
+
+       PQfinish(conn);
+
+       check_ok();
+}
+
+
 /*
  *     check_for_isn_and_int8_passing_mismatch()
  *
index 38236415bef99f949418f83051215862306dad2a..523e86eeff611d51e4e0a79372d05a40051ea128 100644 (file)
@@ -59,6 +59,7 @@ get_control_data(ClusterInfo *cluster, bool live_check)
        bool            got_date_is_int = false;
        bool            got_data_checksum_version = false;
        bool            got_cluster_state = false;
+       bool            got_redo_location = false;
        char       *lc_collate = NULL;
        char       *lc_ctype = NULL;
        char       *lc_monetary = NULL;
@@ -485,6 +486,23 @@ get_control_data(ClusterInfo *cluster, bool live_check)
                        cluster->controldata.data_checksum_version = str2uint(p);
                        got_data_checksum_version = true;
                }
+               else if ((p = strstr(bufin, "Latest checkpoint's REDO location:")) != NULL)
+               {
+                       uint32          hi;
+                       uint32          lo;
+
+                       p = strchr(p, ':');
+
+                       if (p == NULL || strlen(p) <= 1)
+                               pg_fatal("%d: controldata retrieval problem\n", __LINE__);
+
+                       p++;                            /* remove ':' char */
+
+                       if (sscanf(p, "%X/%X", &hi, &lo) != 2)
+                               pg_fatal("%d: controldata cannot parse REDO location\n", __LINE__);
+                       cluster->controldata.redo_location = (((uint64) hi) << 32) | lo;
+                       got_redo_location = true;
+               }
        }
 
        pclose(output);
@@ -528,6 +546,13 @@ get_control_data(ClusterInfo *cluster, bool live_check)
                }
        }
 
+       /*
+        * If we used pg_resetwal instead of pg_controldata, there is no REDO
+        * location.
+        */
+       if (!got_redo_location)
+               cluster->controldata.redo_location = 0;
+
        /* verify that we got all the mandatory pg_control data */
        if (!got_xid || !got_oid ||
                !got_multi ||
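
The hunk above combines pg_controldata's %X/%X-formatted REDO location into a single 64-bit value. A minimal standalone sketch of the same arithmetic, using a made-up LSN purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int hi, lo;

        /* "16/B2A38000" is a hypothetical REDO location, not from the patch. */
        if (sscanf("16/B2A38000", "%X/%X", &hi, &lo) == 2)
        {
            uint64_t redo = ((uint64_t) hi << 32) | lo;

            /* Prints 0x00000016B2A38000. */
            printf("0x%016llX\n", (unsigned long long) redo);
        }
        return 0;
    }
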
index 036330932820ddc17a719cce8fedfce42c7f9bf1..f23451a876f2698666c9eb986cb4b7229bcbf2a7 100644 (file)
@@ -351,6 +351,10 @@ check_data_dir(ClusterInfo *cluster)
                check_single_dir(pg_data, "pg_clog");
        else
                check_single_dir(pg_data, "pg_xact");
+
+       /* pg_undo is new in v13 */
+       if (GET_MAJOR_VERSION(cluster->major_version) >= 1300)
+               check_single_dir(pg_data, "pg_undo");
 }
 
 
index d1975aab2bc2c9ad3bde1eab3cc18a9519e73ca3..09b786f4724ab1a0e6bce77225ee8f1b90b52db3 100644 (file)
@@ -139,6 +139,8 @@ main(int argc, char **argv)
 
        /* New now using xids of the old system */
 
+       merge_undo_logs();
+
        /* -- NEW -- */
        start_postmaster(&new_cluster, true);
 
index 5d31750d867498f37bed3b425de472e45a867117..d0d93c0682110be80d824641f82c01948d5c14ea 100644 (file)
@@ -227,6 +227,7 @@ typedef struct
        bool            date_is_int;
        bool            float8_pass_by_value;
        bool            data_checksum_version;
+       uint64          redo_location;
 } ControlData;
 
 /*
@@ -465,3 +466,7 @@ void                parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr
                                                                                  char *old_pgdata, char *new_pgdata,
                                                                                  char *old_tablespace);
 bool           reap_child(bool wait_for_child);
+
+/* undo.c */
+
+void           merge_undo_logs(void);
diff --git a/src/bin/pg_upgrade/undo.c b/src/bin/pg_upgrade/undo.c
new file mode 100644 (file)
index 0000000..48397b0
--- /dev/null
@@ -0,0 +1,292 @@
+/*
+ *     undo.c
+ *
+ *     Support for upgrading undo logs.
+ *
+ *     Copyright (c) 2019, PostgreSQL Global Development Group
+ *     src/bin/pg_upgrade/undo.c
+ */
+
+
+#include "postgres_fe.h"
+#include "pg_upgrade.h"
+#include "access/undolog.h"
+
+/*
+ * The relevant parts of UndoLogMetaDataData, in a version-independent format.
+ */
+typedef struct
+{
+       UndoLogNumber logno;
+       UndoLogOffset discard;
+       UndoLogStatus status;
+       UndoLogCategory category;
+       Oid                     tablespace;
+} UndoLogInfo;
+
+/*
+ * Read the header of a pg_undo file and extract basic information.  If the
+ * format of the header changes in later versions, this may need to change
+ * depending on "cluster".
+ */
+static void
+read_pg_undo_header(int fd, ClusterInfo *cluster, UndoLogNumber *low_logno,
+                                       UndoLogNumber *next_logno, UndoLogNumber *num_logs)
+{
+       pg_crc32c crc;
+
+       /* Read the header, much like StartupUndoLogs(). */
+       if (read(fd, low_logno, sizeof(*low_logno)) != sizeof(*low_logno) ||
+               read(fd, next_logno, sizeof(*next_logno)) != sizeof(*next_logno) ||
+               read(fd, num_logs, sizeof(*num_logs)) != sizeof(*num_logs) ||
+               read(fd, &crc, sizeof(crc)) != sizeof(crc))
+               pg_fatal("pg_undo file is corrupted or cannot be read\n");
+}
+
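
For orientation, the header read here maps onto the file layout sketched below; the struct name is invented for illustration and the layout is inferred from the read and write sequences in this file:

    /* Hypothetical layout of a pg_undo checkpoint file (sketch only). */
    typedef struct PgUndoFileHeader
    {
        UndoLogNumber low_logno;    /* lowest log not entirely discarded */
        UndoLogNumber next_logno;   /* next log number to allocate */
        UndoLogNumber num_logs;     /* number of UndoLogMetaData entries */
        pg_crc32c     crc;          /* CRC-32C of the three fields above */
        /* ...followed by num_logs UndoLogMetaData structs and a final CRC */
    } PgUndoFileHeader;
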
+/*
+ * Read a single UndoLogMetaData object.  If the format changes in later
+ * versions, this may need to change to be able to read different structs
+ * depending on "cluster".
+ */
+static void
+read_one_undo_log(int fd, ClusterInfo *cluster, UndoLogInfo *info)
+{
+       UndoLogMetaData meta_data;
+       int             rc;
+
+       rc = read(fd, &meta_data, sizeof(meta_data));
+       if (rc < 0)
+               pg_fatal("could not read undo log meta-data: %m");
+       else if (rc != sizeof(meta_data))
+               pg_fatal("could not read undo log meta-data: expect %zu bytes but read only %d bytes",
+                                sizeof(meta_data), rc);
+
+       info->logno = meta_data.logno;
+       info->category = meta_data.category;
+       info->tablespace = meta_data.tablespace;
+       info->discard = meta_data.discard;
+       info->status = meta_data.status;
+}
+
+static void
+merge_undo_log(UndoLogInfo *logs, UndoLogNumber *num_logs,
+                          const UndoLogInfo *info)
+{
+       UndoLogNumber i;
+
+       /* Do we already have an entry for this logno? */
+       for (i = 0; i < *num_logs; ++i)
+       {
+               if (logs[i].logno == info->logno)
+               {
+                       /*
+                        * Take the highest discard offset, so that any pointers that
+                        * originated in either cluster appear to be discarded.
+                        */
+                       if (logs[i].discard < info->discard)
+                               logs[i].discard = info->discard;
+
+                       /*
+                        * Take the highest status so that entirely discarded logs trump
+                        * active logs.
+                        */
+                       StaticAssertStmt(UNDO_LOG_STATUS_ACTIVE < UNDO_LOG_STATUS_FULL,
+                                                        "undo log status out of order");
+                       StaticAssertStmt(UNDO_LOG_STATUS_FULL < UNDO_LOG_STATUS_DISCARDED,
+                                                        "undo log status out of order");
+                       if (logs[i].status < info->status)
+                               logs[i].status = info->status;
+
+                       /*
+                        * Take the most persistent persistence level.  While we could
+                        * just convert them all to permanent and it wouldn't hurt, it's
+                        * probably a better idea to keep about the same number of each
+                        * persistence level, so that a system that has stabilized with
+                        * those numbers will continue to be stable after the upgrade (ie
+                        * not suddenly need to create more undo logs of different
+                        * levels).  The most permanent is the best choice, because TEMP
+                        * undo logs might be rewound in future.
+                        */
+                       StaticAssertStmt(UNDO_PERMANENT < UNDO_UNLOGGED,
+                                                        "undo log categories out of order");
+                       StaticAssertStmt(UNDO_UNLOGGED < UNDO_TEMP,
+                                                        "undo log categories out of order");
+                       if (logs[i].category > info->category)
+                               logs[i].category = info->category;
+
+                       /*
+                        * Take the highest tablespace OID.  The choice of 'highest' is
+                        * arbitrary (we don't really expect the new cluster to have more
+                        * than one log), but it seems useful to preserve the distribution
+                        * of tablespaces from the old cluster for stability, as above.
+                        */
+                       if (logs[i].tablespace < info->tablespace)
+                               logs[i].tablespace = info->tablespace;
+                       return;
+               }
+       }
+
+       /* Otherwise create a new entry. */
+       logs[(*num_logs)++] = *info;
+}
+
+/*
+ * We need to merge the old undo logs and the new undo logs.  We know that
+ * there is no live undo data (see check_for_undo_data()), but we need to
+ * make sure that any undo record pointers that exist in the old OR new
+ * cluster appear as discarded.  That is, any log numbers that are entirely
+ * discarded in either cluster appear as entirely discarded, and we retain
+ * the higher of the discard pointers in any log that is active.  This is
+ * mostly a theoretical concern for now, but perhaps a future release will be
+ * able to create higher undo record pointers during initdb than the old
+ * cluster had, so let's use an algorithm that doesn't make any assumptions
+ * about that.
+ */
+void
+merge_undo_logs(void)
+{
+       char            old_pg_undo_path[MAXPGPATH];
+       char            new_pg_undo_path[MAXPGPATH];
+       UndoLogInfo *logs;
+       UndoLogNumber num_logs;
+       UndoLogNumber num_old_logs;
+       UndoLogNumber old_low_logno;
+       UndoLogNumber old_next_logno;
+       UndoLogNumber num_new_logs;
+       UndoLogNumber new_low_logno;
+       UndoLogNumber new_next_logno;
+       UndoLogNumber i;
+       int                     old_fd;
+       int                     new_fd;
+       pg_crc32c       crc;
+
+       /* If the old cluster has no undo logs, there is nothing to do */
+       if (GET_MAJOR_VERSION(old_cluster.major_version) < 1300)
+               return;
+
+       /*
+        * Open the pg_undo files corresponding to the old and new redo locations.
+        * First, we'll reload pg_controldata output, so that we have up-to-date
+        * redo locations.
+        */
+       get_control_data(&old_cluster, true);
+       get_control_data(&new_cluster, true);
+       snprintf(old_pg_undo_path,
+                        sizeof(old_pg_undo_path),
+                        "%s/pg_undo/%016" INT64_MODIFIER "X",
+                        old_cluster.pgdata,
+                        old_cluster.controldata.redo_location);
+       snprintf(new_pg_undo_path,
+                        sizeof(new_pg_undo_path),
+                        "%s/pg_undo/%016" INT64_MODIFIER "X",
+                        new_cluster.pgdata,
+                        new_cluster.controldata.redo_location);
+       old_fd = open(old_pg_undo_path, O_RDONLY, 0);
+       if (old_fd < 0)
+               pg_fatal("could not open file \"%s\": %m\n", old_pg_undo_path);
+       new_fd = open(new_pg_undo_path, O_RDWR, 0);
+       if (new_fd < 0)
+               pg_fatal("could not open file \"%s\": %m\n", new_pg_undo_path);
+
+       /* Read the headers */
+       read_pg_undo_header(old_fd, &old_cluster, &old_low_logno, &old_next_logno, &num_old_logs);
+       read_pg_undo_header(new_fd, &new_cluster, &new_low_logno, &new_next_logno, &num_new_logs);
+
+       /* Allocate workspace that is sure to be enough for the merged set */
+       logs = malloc(sizeof(*logs) * (num_old_logs + num_new_logs));
+       if (logs == NULL)
+               pg_fatal("out of memory\n");
+       num_logs = 0;
+
+       /*
+        * Anything below the "low" logno has been entirely discarded, so we'll
+        * take the higher of the two values.  Likewise, the "next" log number to
+        * allocate should be the higher of the two.
+        */
+       new_low_logno = Max(old_low_logno, new_low_logno);
+       new_next_logno = Max(old_next_logno, new_next_logno);
+
+       /* Merge in the old logs */
+       while (num_old_logs > 0)
+       {
+               UndoLogInfo     info;
+
+               read_one_undo_log(old_fd, &old_cluster, &info);
+               merge_undo_log(logs, &num_logs, &info);
+               --num_old_logs;
+       }
+
+       /* Merge in the new logs */
+       while (num_new_logs > 0)
+       {
+               UndoLogInfo     info;
+
+               read_one_undo_log(new_fd, &new_cluster, &info);
+               merge_undo_log(logs, &num_logs, &info);
+               --num_new_logs;
+       }
+
+       close(old_fd);
+
+       /* Now write out the new file, much like CheckPointUndoLogs() */
+       if (ftruncate(new_fd, 0) < 0)
+               pg_fatal("could not truncate file \"%s\": %m", new_pg_undo_path);
+       if (lseek(new_fd, SEEK_SET, 0) < 0)
+               pg_fatal("could not seek to start of file \"%s\": %m", new_pg_undo_path);
+
+       /* Compute header checksum */
+       INIT_CRC32C(crc);
+       COMP_CRC32C(crc, &new_low_logno, sizeof(new_low_logno));
+       COMP_CRC32C(crc, &new_next_logno, sizeof(new_next_logno));
+       COMP_CRC32C(crc, &num_logs, sizeof(num_logs));
+       FIN_CRC32C(crc);
+
+       /* Write out the header */
+       if ((write(new_fd, &new_low_logno, sizeof(new_low_logno)) != sizeof(new_low_logno)) ||
+               (write(new_fd, &new_next_logno, sizeof(new_next_logno)) != sizeof(new_next_logno)) ||
+               (write(new_fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) ||
+               (write(new_fd, &crc, sizeof(crc)) != sizeof(crc)))
+               pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path);
+
+       /* Write out the undo logs */
+       INIT_CRC32C(crc);
+       for (i = 0; i < num_logs; ++i)
+       {
+               UndoLogMetaData meta_data;
+               UndoLogInfo     *info = &logs[i];
+               UndoLogOffset end;
+
+               memset(&meta_data, 0, sizeof(meta_data));
+               meta_data.logno = info->logno;
+
+               /*
+                * Round the discard offset up so that it points to the first byte in
+                * a segment, and assign that to all three offsets.  That means there
+                * is no logical data, and there are no physical files.
+                */
+               end = ((info->discard + UndoLogSegmentSize - 1) / UndoLogSegmentSize)
+                       * UndoLogSegmentSize;
+               meta_data.unlogged.insert = meta_data.discard = meta_data.end = end;
+
+               /*
+                * Keep whatever was the highest status (though it probably
+                * wouldn't hurt if we set them all to ACTIVE), and carry over the
+                * merged tablespace and category.
+                */
+               meta_data.status = info->status;
+               meta_data.tablespace = info->tablespace;
+               meta_data.category = info->category;
+
+               if (write(new_fd, &meta_data, sizeof(meta_data)) != sizeof(meta_data))
+                       pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path);
+
+               COMP_CRC32C(crc, &meta_data, sizeof(meta_data));
+       }
+       FIN_CRC32C(crc);
+
+       if (write(new_fd, &crc, sizeof(crc)) != sizeof(crc))
+               pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path);
+
+       close(new_fd);
+       free(logs);
+}
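
The discard-offset rounding in merge_undo_logs() is worth seeing in isolation. A self-contained sketch, assuming the 1MB segment size that undolog.h defines for the default BLCKSZ:

    #include <stdint.h>

    #define SEGMENT_SIZE ((uint64_t) 1024 * 1024)   /* UndoLogSegmentSize */

    /* Round an offset up to the first byte of the next segment boundary. */
    static uint64_t
    round_up_to_segment(uint64_t discard)
    {
        return ((discard + SEGMENT_SIZE - 1) / SEGMENT_SIZE) * SEGMENT_SIZE;
    }

    /*
     * round_up_to_segment(0x180000) == 0x200000 (mid-segment rounds up);
     * round_up_to_segment(0x100000) == 0x100000 (a boundary is unchanged).
     * With insert == discard == end, the log holds no logical data and
     * needs no physical segment files.
     */
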
index 852d8ca4b1c6cbdeee4601b84fda27dfed396796..938150dd915e2ab223c914e5dbb41a8e97133d52 100644 (file)
@@ -20,6 +20,7 @@
 #include "access/nbtxlog.h"
 #include "access/rmgr.h"
 #include "access/spgxlog.h"
+#include "access/undolog_xlog.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
 #include "catalog/storage_xlog.h"
index 3c0db2ccf5fd9c5e256a78a5d1290b2f70e4ba63..6945e3e9504c6ef254c38edd82e859ea9ca0b0e4 100644 (file)
@@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i
 PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
 PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
 PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+PG_RMGR(RM_UNDOLOG_ID, "UndoLog", undolog_redo, undolog_desc, undolog_identify, NULL, NULL, NULL)
index 8fba568029942dc061981c8d5ef275659d4e893c..da0770e61486a500681a01869a6516fea4a7aa61 100644 (file)
@@ -17,6 +17,9 @@
 /* Avoid including typcache.h */
 struct SharedRecordTypmodRegistry;
 
+/* Avoid including undolog.h */
+struct UndoLogSlot;
+
 /*
  * A struct encapsulating some elements of a user's session.  For now this
 * manages state that applies to parallel query, but in principle it could
@@ -27,6 +30,10 @@ typedef struct Session
        dsm_segment *segment;           /* The session-scoped DSM segment. */
        dsa_area   *area;                       /* The session-scoped DSA area. */
 
+       /* State managed by undolog.c. */
+       struct UndoLogSlot *attached_undo_slots[4];             /* UndoLogCategories */
+       bool            need_to_choose_undo_tablespace;
+
        /* State managed by typcache.c. */
        struct SharedRecordTypmodRegistry *shared_typmod_registry;
        dshash_table *shared_record_table;
diff --git a/src/include/access/undolog.h b/src/include/access/undolog.h
new file mode 100644 (file)
index 0000000..c4a6b29
--- /dev/null
@@ -0,0 +1,489 @@
+/*-------------------------------------------------------------------------
+ *
+ * undolog.h
+ *
+ * PostgreSQL undo log manager.  This module is responsible for lifecycle
+ * management of undo logs and backing files, associating undo logs with
+ * backends, allocating and managing space within undo logs.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/undolog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef UNDOLOG_H
+#define UNDOLOG_H
+
+#include "access/transam.h"
+#include "access/xlogreader.h"
+#include "catalog/database_internal.h"
+#include "catalog/pg_class.h"
+#include "common/relpath.h"
+#include "storage/bufpage.h"
+
+#ifndef FRONTEND
+#include "storage/lwlock.h"
+#endif
+
+/* The type used to identify an undo log and position within it. */
+typedef uint64 UndoRecPtr;
+
+/* The type used for undo record lengths. */
+typedef uint16 UndoRecordSize;
+
+/* Undo log statuses. */
+typedef enum
+{
+       UNDO_LOG_STATUS_UNUSED = 0,
+       UNDO_LOG_STATUS_ACTIVE,
+       UNDO_LOG_STATUS_FULL,
+       UNDO_LOG_STATUS_DISCARDED
+} UndoLogStatus;
+
+/*
+ * Undo log categories.  These correspond to the different persistence levels
+ * of relations so that we can discard unlogged and temporary undo data
+ * wholesale in some circumstances.  We also have a separate category for
+ * 'shared' records that are not associated with a single transaction.  Since
+ * they might live longer than the transaction that created them, and since we
+ * prefer to avoid interleaving records that don't belong to the same
+ * transaction, we keep them separate.
+ */
+typedef enum
+{
+       UNDO_PERMANENT = 0,
+       UNDO_UNLOGGED = 1,
+       UNDO_TEMP = 2,
+       UNDO_SHARED = 3
+} UndoLogCategory;
+
+#define UndoLogCategories 4
+
+/*
+ * Convert from relpersistence ('p', 'u', 't') to an UndoLogCategory
+ * enumerator.
+ */
+#define UndoLogCategoryForRelPersistence(rp)                                           \
+       ((rp) == RELPERSISTENCE_PERMANENT ? UNDO_PERMANENT :                    \
+        (rp) == RELPERSISTENCE_UNLOGGED ? UNDO_UNLOGGED : UNDO_TEMP)
+
+/*
+ * Convert from UndoLogCategory to a relpersistence value.  There is no
+ * relpersistence level for UNDO_SHARED, but the only use of this macro is to
+ * pass a value to ReadBufferWithoutRelcache, which cares only about detecting
+ * RELPERSISTENCE_TEMP.  XXX There must be a better way.
+ */
+#define RelPersistenceForUndoLogCategory(up)                           \
+       ((up) == UNDO_PERMANENT ? RELPERSISTENCE_PERMANENT :    \
+        (up) == UNDO_UNLOGGED ? RELPERSISTENCE_UNLOGGED :              \
+        (up) == UNDO_SHARED ? RELPERSISTENCE_PERMANENT :               \
+        RELPERSISTENCE_TEMP)
+
+/*
+ * Get the appropriate UndoLogCategory value from a Relation.
+ */
+#define UndoLogCategoryForRelation(rel)                                                                        \
+       (UndoLogCategoryForRelPersistence((rel)->rd_rel->relpersistence))
+
+/* Type for offsets within undo logs */
+typedef uint64 UndoLogOffset;
+
+/* printf-family format string for UndoRecPtr. */
+#define UndoRecPtrFormat "%016" INT64_MODIFIER "X"
+
+/* printf-family format string for UndoLogOffset. */
+#define UndoLogOffsetFormat UINT64_FORMAT
+
+/* Number of blocks of BLCKSZ in an undo log segment file.  128 = 1MB. */
+#define UNDOSEG_SIZE 128
+
+/* Size of an undo log segment file in bytes. */
+#define UndoLogSegmentSize ((size_t) BLCKSZ * UNDOSEG_SIZE)
+
+/* The width of an undo log number in bits.  24 allows for 16.7m logs. */
+#define UndoLogNumberBits 24
+
+/* The maximum valid undo log number. */
+#define MaxUndoLogNumber ((1 << UndoLogNumberBits) - 1)
+
+/* The width of an undo log offset in bits.  40 allows for 1TB per log. */
+#define UndoLogOffsetBits (64 - UndoLogNumberBits)
+
+/* Special value for undo record pointer which indicates that it is invalid. */
+#define        InvalidUndoRecPtr       ((UndoRecPtr) 0)
+
+/* End-of-list value when building linked lists of undo logs. */
+#define InvalidUndoLogNumber -1
+
+/*
+ * The maximum amount of data that can be stored in an undo log.  Can be set
+ * artificially low to test full log behavior.
+ */
+#define UndoLogMaxSize ((UndoLogOffset) 1 << UndoLogOffsetBits)
+
+/* Type for numbering undo logs. */
+typedef int UndoLogNumber;
+
+/* Extract the undo log number from an UndoRecPtr. */
+#define UndoRecPtrGetLogNo(urp)                                        \
+       ((urp) >> UndoLogOffsetBits)
+
+/* Extract the offset from an UndoRecPtr. */
+#define UndoRecPtrGetOffset(urp)                               \
+       ((urp) & ((UINT64CONST(1) << UndoLogOffsetBits) - 1))
+
+/* Make an UndoRecPtr from a log number and offset. */
+#define MakeUndoRecPtr(logno, offset)                  \
+       (((uint64) (logno) << UndoLogOffsetBits) | (offset))
+
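
A worked round-trip of the three macros above, with arbitrarily chosen values; the Asserts are illustrative and not part of the patch:

    UndoRecPtr  urp = MakeUndoRecPtr(3, 0x1234);

    Assert(urp == UINT64CONST(0x0000030000001234));  /* 24-bit logno, 40-bit offset */
    Assert(UndoRecPtrGetLogNo(urp) == 3);
    Assert(UndoRecPtrGetOffset(urp) == 0x1234);
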
+/* The number of unusable bytes in the header of each block. */
+#define UndoLogBlockHeaderSize SizeOfPageHeaderData
+
+/* The number of usable bytes we can store per block. */
+#define UndoLogUsableBytesPerPage (BLCKSZ - UndoLogBlockHeaderSize)
+
+/* Length of undo checkpoint filename */
+#define UNDO_CHECKPOINT_FILENAME_LENGTH        16
+
+/*
+ * UndoRecPtrIsValid
+ *             True iff undoRecPtr is valid.
+ */
+#define UndoRecPtrIsValid(undoRecPtr) \
+       ((bool) ((UndoRecPtr) (undoRecPtr) != InvalidUndoRecPtr))
+
+/* Extract the relnode for an undo log. */
+#define UndoRecPtrGetRelNode(urp)                              \
+       UndoRecPtrGetLogNo(urp)
+
+/* The only valid fork number for undo log buffers. */
+#define UndoLogForkNum MAIN_FORKNUM
+
+/* Compute the block number that holds a given UndoRecPtr. */
+#define UndoRecPtrGetBlockNum(urp)                             \
+       (UndoRecPtrGetOffset(urp) / BLCKSZ)
+
+/* Compute the offset of a given UndoRecPtr in the page that holds it. */
+#define UndoRecPtrGetPageOffset(urp)                   \
+       (UndoRecPtrGetOffset(urp) % BLCKSZ)
+
+/* Compare two undo checkpoint files to find the oldest file. */
+#define UndoCheckPointFilenamePrecedes(file1, file2)   \
+       (strcmp(file1, file2) < 0)
+
+/* What is the offset of the i'th non-header byte? */
+#define UndoLogOffsetFromUsableByteNo(i)                                                               \
+       (((i) / UndoLogUsableBytesPerPage) * BLCKSZ +                                           \
+        UndoLogBlockHeaderSize +                                                                                       \
+        ((i) % UndoLogUsableBytesPerPage))
+
+/* How many non-header bytes are there before a given offset? */
+#define UndoLogOffsetToUsableByteNo(offset)                            \
+       (((offset) % BLCKSZ - UndoLogBlockHeaderSize) +         \
+        ((offset) / BLCKSZ) * UndoLogUsableBytesPerPage)
+
+/* Add 'n' usable bytes to offset stepping over headers to find new offset. */
+#define UndoLogOffsetPlusUsableBytes(offset, n)                                                        \
+       UndoLogOffsetFromUsableByteNo(UndoLogOffsetToUsableByteNo(offset) + (n))
+
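
To make the header-skipping arithmetic concrete, here is a worked example assuming BLCKSZ = 8192 and SizeOfPageHeaderData = 24 (so 8168 usable bytes per page); again, these Asserts are illustrative only:

    /* Usable byte 0 sits just after the first page header. */
    Assert(UndoLogOffsetFromUsableByteNo(0) == 24);

    /* Usable byte 8168 is the first payload byte of the second page. */
    Assert(UndoLogOffsetFromUsableByteNo(8168) == 8192 + 24);

    /* Adding 10 usable bytes at offset 8190 steps over the next header. */
    Assert(UndoLogOffsetPlusUsableBytes(8190, 10) == 8192 + 24 + 8);
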
+/* Populate a RelFileNode from an UndoRecPtr. */
+#define UndoRecPtrAssignRelFileNode(rfn, urp)                                                           \
+       do                                                                                                                                               \
+       {                                                                                                                                                \
+               (rfn).spcNode = UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp)); \
+               (rfn).dbNode = UndoDbOid;                                                                                        \
+               (rfn).relNode = UndoRecPtrGetRelNode(urp);                                                       \
+       } while (false)
+
+/*
+ * Properties of an undo log that don't have explicit WAL records logging
+ * their changes, to reduce WAL volume.  Instead, they change incrementally
+ * whenever data is inserted as a result of other WAL records.  Since the
+ * values recorded in an online checkpoint may be out of sync (i.e., not the
+ * correct values as at the redo LSN), these are backed up in buffer data on
+ * first change after each checkpoint.
+ */
+typedef struct UndoLogUnloggedMetaData
+{
+       UndoLogOffset insert;                   /* next insertion point (head) */
+       UndoLogOffset last_xact_start;  /* last transaction's first byte in this log */
+       UndoLogOffset this_xact_start;  /* this transaction's first byte in this log */
+       TransactionId xid;                              /* currently attached/writing xid */
+
+       /*
+        * The two variables below are used during recovery, when a transaction's
+        * undo records are split across undo logs.  Replaying a log switch
+        * restores these two undo record pointers, which are reset on the next
+        * allocation during recovery.
+        */
+       UndoRecPtr      prevlog_xact_start; /* Transaction's start undo record pointer
+                                                                        * in the previous log. */
+       UndoRecPtr      prevlog_last_urp;       /* Transaction's last undo record pointer in
+                                                                        * the previous log. */
+} UndoLogUnloggedMetaData;
+
+/*
+ * Control metadata for an active undo log.  Lives in shared memory inside an
+ * UndoLogSlot object, but also written to disk during checkpoints.
+ */
+typedef struct UndoLogMetaData
+{
+       /* Members that are not managed by explicit WAL logs. */
+       UndoLogUnloggedMetaData unlogged;
+
+       /* Members that are fixed for the lifetime of the undo log. */
+       UndoLogNumber logno;
+       Oid             tablespace;
+       UndoLogCategory category;
+
+       /* Members that are changed by explicit WAL records. */
+       UndoLogStatus status;
+       UndoLogOffset end;                              /* one past end of highest segment */
+       UndoLogOffset discard;                  /* oldest data needed (tail) */
+} UndoLogMetaData;
+
+/*
+ * Context used to hold undo log state across all the undo log insertions
+ * corresponding to a single WAL record.
+ */
+typedef struct UndoLogAllocContext
+{
+       UndoLogCategory category;
+       UndoRecPtr      try_location;
+       XLogReaderState *xlog_record;
+       UndoLogNumber recovery_logno;
+       uint8           recovery_block_id;
+       bool            new_shared_record_set;
+
+       /*
+        * The maximum number of undo logs that a single WAL record could insert
+        * into, modifying their unlogged meta data.  Typically the number is 1,
+        * but it can be higher in rare cases where an undo log runs out of space.
+        */
+#define MAX_META_DATA_IMAGES 4
+       int                     num_meta_data_images;
+       struct
+       {
+               UndoLogNumber logno;
+               UndoLogUnloggedMetaData data;
+       } meta_data_images[MAX_META_DATA_IMAGES];
+} UndoLogAllocContext;
+
+#ifndef FRONTEND
+
+/*
+ * The in-memory control object for an undo log.  We have a fixed-sized array
+ * of these.
+ */
+typedef struct UndoLogSlot
+{
+       /*
+        * Protected by UndoLogLock and 'mutex'.  Both must be held to steal this
+        * slot for another undolog.  Either may be held to prevent that from
+        * happening.
+        */
+       UndoLogNumber logno;                    /* InvalidUndoLogNumber for unused slots */
+
+       /* Protected by UndoLogLock. */
+       UndoLogNumber next_free;                /* link for active unattached undo logs */
+
+       /* Protected by 'mutex'. */
+       LWLock          mutex;
+       UndoLogMetaData meta;                   /* current meta-data */
+       pid_t           pid;                            /* InvalidPid for unattached */
+
+       /* Protected by 'discard_lock'.  State used by undo workers. */
+       FullTransactionId       wait_fxmin;             /* trigger for processing this log again */
+       UndoRecPtr      oldest_data;
+       LWLock          discard_lock;           /* prevents discarding while reading */
+       LWLock          discard_update_lock;    /* block updaters during discard */
+} UndoLogSlot;
+
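
A hedged sketch of the locking rules stated in the struct comments: a backend that already holds a pointer to a slot takes that slot's mutex to read or update its metadata consistently (the snippet below is illustrative, not from the patch):

    /* Hypothetical reader; 'slot' is assumed valid while we stay attached. */
    UndoLogOffset insert;

    LWLockAcquire(&slot->mutex, LW_EXCLUSIVE);
    insert = slot->meta.unlogged.insert;
    LWLockRelease(&slot->mutex);
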
+extern UndoLogSlot *UndoLogGetSlot(UndoLogNumber logno, bool missing_ok);
+extern UndoLogSlot *UndoLogNextSlot(UndoLogSlot *slot);
+extern bool AmAttachedToUndoLogSlot(UndoLogSlot *slot);
+extern UndoRecPtr UndoLogGetOldestRecord(UndoLogNumber logno, bool *full);
+
+/*
+ * Each backend maintains a small hash table mapping undo log numbers to
+ * UndoLogSlot objects in shared memory.
+ *
+ * We also cache the tablespace, category and a recently observed discard
+ * pointer here, since we need fast access to those.  We could also reach them
+ * via slot->meta, but they can't be accessed without locking (since the
+ * UndoLogSlot object might be recycled if the log is entirely discarded).
+ * Since tablespace and category are constant for the lifetime of the undo log
+ * number, and the discard pointer only travels in one direction, there is no
+ * cache invalidation problem to worry about.
+ */
+typedef struct UndoLogTableEntry
+{
+       UndoLogNumber   number;
+       UndoLogSlot        *slot;
+       Oid                             tablespace;
+       UndoLogCategory category;
+       UndoRecPtr              recent_discard;
+       char                    status;                 /* used by simplehash */
+} UndoLogTableEntry;
+
+/*
+ * Instantiate fast inline hash table access functions.  We use an identity
+ * hash function for speed, since we already have integers and don't expect
+ * many collisions.
+ */
+#define SH_PREFIX undologtable
+#define SH_ELEMENT_TYPE UndoLogTableEntry
+#define SH_KEY_TYPE UndoLogNumber
+#define SH_KEY number
+#define SH_HASH_KEY(tb, key) (key)
+#define SH_EQUAL(tb, a, b) ((a) == (b))
+#define SH_SCOPE static inline
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+extern PGDLLIMPORT undologtable_hash *undologtable_cache;
+
+/*
+ * Find or create an UndoLogTableEntry for this log number.  This is used
+ * only for fast look-ups of tablespace and category.
+ */
+static pg_attribute_always_inline UndoLogTableEntry *
+UndoLogGetTableEntry(UndoLogNumber logno)
+{
+       UndoLogTableEntry  *entry;
+
+       /* Fast path. */
+       entry = undologtable_lookup(undologtable_cache, logno);
+       if (likely(entry))
+               return entry;
+
+       /* Slow path: force cache entry to be created. */
+       UndoLogGetSlot(logno, false);
+       entry = undologtable_lookup(undologtable_cache, logno);
+
+       return entry;
+}
+
+/*
+ * Look up the tablespace for an undo log in our cache.
+ */
+static inline Oid
+UndoLogNumberGetTablespace(UndoLogNumber logno)
+{
+       return UndoLogGetTableEntry(logno)->tablespace;
+}
+
+static inline Oid
+UndoRecPtrGetTablespace(UndoRecPtr urp)
+{
+       return UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp));
+}
+
+/*
+ * Look up the category for an undo log in our cache.
+ */
+static inline UndoLogCategory
+UndoLogNumberGetCategory(UndoLogNumber logno)
+{
+       return UndoLogGetTableEntry(logno)->category;
+}
+
+static inline UndoLogCategory
+UndoRecPtrGetCategory(UndoRecPtr urp)
+{
+       return UndoLogNumberGetCategory(UndoRecPtrGetLogNo(urp));
+}
+
+#endif
+
+/* Space management. */
+extern void UndoLogBeginInsert(UndoLogAllocContext *context,
+                                                          UndoLogCategory category,
+                                                          XLogReaderState *xlog_record);
+extern void UndoLogRegister(UndoLogAllocContext *context,
+                                                       uint8 block_id,
+                                                       UndoLogNumber logno);
+extern UndoRecPtr UndoLogAllocate(UndoLogAllocContext *context,
+                                                                 uint16 size,
+                                                                 bool *need_xact_header,
+                                                                 UndoRecPtr *last_xact_start,
+                                                                 UndoRecPtr *prevlog_xact_start,
+                                                                 UndoRecPtr *prevlog_insert_urp);
+extern UndoRecPtr UndoLogAllocateInRecovery(UndoLogAllocContext *context,
+                                                                                       TransactionId xid,
+                                                                                       uint16 size,
+                                                                                       bool *need_xact_header,
+                                                                                       UndoRecPtr *last_xact_start,
+                                                                                       UndoRecPtr *prevlog_xact_start,
+                                                                                       UndoRecPtr *prevlog_last_urp);
+extern void UndoLogAdvance(UndoLogAllocContext *context, size_t size);
+extern void UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size);
+extern bool UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid);
+extern bool UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer);
+
+#ifndef FRONTEND
+
+/*
+ * Check if an undo log pointer is discarded.
+ */
+static inline bool
+UndoRecPtrIsDiscarded(UndoRecPtr pointer)
+{
+       UndoLogNumber   logno = UndoRecPtrGetLogNo(pointer);
+       UndoRecPtr              recent_discard;
+
+       /* See if we can answer the question without acquiring any locks. */
+       recent_discard = UndoLogGetTableEntry(logno)->recent_discard;
+       if (likely(recent_discard > pointer))
+               return true;
+
+       /*
+        * It might be discarded or not, but we'll need to do a bit more work to
+        * find out.
+        */
+       return UndoLogRecPtrIsDiscardedSlowPath(pointer);
+}
+
+#endif
+
+/* Initialization interfaces. */
+extern void StartupUndoLogs(XLogRecPtr checkPointRedo);
+extern void UndoLogShmemInit(void);
+extern Size UndoLogShmemSize(void);
+extern void UndoLogInit(void);
+extern void UndoLogDirectory(Oid tablespace, char *path);
+extern void UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace,
+                                                          char *path);
+extern void ResetUndoLogs(UndoLogCategory category);
+
+/* Interface used by tablespace.c. */
+extern bool DropUndoLogsInTablespace(Oid tablespace);
+
+/* GUC interfaces. */
+extern void assign_undo_tablespaces(const char *newval, void *extra);
+
+/* Checkpointing interfaces. */
+extern void CheckPointUndoLogs(XLogRecPtr checkPointRedo,
+                                                          XLogRecPtr priorCheckPointRedo);
+
+extern UndoRecPtr UndoLogGetLastXactStartPoint(UndoLogNumber logno);
+extern UndoRecPtr UndoLogGetNextInsertPtr(UndoLogNumber logno);
+extern void UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno,
+                                                                               UndoRecPtr prevlog_last_urp,
+                                                                               UndoRecPtr prevlog_xact_start);
+extern void UndoLogSetLSN(XLogRecPtr lsn);
+extern void UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno);
+
+/* Redo interface. */
+extern void undolog_redo(XLogReaderState *record);
+
+/* Discard the undo logs for temp tables. */
+extern void TempUndoDiscard(UndoLogNumber);
+
+/* Test-only interfaces. */
+extern void UndoLogDetachFull(void);
+
+#endif
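
The space-management declarations above are designed to be called as a sequence. A hedged sketch of one hypothetical non-recovery caller (the 64-byte size and the variable names are illustrative, not taken from the patch):

    UndoLogAllocContext ctx;
    UndoRecPtr  where, last_xact_start, prevlog_xact_start, prevlog_insert;
    bool        need_xact_header;

    /* Permanent-category undo, not driven by WAL replay. */
    UndoLogBeginInsert(&ctx, UNDO_PERMANENT, NULL);

    /* Reserve 64 bytes; the out-parameters report transaction header needs. */
    where = UndoLogAllocate(&ctx, 64, &need_xact_header,
                            &last_xact_start, &prevlog_xact_start,
                            &prevlog_insert);

    /* ... copy the undo record into buffers starting at 'where' ... */

    /* Confirm consumption so the insert pointer advances past page headers. */
    UndoLogAdvance(&ctx, 64);
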
diff --git a/src/include/access/undolog_xlog.h b/src/include/access/undolog_xlog.h
new file mode 100644 (file)
index 0000000..aa9003b
--- /dev/null
@@ -0,0 +1,62 @@
+/*-------------------------------------------------------------------------
+ *
+ * undolog_xlog.h
+ *       undo log access XLOG definitions.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/undolog_xlog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef UNDOLOG_XLOG_H
+#define UNDOLOG_XLOG_H
+
+#include "access/undolog.h"
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+
+/* XLOG records */
+#define XLOG_UNDOLOG_CREATE            0x00
+#define XLOG_UNDOLOG_EXTEND            0x10
+#define XLOG_UNDOLOG_DISCARD   0x20
+#define XLOG_UNDOLOG_SWITCH            0x30
+
+/* Create a new undo log. */
+typedef struct xl_undolog_create
+{
+       UndoLogNumber logno;
+       Oid             tablespace;
+       UndoLogCategory category;
+} xl_undolog_create;
+
+/* Extend an undo log by adding a new segment. */
+typedef struct xl_undolog_extend
+{
+       UndoLogNumber logno;
+       UndoLogOffset end;
+} xl_undolog_extend;
+
+/* Discard space, and possibly destroy or recycle undo log segments. */
+typedef struct xl_undolog_discard
+{
+       UndoLogNumber logno;
+       UndoLogOffset discard;
+       UndoLogOffset end;
+       TransactionId latestxid;        /* latest xid whose undo records are discarded */
+       bool              entirely_discarded;
+} xl_undolog_discard;
+
+/* Switch undo log. */
+typedef struct xl_undolog_switch
+{
+       UndoLogNumber logno;
+       UndoRecPtr prevlog_xact_start;
+       UndoRecPtr prevlog_last_urp;
+} xl_undolog_switch;
+
+extern void undolog_desc(StringInfo buf, XLogReaderState *record);
+extern const char *undolog_identify(uint8 info);
+
+#endif
index 4105b59904b457f09ddc0fa92e8bd9ca9d5b07f6..4db68f1082c996865ac1e015aa293c54ce8e9625 100644 (file)
@@ -44,6 +44,20 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record,
 extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                                                                         BlockNumber blkno, ReadBufferMode mode);
 
+extern bool XLogFindBlockId(XLogReaderState *record,
+                                                       RelFileNode rnode,
+                                                       ForkNumber forknum,
+                                                       BlockNumber blockno,
+                                                       uint8 *block_id);
+
+extern XLogRedoAction XLogReadBufferForRedoBlock(XLogReaderState *record,
+                                                                                                RelFileNode rnode,
+                                                                                                ForkNumber forknum,
+                                                                                                BlockNumber blockno,
+                                                                                                ReadBufferMode mode,
+                                                                                                bool get_cleanup_lock,
+                                                                                                Buffer *buf);
+
 extern Relation CreateFakeRelcacheEntry(RelFileNode rnode);
 extern void FreeFakeRelcacheEntry(Relation fakerel);
 
diff --git a/src/include/catalog/database_internal.h b/src/include/catalog/database_internal.h
new file mode 100644 (file)
index 0000000..a2b3a12
--- /dev/null
@@ -0,0 +1,21 @@
+/*-------------------------------------------------------------------------
+ *
+ * database_internal.h
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/database_internal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DATABASE_INTERNAL_H
+#define DATABASE_INTERNAL_H
+
+/*
+ * We use this header to define the OIDs of pseudo-databases used in
+ * buffer tags to hold system data.
+ */
+#define UndoDbOid 9
+
+#endif                                                 /* DATABASE_INTERNAL_H */
index 0902dce5f140c963deee8671dc5aa70b6f44f07b..de475cb2d4c2d72a7f1e3a46d48eb9e6cf21fa14 100644 (file)
   proname => 'pg_partition_root', prorettype => 'regclass',
   proargtypes => 'regclass', prosrc => 'pg_partition_root' },
 
+# undo logs
+{ oid => '5032', descr => 'list undo logs',
+  proname => 'pg_stat_get_undo_logs', procost => '1', prorows => '10', proretset => 't',
+  prorettype => 'record', proargtypes => '',
+  proallargtypes => '{oid,text,text,text,text,text,xid,int4,text}', proargmodes => '{o,o,o,o,o,o,o,o,o}',
+  proargnames => '{logno,category,tablespace,discard,insert,end,xid,pid,status}', prosrc => 'pg_stat_get_undo_logs' },
+
 ]
index 0a3ad3a1883b270ba8b44396a4121b7a154f6958..1936c5db6fe8c81e18aec82aa4571617b07faa5e 100644 (file)
@@ -934,6 +934,13 @@ typedef enum
        WAIT_EVENT_TWOPHASE_FILE_READ,
        WAIT_EVENT_TWOPHASE_FILE_SYNC,
        WAIT_EVENT_TWOPHASE_FILE_WRITE,
+       WAIT_EVENT_UNDO_CHECKPOINT_READ,
+       WAIT_EVENT_UNDO_CHECKPOINT_SYNC,
+       WAIT_EVENT_UNDO_CHECKPOINT_WRITE,
+       WAIT_EVENT_UNDO_FILE_READ,
+       WAIT_EVENT_UNDO_FILE_WRITE,
+       WAIT_EVENT_UNDO_FILE_FLUSH,
+       WAIT_EVENT_UNDO_FILE_SYNC,
        WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ,
        WAIT_EVENT_WAL_BOOTSTRAP_SYNC,
        WAIT_EVENT_WAL_BOOTSTRAP_WRITE,
index 509f4b7ef1c474fc259348dc4d05c69d239830ac..a04190aa92419e6fb5ef13481e13f2f35f947ba0 100644 (file)
@@ -37,8 +37,9 @@ typedef enum BufferAccessStrategyType
 typedef enum
 {
        RBM_NORMAL,                                     /* Normal read */
-       RBM_ZERO_AND_LOCK,                      /* Don't read from disk, caller will
-                                                                * initialize. Also locks the page. */
+       RBM_ZERO,                                       /* Don't read from disk, caller will
+                                                                * initialize. */
+       RBM_ZERO_AND_LOCK,                      /* Like RBM_ZERO, but also locks the page. */
        RBM_ZERO_AND_CLEANUP_LOCK,      /* Like RBM_ZERO_AND_LOCK, but locks the page
                                                                 * in "cleanup" mode */
        RBM_ZERO_ON_ERROR,                      /* Read, but return an all-zeros page on error */
@@ -170,7 +171,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
                                                                 BufferAccessStrategy strategy);
 extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
                                                                                ForkNumber forkNum, BlockNumber blockNum,
-                                                                               ReadBufferMode mode, BufferAccessStrategy strategy);
+                                                                               ReadBufferMode mode, BufferAccessStrategy strategy,
+                                                                               char relpersistence);
+extern void ForgetBuffer(RelFileNode rnode, ForkNumber forkNum,
+                                                BlockNumber blockNum);
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
@@ -227,6 +231,10 @@ extern void AtProcExit_LocalBuffers(void);
 
 extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 
+/* in localbuf.c */
+extern void ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum,
+                                                         BlockNumber blockNum);
+
 /* in freelist.c */
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
index d2a8c52044bb71b8008045f9b95c3f89f0cf45b2..b215201f30d773763c2554d10b540d520bd690e9 100644 (file)
@@ -143,6 +143,7 @@ extern int  pg_fsync_writethrough(int fd);
 extern int     pg_fdatasync(int fd);
 extern void pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(const char *fname, bool isdir);
+extern int     fsync_fname_ext(const char *fname, bool isdir, bool perm, int elevel);
 extern int     durable_rename(const char *oldfile, const char *newfile, int loglevel);
 extern int     durable_unlink(const char *fname, int loglevel);
 extern int     durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel);
index 08e0dc8144bb947d8b8f7f241b12638cc351d043..4abb344aec66747c073b409fadfd378611bd3a76 100644 (file)
@@ -220,7 +220,10 @@ typedef enum BuiltinTrancheIds
        LWTRANCHE_TBM,
        LWTRANCHE_PARALLEL_APPEND,
        LWTRANCHE_SXACT,
-       LWTRANCHE_FIRST_USER_DEFINED
+       LWTRANCHE_UNDOLOG,
+       LWTRANCHE_UNDODISCARD,
+       LWTRANCHE_REWIND,
+       LWTRANCHE_FIRST_USER_DEFINED,
 }                      BuiltinTrancheIds;
 
 /*
index d286c8c7b116a6bede7d3e4e7757ec6b99856883..0fd3b7505d14b05fa925cadbb124f3960af3842b 100644 (file)
@@ -70,6 +70,9 @@ typedef struct SMgrRelationData
        int                     md_num_open_segs[MAX_FORKNUM + 1];
        struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
 
+       /* For use by implementations. */
+       void       *private_data;
+
        /* if unowned, list link in list of all unowned SMgrRelations */
        dlist_node      node;
 } SMgrRelationData;
index 16428c5f5fb04a8ab70707140f1687e44fcd38cc..59f1da917881470c62b5f2a0fe3232510dbf614f 100644 (file)
@@ -34,7 +34,8 @@ typedef enum SyncRequestType
  */
 typedef enum SyncRequestHandler
 {
-       SYNC_HANDLER_MD = 0                     /* md smgr */
+       SYNC_HANDLER_MD = 0,            /* md smgr */
+       SYNC_HANDLER_UNDO = 1           /* undo smgr */
 } SyncRequestHandler;
 
 /*
diff --git a/src/include/storage/undofile.h b/src/include/storage/undofile.h
new file mode 100644 (file)
index 0000000..a5ec30f
--- /dev/null
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * undofile.h
+ *       undo storage manager public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/undofile.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef UNDOFILE_H
+#define UNDOFILE_H
+
+#include "access/undolog.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+
+extern void undofile_init(void);
+extern void undofile_shutdown(void);
+extern void undofile_open(SMgrRelation reln);
+extern void undofile_close(SMgrRelation reln, ForkNumber forknum);
+extern void undofile_create(SMgrRelation reln, ForkNumber forknum,
+                                                       bool isRedo);
+extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum);
+extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum,
+                                                       bool isRedo);
+extern void undofile_extend(SMgrRelation reln, ForkNumber forknum,
+                BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum,
+                  BlockNumber blocknum);
+extern void undofile_read(SMgrRelation reln, ForkNumber forknum,
+                                                 BlockNumber blocknum, char *buffer);
+extern void undofile_write(SMgrRelation reln, ForkNumber forknum,
+               BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum,
+                       BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum);
+extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum,
+                  BlockNumber nblocks);
+extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum);
+
+/* Callbacks used by sync.c. */
+extern int undofile_syncfiletag(const FileTag *tag, char *path);
+extern bool undofile_filetagmatches(const FileTag *tag, const FileTag *candidate);
+
+/* Management of checkpointer requests. */
+extern void undofile_request_sync(UndoLogNumber logno, BlockNumber segno,
+                                                                 Oid tablespace);
+extern void undofile_forget_sync(UndoLogNumber logno, BlockNumber segno,
+                                                                Oid tablespace);
+extern void undofile_forget_sync_tablespace(Oid tablespace);
+extern void undofile_request_sync_dir(Oid tablespace);
+
+#endif                                                 /* UNDOFILE_H */
index e709177c37698409d212c1dc1029abe409cddc7f..84639eed2256fa0385aff9f78fcc785c56048c09 100644 (file)
@@ -431,6 +431,8 @@ extern void GUC_check_errcode(int sqlerrcode);
 extern bool check_default_tablespace(char **newval, void **extra, GucSource source);
 extern bool check_temp_tablespaces(char **newval, void **extra, GucSource source);
 extern void assign_temp_tablespaces(const char *newval, void *extra);
+extern bool check_undo_tablespaces(char **newval, void **extra, GucSource source);
+extern void assign_undo_tablespaces(const char *newval, void *extra);
 
 /* in catalog/namespace.c */
 extern bool check_search_path(char **newval, void **extra, GucSource source);
index 210e9cd146cf3a9b9d54fb9b06abbbd365356b5b..7098461ddeee4ba52002c8933c8e1f6e12ee4745 100644 (file)
@@ -2010,6 +2010,16 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid,
     pg_stat_all_tables.autoanalyze_count
    FROM pg_stat_all_tables
   WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
+pg_stat_undo_logs| SELECT pg_stat_get_undo_logs.logno,
+    pg_stat_get_undo_logs.category,
+    pg_stat_get_undo_logs.tablespace,
+    pg_stat_get_undo_logs.discard,
+    pg_stat_get_undo_logs.insert,
+    pg_stat_get_undo_logs."end",
+    pg_stat_get_undo_logs.xid,
+    pg_stat_get_undo_logs.pid,
+    pg_stat_get_undo_logs.status
+   FROM pg_stat_get_undo_logs() pg_stat_get_undo_logs(logno, category, tablespace, discard, insert, "end", xid, pid, status);
 pg_stat_user_functions| SELECT p.oid AS funcid,
     n.nspname AS schemaname,
     p.proname AS funcname,