Prevent re-use of a deleted relation's relfilenode until after the next

author Tom Lane <[email protected]>

Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)

committer Tom Lane <[email protected]>

Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)
author Tom Lane <[email protected]>
Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)
committer Tom Lane <[email protected]>
Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 6d4dc6940a55cd6660456e1210350f537bd0ed84..36adc20848e98e7759f28099e8dc9baf564f3436 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.286 2007/10/12 19:39:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.287 2007/11/15 20:36:40 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -45,6 +45,7 @@
  #include "storage/fd.h"
  #include "storage/pmsignal.h"
  #include "storage/procarray.h"
+#include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/builtins.h"
  #include "utils/pg_locale.h"
@@ -5663,6 +5664,14 @@ CreateCheckPoint(int flags)
         UpdateControlFile();
     }
  
+   /*
+    * Let smgr prepare for checkpoint; this has to happen before we
+    * determine the REDO pointer.  Note that smgr must not do anything
+    * that'd have to be undone if we decide no checkpoint is needed.
+    */
+   smgrpreckpt();
+
+   /* Begin filling in the checkpoint WAL record */
     MemSet(&checkPoint, 0, sizeof(checkPoint));
     checkPoint.ThisTimeLineID = ThisTimeLineID;
     checkPoint.time = time(NULL);
@@ -5886,6 +5895,11 @@ CreateCheckPoint(int flags)
      */
     END_CRIT_SECTION();
  
+   /*
+    * Let smgr do post-checkpoint cleanup (eg, deleting old files).
+    */
+   smgrpostckpt();
+
     /*
      * Delete old log files (those no longer needed even for previous
      * checkpoint).
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c

index f19e237315e447e869bb7e141cd09f9b6c6149b5..305da59da000af2844e9f7897ea0910704a8aea0 100644 (file)
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -37,7 +37,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -57,6 +57,7 @@
  #include "commands/comment.h"
  #include "commands/tablespace.h"
  #include "miscadmin.h"
+#include "postmaster/bgwriter.h"
  #include "storage/fd.h"
  #include "utils/acl.h"
  #include "utils/builtins.h"
@@ -460,13 +461,29 @@ DropTableSpace(DropTableSpaceStmt *stmt)
     LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
  
     /*
-    * Try to remove the physical infrastructure
+    * Try to remove the physical infrastructure. 
      */
     if (!remove_tablespace_directories(tablespaceoid, false))
-       ereport(ERROR,
-               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-                errmsg("tablespace \"%s\" is not empty",
-                       tablespacename)));
+   {
+       /*
+        * Not all files deleted?  However, there can be lingering empty files
+        * in the directories, left behind by for example DROP TABLE, that
+        * have been scheduled for deletion at next checkpoint (see comments
+        * in mdunlink() for details).  We could just delete them immediately,
+        * but we can't tell them apart from important data files that we
+        * mustn't delete.  So instead, we force a checkpoint which will clean
+        * out any lingering files, and try again.
+        */
+       RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
+       if (!remove_tablespace_directories(tablespaceoid, false))
+       {
+           /* Still not empty, the files must be important then */
+           ereport(ERROR,
+                   (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                    errmsg("tablespace \"%s\" is not empty",
+                           tablespacename)));
+       }
+   }
  
     /* Record the filesystem change in XLOG */
     {
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c

index 251811421e0af34e919de0672e3259a5d1b38ffe..59d39117f3f2440fc3865f70029e33dfea622f93 100644 (file)
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -34,6 +34,7 @@
  /* special values for the segno arg to RememberFsyncRequest */
  #define FORGET_RELATION_FSYNC  (InvalidBlockNumber)
  #define FORGET_DATABASE_FSYNC  (InvalidBlockNumber-1)
+#define UNLINK_RELATION_REQUEST    (InvalidBlockNumber-2)
  
  /*
   * On Windows, we have to interpret EACCES as possibly meaning the same as
@@ -113,6 +114,10 @@ static MemoryContext MdCxt;        /* context for all md.c allocations */
   * table remembers the pending operations. We use a hash table mostly as
   * a convenient way of eliminating duplicate requests.
   *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
   * (Regular backends do not track pending operations locally, but forward
   * them to the bgwriter.)
   */
@@ -131,9 +136,17 @@ typedef struct
     CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr when request was made */
  } PendingOperationEntry;
  
+typedef struct
+{
+   RelFileNode rnode;          /* the dead relation to delete */
+   CycleCtr cycle_ctr;         /* mdckpt_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
  static HTAB *pendingOpsTable = NULL;
+static List *pendingUnlinks = NIL;
  
  static CycleCtr mdsync_cycle_ctr = 0;
+static CycleCtr mdckpt_cycle_ctr = 0;
  
  
  typedef enum                   /* behavior for mdopen & _mdfd_getseg */
@@ -146,6 +159,7 @@ typedef enum                    /* behavior for mdopen & _mdfd_getseg */
  /* local routines */
  static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
  static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+static void register_unlink(RelFileNode rnode);
  static MdfdVec *_fdvec_alloc(void);
  
  #ifndef LET_OS_MANAGE_FILESIZE
@@ -188,6 +202,7 @@ mdinit(void)
                                       100L,
                                       &hash_ctl,
                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+       pendingUnlinks = NIL;
     }
  }
  
@@ -254,14 +269,37 @@ mdcreate(SMgrRelation reln, bool isRedo)
   * Note that we're passed a RelFileNode --- by the time this is called,
   * there won't be an SMgrRelation hashtable entry anymore.
   *
+ * Actually, we don't unlink the first segment file of the relation, but
+ * just truncate it to zero length, and record a request to unlink it after
+ * the next checkpoint.  Additional segments can be unlinked immediately,
+ * however.  Leaving the empty file in place prevents that relfilenode
+ * number from being reused.  The scenario this protects us from is:
+ * 1. We delete a relation (and commit, and actually remove its file).
+ * 2. We create a new relation, which by chance gets the same relfilenode as
+ *    the just-deleted one (OIDs must've wrapped around for that to happen).
+ * 3. We crash before another checkpoint occurs.
+ * During replay, we would delete the file and then recreate it, which is fine
+ * if the contents of the file were repopulated by subsequent WAL entries.
+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),
+ * the contents of the file would be lost forever.  By leaving the empty file
+ * until after the next checkpoint, we prevent reassignment of the relfilenode
+ * number until it's safe, because relfilenode assignment skips over any
+ * existing file.
+ *
   * If isRedo is true, it's okay for the relation to be already gone.
- * Also, any failure should be reported as WARNING not ERROR, because
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
   * we are usually not in a transaction anymore when this is called.
   */
  void
  mdunlink(RelFileNode rnode, bool isRedo)
  {
     char       *path;
+   int ret;
  
     /*
      * We have to clean out any pending fsync requests for the doomed relation,
@@ -271,8 +309,15 @@ mdunlink(RelFileNode rnode, bool isRedo)
  
     path = relpath(rnode);
  
-   /* Delete the first segment, or only segment if not doing segmenting */
-   if (unlink(path) < 0)
+   /*
+    * Delete or truncate the first segment, or only segment if not doing
+    * segmenting
+    */
+   if (isRedo)
+       ret = unlink(path);
+   else
+       ret = truncate(path, 0);
+   if (ret < 0)
     {
         if (!isRedo || errno != ENOENT)
             ereport(WARNING,
@@ -316,6 +361,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
  #endif
  
     pfree(path);
+
+   /* Register request to unlink first segment later */
+   if (!isRedo)
+       register_unlink(rnode);
  }
  
  /*
@@ -1063,6 +1112,91 @@ mdsync(void)
     mdsync_in_progress = false;
  }
  
+/*
+ * mdpreckpt() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+mdpreckpt(void)
+{
+   ListCell *cell;
+
+   /*
+    * In case the prior checkpoint wasn't completed, stamp all entries in
+    * the list with the current cycle counter.  Anything that's in the
+    * list at the start of checkpoint can surely be deleted after the
+    * checkpoint is finished, regardless of when the request was made.
+    */
+   foreach(cell, pendingUnlinks)
+   {
+       PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+       entry->cycle_ctr = mdckpt_cycle_ctr;
+   }
+
+   /*
+    * Any unlink requests arriving after this point will be assigned the
+    * next cycle counter, and won't be unlinked until next checkpoint.
+    */
+   mdckpt_cycle_ctr++;
+}
+
+/*
+ * mdpostckpt() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+mdpostckpt(void)
+{
+   while (pendingUnlinks != NIL)
+   {
+       PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+       char *path;
+
+       /*
+        * New entries are appended to the end, so if the entry is new
+        * we've reached the end of old entries.
+        */
+       if (entry->cycle_ctr == mdsync_cycle_ctr)
+           break;
+
+       /* Else assert we haven't missed it */
+       Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
+
+       /* Unlink the file */
+       path = relpath(entry->rnode);
+       if (unlink(path) < 0)
+       {
+           /*
+            * ENOENT shouldn't happen either, but it doesn't really matter
+            * because we would've deleted it now anyway.
+            */
+           if (errno != ENOENT)
+               ereport(WARNING,
+                       (errcode_for_file_access(),
+                        errmsg("could not remove relation %u/%u/%u: %m",
+                               entry->rnode.spcNode,
+                               entry->rnode.dbNode,
+                               entry->rnode.relNode)));
+       }
+       pfree(path);
+
+       pendingUnlinks = list_delete_first(pendingUnlinks);
+       pfree(entry);
+   }
+}
+
  /*
   * register_dirty_segment() -- Mark a relation segment as needing fsync
   *
@@ -1096,19 +1230,53 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
     }
  }
  
+/*
+ * register_unlink() -- Schedule a file to be deleted after next checkpoint
+ *
+ * As with register_dirty_segment, this could involve either a local or
+ * a remote pending-ops table.
+ */
+static void
+register_unlink(RelFileNode rnode)
+{
+   if (pendingOpsTable)
+   {
+       /* push it into local pending-ops table */
+       RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+   }
+   else
+   {
+       /*
+        * Notify the bgwriter about it.  If we fail to queue the request
+        * message, we have to sleep and try again, because we can't simply
+        * delete the file now.  Ugly, but hopefully won't happen often.
+        *
+        * XXX should we just leave the file orphaned instead?
+        */
+       Assert(IsUnderPostmaster);
+       while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+           pg_usleep(10000L);  /* 10 msec seems a good number */
+   }
+}
+
  /*
   * RememberFsyncRequest() -- callback from bgwriter side of fsync request
   *
- * We stuff the fsync request into the local hash table for execution
- * during the bgwriter's next checkpoint.
+ * We stuff most fsync requests into the local hash table for execution
+ * during the bgwriter's next checkpoint.  UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
- * a whole database.  (These are a tad slow because the hash table has to be
- * searched linearly, but it doesn't seem worth rethinking the table structure
- * for them.)
+ * We define three:
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
+ *   checkpoint.
+ *
+ * (Handling the FORGET_* requests is a tad slow because the hash table has
+ * to be searched linearly, but it doesn't seem worth rethinking the table
+ * structure for them.)
   */
  void
  RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
@@ -1147,6 +1315,20 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
             }
         }
     }
+   else if (segno == UNLINK_RELATION_REQUEST)
+   {
+       /* Unlink request: put it in the linked list */
+       MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
+       PendingUnlinkEntry *entry;
+
+       entry = palloc(sizeof(PendingUnlinkEntry));
+       entry->rnode = rnode;
+       entry->cycle_ctr = mdckpt_cycle_ctr;
+
+       pendingUnlinks = lappend(pendingUnlinks, entry);
+
+       MemoryContextSwitchTo(oldcxt);
+   }
     else
     {
         /* Normal case: enter a request to fsync this segment */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c

index 22ac13146c8fb1b00d2fd1ea53873d01f2b04200..6b13483a6d5c9321f5e8530b1eb0caf0d6fe4b98 100644 (file)
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.106 2007/09/05 18:10:48 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.107 2007/11/15 20:36:40 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -55,9 +55,11 @@ typedef struct f_smgr
     void        (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
                                   bool isTemp);
     void        (*smgr_immedsync) (SMgrRelation reln);
-   void        (*smgr_commit) (void);  /* may be NULL */
-   void        (*smgr_abort) (void);   /* may be NULL */
-   void        (*smgr_sync) (void);    /* may be NULL */
+   void        (*smgr_commit) (void);      /* may be NULL */
+   void        (*smgr_abort) (void);       /* may be NULL */
+   void        (*smgr_pre_ckpt) (void);    /* may be NULL */
+   void        (*smgr_sync) (void);        /* may be NULL */
+   void        (*smgr_post_ckpt) (void);   /* may be NULL */
  } f_smgr;
  
  
@@ -65,7 +67,7 @@ static const f_smgr smgrsw[] = {
     /* magnetic disk */
     {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
         mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
-       NULL, NULL, mdsync
+       NULL, NULL, mdpreckpt, mdsync, mdpostckpt
     }
  };
  
@@ -778,7 +780,22 @@ smgrabort(void)
  }
  
  /*
- * smgrsync() -- Sync files to disk at checkpoint time.
+ * smgrpreckpt() -- Prepare for checkpoint.
+ */
+void
+smgrpreckpt(void)
+{
+   int         i;
+
+   for (i = 0; i < NSmgr; i++)
+   {
+       if (smgrsw[i].smgr_pre_ckpt)
+           (*(smgrsw[i].smgr_pre_ckpt)) ();
+   }
+}
+
+/*
+ * smgrsync() -- Sync files to disk during checkpoint.
   */
  void
  smgrsync(void)
@@ -792,6 +809,21 @@ smgrsync(void)
     }
  }
  
+/*
+ * smgrpostckpt() -- Post-checkpoint cleanup.
+ */
+void
+smgrpostckpt(void)
+{
+   int         i;
+
+   for (i = 0; i < NSmgr; i++)
+   {
+       if (smgrsw[i].smgr_post_ckpt)
+           (*(smgrsw[i].smgr_post_ckpt)) ();
+   }
+}
+
  
  void
  smgr_redo(XLogRecPtr lsn, XLogRecord *record)
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h

index bc071e7ef052d85c8aed73ac8c288c89800e0fd3..87b0171a1b8b5c874b202cafd768e2f485d81d8a 100644 (file)
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.59 2007/09/05 18:10:48 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.60 2007/11/15 20:36:40 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -83,7 +83,9 @@ extern void AtSubAbort_smgr(void);
  extern void PostPrepare_smgr(void);
  extern void smgrcommit(void);
  extern void smgrabort(void);
+extern void smgrpreckpt(void);
  extern void smgrsync(void);
+extern void smgrpostckpt(void);
  
  extern void smgr_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void smgr_desc(StringInfo buf, uint8 xl_info, char *rec);
@@ -104,7 +106,9 @@ extern void mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer,
  extern BlockNumber mdnblocks(SMgrRelation reln);
  extern void mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp);
  extern void mdimmedsync(SMgrRelation reln);
+extern void mdpreckpt(void);
  extern void mdsync(void);
+extern void mdpostckpt(void);
  
  extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
  extern void ForgetRelationFsyncRequests(RelFileNode rnode);
author	Tom Lane <[email protected]>
	Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)
committer	Tom Lane <[email protected]>
	Thu, 15 Nov 2007 20:36:40 +0000 (20:36 +0000)
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/commands/tablespace.c		patch \| blob \| blame \| history
src/backend/storage/smgr/md.c		patch \| blob \| blame \| history
src/backend/storage/smgr/smgr.c		patch \| blob \| blame \| history
src/include/storage/smgr.h		patch \| blob \| blame \| history