Allow on-line enabling and disabling of data checksums
authorMagnus Hagander <[email protected]>
Thu, 5 Apr 2018 19:57:26 +0000 (21:57 +0200)
committerMagnus Hagander <[email protected]>
Thu, 5 Apr 2018 20:04:48 +0000 (22:04 +0200)
This makes it possible to turn checksums on in a live cluster, without
the previous need for dump/reload or logical replication (and to turn it
off).

Enabling checksums starts a background process in the form of a
launcher/worker combination that goes through the entire database and
recalculates checksums on each and every page. Only when all pages have
been checksummed are they fully enabled in the cluster. Any failure of
the process will revert to checksums off and the process has to be
restarted.

This adds a new WAL record that indicates the state of checksums, so
the process works across replicated clusters.

Authors: Magnus Hagander and Daniel Gustafsson
Review: Tomas Vondra, Michael Banck, Heikki Linnakangas, Andrey Borodin

45 files changed:
doc/src/sgml/func.sgml
doc/src/sgml/ref/allfiles.sgml
doc/src/sgml/ref/initdb.sgml
doc/src/sgml/ref/pg_verify_checksums.sgml [new file with mode: 0644]
doc/src/sgml/reference.sgml
doc/src/sgml/wal.sgml
src/backend/access/rmgrdesc/xlogdesc.c
src/backend/access/transam/xlog.c
src/backend/access/transam/xlogfuncs.c
src/backend/catalog/system_views.sql
src/backend/postmaster/Makefile
src/backend/postmaster/bgworker.c
src/backend/postmaster/checksumhelper.c [new file with mode: 0644]
src/backend/postmaster/pgstat.c
src/backend/replication/basebackup.c
src/backend/replication/logical/decode.c
src/backend/storage/ipc/ipci.c
src/backend/storage/page/README
src/backend/storage/page/bufpage.c
src/backend/utils/misc/guc.c
src/bin/Makefile
src/bin/pg_upgrade/controldata.c
src/bin/pg_upgrade/pg_upgrade.h
src/bin/pg_verify_checksums/.gitignore [new file with mode: 0644]
src/bin/pg_verify_checksums/Makefile [new file with mode: 0644]
src/bin/pg_verify_checksums/pg_verify_checksums.c [new file with mode: 0644]
src/include/access/xlog.h
src/include/access/xlog_internal.h
src/include/catalog/catversion.h
src/include/catalog/pg_control.h
src/include/catalog/pg_proc.h
src/include/pgstat.h
src/include/postmaster/checksumhelper.h [new file with mode: 0644]
src/include/storage/bufpage.h
src/include/storage/checksum.h
src/test/Makefile
src/test/checksum/.gitignore [new file with mode: 0644]
src/test/checksum/Makefile [new file with mode: 0644]
src/test/checksum/README [new file with mode: 0644]
src/test/checksum/t/001_standby_checksum.pl [new file with mode: 0644]
src/test/isolation/expected/checksum_cancel.out [new file with mode: 0644]
src/test/isolation/expected/checksum_enable.out [new file with mode: 0644]
src/test/isolation/isolation_schedule
src/test/isolation/specs/checksum_cancel.spec [new file with mode: 0644]
src/test/isolation/specs/checksum_enable.spec [new file with mode: 0644]

index 122f034f17763c2f90a3a0b44ecf2ee13cc365e5..6257563eaad39636578d4c0f7a6a05c5f4790887 100644 (file)
@@ -19540,6 +19540,71 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup());
 
   </sect2>
 
+  <sect2 id="functions-admin-checksum">
+   <title>Data Checksum Functions</title>
+
+   <para>
+    The functions shown in <xref linkend="functions-checksums-table" /> can
+    be used to enable or disable data checksums in a running cluster.
+    See <xref linkend="checksums" /> for details.
+   </para>
+
+   <table id="functions-checksums-table">
+    <title>Checksum <acronym>SQL</acronym> Functions</title>
+    <tgroup cols="3">
+     <thead>
+      <row>
+       <entry>Function</entry>
+       <entry>Return Type</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+     <tbody>
+      <row>
+       <entry>
+        <indexterm>
+         <primary>pg_enable_data_checksums</primary>
+        </indexterm>
+        <literal><function>pg_enable_data_checksums(<optional><parameter>cost_delay</parameter> <type>int</type>, <parameter>cost_limit</parameter> <type>int</type></optional>)</function></literal>
+       </entry>
+       <entry>
+        void
+       </entry>
+       <entry>
+        <para>
+         Initiates data checksums for the cluster. This will switch the data checksums mode
+         to <literal>inprogress</literal> and start a background worker that will process
+         all data in the database and enable checksums for it. When all data pages have had
+         checksums enabled, the cluster will automatically switch to checksums
+         <literal>on</literal>.
+        </para>
+        <para>
+         If <parameter>cost_delay</parameter> and <parameter>cost_limit</parameter> are
+         specified, the speed of the process is throttled using the same principles as
+         <link linkend="runtime-config-resource-vacuum-cost">Cost-based Vacuum Delay</link>.
+        </para>
+       </entry>
+      </row>
+      <row>
+       <entry>
+        <indexterm>
+         <primary>pg_disable_data_checksums</primary>
+        </indexterm>
+        <literal><function>pg_disable_data_checksums()</function></literal>
+       </entry>
+       <entry>
+        void
+       </entry>
+       <entry>
+        Disables data checksums for the cluster.
+       </entry>
+      </row>
+     </tbody>
+    </tgroup>
+   </table>
+
+  </sect2>
+
   <sect2 id="functions-admin-dbobject">
    <title>Database Object Management Functions</title>
 
index 4e01e5641cfcb42b204e60e96838059fdef5c409..7cd6ee85dc92f018f33595e7186860f42b046478 100644 (file)
@@ -211,6 +211,7 @@ Complete list of usable sgml source files in this directory.
 <!ENTITY pgResetwal         SYSTEM "pg_resetwal.sgml">
 <!ENTITY pgRestore          SYSTEM "pg_restore.sgml">
 <!ENTITY pgRewind           SYSTEM "pg_rewind.sgml">
+<!ENTITY pgVerifyChecksums  SYSTEM "pg_verify_checksums.sgml">
 <!ENTITY pgtestfsync        SYSTEM "pgtestfsync.sgml">
 <!ENTITY pgtesttiming       SYSTEM "pgtesttiming.sgml">
 <!ENTITY pgupgrade          SYSTEM "pgupgrade.sgml">
index 949b5a220f588db74dd4cc836f610e234eb03a12..826dd91f7290b30c8552096404339b644a07d9fb 100644 (file)
@@ -195,9 +195,9 @@ PostgreSQL documentation
        <para>
         Use checksums on data pages to help detect corruption by the
         I/O system that would otherwise be silent. Enabling checksums
-        may incur a noticeable performance penalty. This option can only
-        be set during initialization, and cannot be changed later. If
-        set, checksums are calculated for all objects, in all databases.
+        may incur a noticeable performance penalty. If set, checksums
+        are calculated for all objects, in all databases. See
+        <xref linkend="checksums" /> for details.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/ref/pg_verify_checksums.sgml b/doc/src/sgml/ref/pg_verify_checksums.sgml
new file mode 100644 (file)
index 0000000..463ecd5
--- /dev/null
@@ -0,0 +1,112 @@
+<!--
+doc/src/sgml/ref/pg_verify_checksums.sgml
+PostgreSQL documentation
+-->
+
+<refentry id="pgverifychecksums">
+ <indexterm zone="pgverifychecksums">
+  <primary>pg_verify_checksums</primary>
+ </indexterm>
+
+ <refmeta>
+  <refentrytitle><application>pg_verify_checksums</application></refentrytitle>
+  <manvolnum>1</manvolnum>
+  <refmiscinfo>Application</refmiscinfo>
+ </refmeta>
+
+ <refnamediv>
+  <refname>pg_verify_checksums</refname>
+  <refpurpose>verify data checksums in an offline <productname>PostgreSQL</productname> database cluster</refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+  <cmdsynopsis>
+   <command>pg_verify_checksums</command>
+   <arg choice="opt"><replaceable class="parameter">option</replaceable></arg>
+   <arg choice="opt"><arg choice="opt"><option>-D</option></arg> <replaceable class="parameter">datadir</replaceable></arg>
+  </cmdsynopsis>
+ </refsynopsisdiv>
+
+ <refsect1 id="r1-app-pg_verify_checksums-1">
+  <title>Description</title>
+  <para>
+   <command>pg_verify_checksums</command> verifies data checksums in a PostgreSQL
+   cluster. It must be run against a cluster that's offline.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>Options</title>
+
+   <para>
+    The following command-line options are available:
+
+    <variablelist>
+
+     <varlistentry>
+      <term><option>-r <replaceable>relfilenode</replaceable></option></term>
+      <listitem>
+       <para>
+        Only validate checksums in the relation with the specified relfilenode.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>-f</option></term>
+      <listitem>
+       <para>
+        Force the check even if checksums are disabled on the cluster.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>-d</option></term>
+      <listitem>
+       <para>
+        Enable debug output. Lists all checked blocks and their checksum.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+       <term><option>-V</option></term>
+       <term><option>--version</option></term>
+       <listitem>
+       <para>
+       Print the <application>pg_verify_checksums</application> version and exit.
+       </para>
+       </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>-?</option></term>
+      <term><option>--help</option></term>
+       <listitem>
+        <para>
+         Show help about <application>pg_verify_checksums</application> command line
+         arguments, and exit.
+        </para>
+       </listitem>
+      </varlistentry>
+    </variablelist>
+   </para>
+ </refsect1>
+
+ <refsect1>
+  <title>Notes</title>
+  <para>
+    <command>pg_verify_checksums</command> can only be run when the server is offline.
+  </para>
+ </refsect1>
+
+ <refsect1>
+  <title>See Also</title>
+
+  <simplelist type="inline">
+   <member><xref linkend="checksums"/></member>
+  </simplelist>
+ </refsect1>
+
+</refentry>
index ef2270c46730caf9ad99c406bb73b4f6aa57a3f4..78c214f1b089707ed93c94033e721f4aae161d90 100644 (file)
    &pgtestfsync;
    &pgtesttiming;
    &pgupgrade;
+   &pgVerifyChecksums;
    &pgwaldump;
    &postgres;
    &postmaster;
index f4bc2d4161e2599117bd7032e72228db828f3a6f..6249cb413617574768ca3ef3c74ad9ae92c7076a 100644 (file)
   </para>
  </sect1>
 
+ <sect1 id="checksums">
+  <title>Data checksums</title>
+  <indexterm>
+   <primary>checksums</primary>
+  </indexterm>
+
+  <para>
+   Data pages are not checksum protected by default, but this can optionally be enabled for a cluster.
+   When enabled, each data page will be assigned a checksum that is updated when the page is
+   written and verified every time the page is read. Only data pages are protected by checksums;
+   internal data structures and temporary files are not.
+  </para>
+
+  <para>
+   Checksums are normally enabled when the cluster is initialized using
+   <link linkend="app-initdb-data-checksums"><application>initdb</application></link>. They
+   can also be enabled or disabled at runtime. In all cases, checksums are enabled or disabled
+   at the full cluster level, and cannot be specified individually for databases or tables.
+  </para>
+
+  <para>
+   The current state of checksums in the cluster can be verified by viewing the value
+   of the read-only configuration variable <xref linkend="guc-data-checksums" /> by
+   issuing the command <command>SHOW data_checksums</command>.
+  </para>
+
+  <para>
+   When attempting to recover from corrupt data it may be necessary to bypass the checksum
+   protection in order to recover data. To do this, temporarily set the configuration parameter
+   <xref linkend="guc-ignore-checksum-failure" />.
+  </para>
+
+  <sect2 id="checksums-enable-disable">
+   <title>On-line enabling of checksums</title>
+
+   <para>
+    Checksums can be enabled or disabled online, by calling the appropriate
+    <link linkend="functions-admin-checksum">functions</link>.
+    Disabling of checksums takes effect immediately when the function is called.
+   </para>
+
+   <para>
+    Enabling checksums will put the cluster in <literal>inprogress</literal> mode.
+    During this time, checksums will be written but not verified. In addition to
+    this, a background worker process is started that enables checksums on all
+    existing data in the cluster. Once this worker has completed processing all
+    databases in the cluster, the checksum mode will automatically switch to
+    <literal>on</literal>.
+   </para>
+
+   <para>
+    The process will initially wait for all open transactions to finish before
+    it starts, so that it can be certain that there are no tables that have been
+    created inside a transaction that has not committed yet and thus would not
+    be visible to the process enabling checksums. It will also, for each database,
+    wait for all pre-existing temporary tables to get removed before it finishes.
+    If long-lived temporary tables are used in the application it may be necessary
+    to terminate these application connections to allow the process to complete.
+    Information about open transactions and connections with temporary tables is
+    written to the server log.
+   </para>
+
+   <para>
+    If the cluster is stopped while in <literal>inprogress</literal> mode, for
+    any reason, then this process must be restarted manually. To do this,
+    re-execute the function <function>pg_enable_data_checksums()</function>
+    once the cluster has been restarted. It is not possible to resume the work;
+    the process has to start over and re-process the cluster.
+   </para>
+
+   <note>
+    <para>
+     Enabling checksums can cause significant I/O to the system, as most of the
+     database pages will need to be rewritten, and will be written both to the
+     data files and the WAL.
+    </para>
+   </note>
+
+  </sect2>
+ </sect1>
+
   <sect1 id="wal-intro">
    <title>Write-Ahead Logging (<acronym>WAL</acronym>)</title>
 
index 00741c7b09ecd1a68a348f7913cb16bc6e6c3654..a31f8b806a8d0bbf095e58621cb3464d9272a78a 100644 (file)
@@ -17,6 +17,7 @@
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
+#include "storage/bufpage.h"
 #include "utils/guc.h"
 #include "utils/timestamp.h"
 
@@ -137,6 +138,18 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
                         xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
                         timestamptz_to_str(xlrec.end_time));
    }
+   else if (info == XLOG_CHECKSUMS)
+   {
+       xl_checksum_state xlrec;
+
+       memcpy(&xlrec, rec, sizeof(xl_checksum_state));
+       if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_VERSION)
+           appendStringInfo(buf, "on");
+       else if (xlrec.new_checksumtype == PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+           appendStringInfo(buf, "inprogress");
+       else
+           appendStringInfo(buf, "off");
+   }
 }
 
 const char *
@@ -182,6 +195,9 @@ xlog_identify(uint8 info)
        case XLOG_FPI_FOR_HINT:
            id = "FPI_FOR_HINT";
            break;
+       case XLOG_CHECKSUMS:
+           id = "CHECKSUMS";
+           break;
    }
 
    return id;
index b4fd8395b726ca210db21497e1286b708b7caf5d..813b2afaac2e8490bc67c61ef503280d1406847f 100644 (file)
@@ -856,6 +856,7 @@ static void SetLatestXTime(TimestampTz xtime);
 static void SetCurrentChunkStartTime(TimestampTz xtime);
 static void CheckRequiredParameterValues(void);
 static void XLogReportParameters(void);
+static void XlogChecksums(ChecksumType new_type);
 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
                    TimeLineID prevTLI);
 static void LocalSetXLogInsertAllowed(void);
@@ -1033,7 +1034,7 @@ XLogInsertRecord(XLogRecData *rdata,
        Assert(RedoRecPtr < Insert->RedoRecPtr);
        RedoRecPtr = Insert->RedoRecPtr;
    }
-   doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
+   doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites || DataChecksumsInProgress());
 
    if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites)
    {
@@ -4673,10 +4674,6 @@ ReadControlFile(void)
        (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
 
    CalculateCheckpointSegments();
-
-   /* Make the initdb settings visible as GUC variables, too */
-   SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
-                   PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
 void
@@ -4748,12 +4745,90 @@ GetMockAuthenticationNonce(void)
  * Are checksums enabled for data pages?
  */
 bool
-DataChecksumsEnabled(void)
+DataChecksumsNeedWrite(void)
 {
    Assert(ControlFile != NULL);
    return (ControlFile->data_checksum_version > 0);
 }
 
+/*
+ * Should checksums be verified when data pages are read?
+ */
+bool
+DataChecksumsNeedVerify(void)
+{
+   Assert(ControlFile != NULL);
+
+   /*
+    * Verification requires checksums to be fully enabled in the cluster.
+    * While in the inprogress state checksums are only updated, they are
+    * not verified.
+    */
+   if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION)
+       return false;
+
+   return true;
+}
+
+/*
+ * Is the cluster currently in the inprogress checksum state?
+ */
+bool
+DataChecksumsInProgress(void)
+{
+   Assert(ControlFile != NULL);
+
+   if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+       return false;
+
+   return true;
+}
+
+/*
+ * Put the cluster into the inprogress checksum state.
+ *
+ * This is a no-op when the control file already records a checksum version
+ * greater than zero.  Otherwise the new state is written to WAL and then
+ * stored in the control file under ControlFileLock.
+ */
+void
+SetDataChecksumsInProgress(void)
+{
+   Assert(ControlFile != NULL);
+
+   if (!(ControlFile->data_checksum_version > 0))
+   {
+       /* WAL-log the state change first, then persist it */
+       XlogChecksums(PG_DATA_CHECKSUM_INPROGRESS_VERSION);
+
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+       ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_VERSION;
+       UpdateControlFile();
+       LWLockRelease(ControlFileLock);
+   }
+}
+
+/*
+ * Switch the cluster from the inprogress state to checksums fully on.
+ *
+ * Raises an error unless the control file currently records the inprogress
+ * state.  The control file is updated under ControlFileLock, and the new
+ * state is written to WAL afterwards.
+ */
+void
+SetDataChecksumsOn(void)
+{
+   Assert(ControlFile != NULL);
+
+   LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+   if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+   {
+       /* Drop the lock before reporting the error */
+       LWLockRelease(ControlFileLock);
+       elog(ERROR, "Checksums not in inprogress mode");
+   }
+
+   ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
+   UpdateControlFile();
+   LWLockRelease(ControlFileLock);
+
+   XlogChecksums(PG_DATA_CHECKSUM_VERSION);
+}
+
+/*
+ * Turn checksums off for the cluster.
+ *
+ * Unconditionally stores checksum version 0 in the control file under
+ * ControlFileLock, and then writes the new state to WAL.
+ */
+void
+SetDataChecksumsOff(void)
+{
+   LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+   ControlFile->data_checksum_version = 0;
+   UpdateControlFile();
+   LWLockRelease(ControlFileLock);
+
+   XlogChecksums(0);
+}
+
+/*
+ * guc hook: report the checksum state recorded in the control file as one
+ * of "on", "inprogress" or "off".
+ */
+const char *
+show_data_checksums(void)
+{
+   if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
+       return "on";
+
+   if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+       return "inprogress";
+
+   /* anything else is reported as disabled */
+   return "off";
+}
+
 /*
  * Returns a fake LSN for unlogged relations.
  *
@@ -7788,6 +7863,16 @@ StartupXLOG(void)
     */
    CompleteCommitTsInitialization();
 
+   /*
+    * If we reach this point with checksums in inprogress state, we notify
+    * the user that they need to manually restart the process to enable
+    * checksums.
+    */
+   if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+       ereport(WARNING,
+               (errmsg("checksum state is \"inprogress\" with no worker"),
+                errhint("Either disable or enable checksums by calling the pg_disable_data_checksums() or pg_enable_data_checksums() functions.")));
+
    /*
     * All done with end-of-recovery actions.
     *
@@ -9541,6 +9626,22 @@ XLogReportParameters(void)
    }
 }
 
+/*
+ * Write an XLOG_CHECKSUMS record carrying the new checksum state
+ */
+static void
+XlogChecksums(ChecksumType new_type)
+{
+   xl_checksum_state state;
+
+   state.new_checksumtype = new_type;
+
+   /* The record consists solely of the xl_checksum_state payload */
+   XLogBeginInsert();
+   XLogRegisterData((char *) &state, sizeof(state));
+   XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS);
+}
+
 /*
  * Update full_page_writes in shared memory, and write an
  * XLOG_FPW_CHANGE record if necessary.
@@ -9969,6 +10070,17 @@ xlog_redo(XLogReaderState *record)
        /* Keep track of full_page_writes */
        lastFullPageWrites = fpw;
    }
+   else if (info == XLOG_CHECKSUMS)
+   {
+       xl_checksum_state state;
+
+       memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
+
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+       ControlFile->data_checksum_version = state.new_checksumtype;
+       UpdateControlFile();
+       LWLockRelease(ControlFileLock);
+   }
 }
 
 #ifdef WAL_DEBUG
index 316edbe3c5857826c74427fbef04a90bc89100b4..b76b2688911eff60182a004f69aa0d032cf86700 100644 (file)
@@ -24,6 +24,7 @@
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "postmaster/checksumhelper.h"
 #include "replication/walreceiver.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
@@ -698,3 +699,61 @@ pg_backup_start_time(PG_FUNCTION_ARGS)
 
    PG_RETURN_DATUM(xtime);
 }
+
+/*
+ * Disables checksums for the cluster.
+ *
+ * Raises an error if checksums are already disabled.  Otherwise a running
+ * checksumhelper is asked to shut down and the checksum state is set to off
+ * right away.
+ */
+Datum
+disable_data_checksums(PG_FUNCTION_ARGS)
+{
+   /*
+    * If we don't need to write new checksums, then clearly they are already
+    * disabled.  This is a user-visible condition rather than an internal
+    * error, so report it with an explicit SQLSTATE.
+    */
+   if (!DataChecksumsNeedWrite())
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("data checksums already disabled")));
+
+   ShutdownChecksumHelperIfRunning();
+
+   SetDataChecksumsOff();
+
+   PG_RETURN_VOID();
+}
+
+/*
+ * Enables checksums for the cluster.
+ *
+ * Argument 0 is the cost delay and argument 1 the cost limit used for
+ * vacuum-like cost-based throttling, to limit system load.  Raises an error
+ * if either value is out of range, if checksums are already fully enabled,
+ * or if the helper process could not be started.
+ *
+ * Switches the cluster to the inprogress state and starts a background
+ * worker that updates checksums on existing data.
+ */
+Datum
+enable_data_checksums(PG_FUNCTION_ARGS)
+{
+   int         cost_delay = PG_GETARG_INT32(0);
+   int         cost_limit = PG_GETARG_INT32(1);
+
+   /* Bad arguments are user errors, so give them a proper SQLSTATE */
+   if (cost_delay < 0)
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                errmsg("cost delay cannot be less than zero")));
+   if (cost_limit <= 0)
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                errmsg("cost limit must be a positive value")));
+
+   /*
+    * Allow state change from "off" or from "inprogress", since this is how
+    * we restart the worker if necessary.
+    */
+   if (DataChecksumsNeedVerify())
+       ereport(ERROR,
+               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                errmsg("data checksums already enabled")));
+
+   SetDataChecksumsInProgress();
+   if (!StartChecksumHelperLauncher(cost_delay, cost_limit))
+       ereport(ERROR,
+               (errmsg("failed to start checksum helper process")));
+
+   PG_RETURN_VOID();
+}
index e9e188682fb417491787ebe3d77164bce0ff33dd..5d567d0cf90f35bc3454769e4c5887b59afa0247 100644 (file)
@@ -1027,6 +1027,11 @@ CREATE OR REPLACE FUNCTION pg_stop_backup (
   RETURNS SETOF record STRICT VOLATILE LANGUAGE internal as 'pg_stop_backup_v2'
   PARALLEL RESTRICTED;
 
+-- Declared here rather than relying on the catalog entry alone so that the
+-- cost_delay and cost_limit arguments get default values (0 and 100).
+CREATE OR REPLACE FUNCTION pg_enable_data_checksums (
+        cost_delay int DEFAULT 0, cost_limit int DEFAULT 100)
+  RETURNS void STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums'
+  PARALLEL RESTRICTED;
+
 -- legacy definition for compatibility with 9.3
 CREATE OR REPLACE FUNCTION
   json_populate_record(base anyelement, from_json json, use_json_as_text boolean DEFAULT false)
index 71c23211b2a11973dc2b4b705149cb6875147b38..ee8f8c1cd33c3e16716b80b37caf54c393fb6c39 100644 (file)
@@ -12,7 +12,8 @@ subdir = src/backend/postmaster
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o fork_process.o \
-   pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o
+OBJS = autovacuum.o bgworker.o bgwriter.o checkpointer.o checksumhelper.o \
+   fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o \
+   walwriter.o
 
 include $(top_srcdir)/src/backend/common.mk
index f651bb49b158134d6086b9074d90cbdd67298271..19529d77ad636873acab59de81d923b6855ee657 100644 (file)
@@ -20,6 +20,7 @@
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "postmaster/bgworker_internals.h"
+#include "postmaster/checksumhelper.h"
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/logicalworker.h"
@@ -129,6 +130,12 @@ static const struct
    },
    {
        "ApplyWorkerMain", ApplyWorkerMain
+   },
+   {
+       "ChecksumHelperLauncherMain", ChecksumHelperLauncherMain
+   },
+   {
+       "ChecksumHelperWorkerMain", ChecksumHelperWorkerMain
    }
 };
 
diff --git a/src/backend/postmaster/checksumhelper.c b/src/backend/postmaster/checksumhelper.c
new file mode 100644 (file)
index 0000000..288ab86
--- /dev/null
@@ -0,0 +1,855 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksumhelper.c
+ *   Background worker to walk the database and write checksums to pages
+ *
+ * When enabling data checksums on a database at initdb time, no extra process
+ * is required as each page is checksummed, and verified, at accesses.  When
+ * enabling checksums on an already running cluster, which was not initialized
+ * with checksums, this helper worker will ensure that all pages are
+ * checksummed before verification of the checksums is turned on.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   src/backend/postmaster/checksumhelper.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/xact.h"
+#include "catalog/pg_database.h"
+#include "commands/vacuum.h"
+#include "common/relpath.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/checksumhelper.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/lmgr.h"
+#include "storage/ipc.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/lsyscache.h"
+#include "utils/ps_status.h"
+
+
+/*
+ * Outcome of processing a database: the return type of ProcessDatabase()
+ * and the type of the "success" field in ChecksumHelperShmemStruct.
+ */
+typedef enum
+{
+   SUCCESSFUL = 0,
+   ABORTED,
+   FAILED
+}          ChecksumHelperResult;
+
+/*
+ * State shared between the backend requesting checksums and the
+ * checksumhelper processes.
+ */
+typedef struct ChecksumHelperShmemStruct
+{
+   /* Test-and-set by StartChecksumHelperLauncher() to allow one launcher */
+   pg_atomic_flag launcher_started;
+   /* Reset to FAILED by ProcessDatabase() before it launches a worker */
+   ChecksumHelperResult success;
+   bool        process_shared_catalogs;
+   /* Set by ShutdownChecksumHelperIfRunning() to request an abort */
+   bool        abort;
+   /* Parameter values set on start */
+   int         cost_delay;
+   int         cost_limit;
+}          ChecksumHelperShmemStruct;
+
+/* Shared memory segment for checksumhelper */
+static ChecksumHelperShmemStruct * ChecksumHelperShmem;
+
+/*
+ * Bookkeeping for work to do
+ *
+ * ChecksumHelperDatabase describes one database to process; ProcessDatabase()
+ * hands its dboid to the worker as the background worker's main argument.
+ */
+typedef struct ChecksumHelperDatabase
+{
+   Oid         dboid;
+   char       *dbname;
+}          ChecksumHelperDatabase;
+
+/*
+ * One relation to process, identified by OID together with its relkind.
+ * NOTE(review): presumably the element type of the list returned by
+ * BuildRelationList() -- confirm against that function.
+ */
+typedef struct ChecksumHelperRelation
+{
+   Oid         reloid;
+   char        relkind;
+}          ChecksumHelperRelation;
+
+/* Prototypes */
+static List *BuildDatabaseList(void);
+static List *BuildRelationList(bool include_shared);
+static List *BuildTempTableList(void);
+static ChecksumHelperResult ProcessDatabase(ChecksumHelperDatabase * db);
+static void launcher_cancel_handler(SIGNAL_ARGS);
+
+/*
+ * StartChecksumHelperLauncher
+ *     Register the checksumhelper launcher as a dynamic background worker.
+ *
+ * cost_delay and cost_limit are stored in shared memory as the parameter
+ * values the helper is started with.
+ *
+ * Returns true if the launcher was registered and false if registration
+ * failed.  Raises an error if an abort has been requested, or if a launcher
+ * has already been started.
+ */
+bool
+StartChecksumHelperLauncher(int cost_delay, int cost_limit)
+{
+   BackgroundWorker bgw;
+   BackgroundWorkerHandle *bgw_handle;
+
+   if (ChecksumHelperShmem->abort)
+   {
+       ereport(ERROR,
+               (errmsg("could not start checksumhelper: has been cancelled")));
+   }
+
+   memset(&bgw, 0, sizeof(bgw));
+   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+   snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ChecksumHelperLauncherMain");
+   snprintf(bgw.bgw_name, BGW_MAXLEN, "checksumhelper launcher");
+   snprintf(bgw.bgw_type, BGW_MAXLEN, "checksumhelper launcher");
+   bgw.bgw_restart_time = BGW_NEVER_RESTART;
+   bgw.bgw_notify_pid = MyProcPid;
+   bgw.bgw_main_arg = (Datum) 0;
+
+   if (!pg_atomic_test_set_flag(&ChecksumHelperShmem->launcher_started))
+   {
+       /* Failed to set means somebody else started */
+       ereport(ERROR,
+               (errmsg("could not start checksumhelper: already running")));
+   }
+
+   /*
+    * Only publish the throttling parameters once we own the launcher flag.
+    * Doing it earlier would let a call that fails with "already running"
+    * overwrite the values of the helper that is in fact running.
+    */
+   ChecksumHelperShmem->cost_delay = cost_delay;
+   ChecksumHelperShmem->cost_limit = cost_limit;
+
+   if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+   {
+       /* Nothing was started, so let a later call try again */
+       pg_atomic_clear_flag(&ChecksumHelperShmem->launcher_started);
+       return false;
+   }
+
+   return true;
+}
+
+/*
+ * ShutdownChecksumHelperIfRunning
+ *     Ask a running checksumhelper to stop
+ *
+ * Processing is not turned off immediately; the request only signals the
+ * checksum process, which ends when it is done with the current block.
+ */
+void
+ShutdownChecksumHelperIfRunning(void)
+{
+   /*
+    * There is only something to shut down when the launcher has been
+    * started, in which case the unlocked test below comes back false.
+    */
+   if (!pg_atomic_unlocked_test_flag(&ChecksumHelperShmem->launcher_started))
+   {
+       /*
+        * A plain store is enough for the abort request; no atomic variable
+        * is needed since setting it several times changes nothing.
+        */
+       ChecksumHelperShmem->abort = true;
+   }
+}
+
+/*
+ * ProcessSingleRelationFork
+ *     Enable checksums in a single relation/fork.
+ *
+ * Every block of the fork is read, marked dirty and written to WAL as a
+ * full page image.  Progress is reported to pgstat, and the abort flag in
+ * shared memory is checked after each block.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
+{
+   BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
+   BlockNumber b;
+   char        activity[NAMEDATALEN * 2 + 128];
+
+   /*
+    * The namespace name does not change while we scan the fork, so look it
+    * up once instead of on every progress report; each lookup returns a
+    * freshly allocated string.
+    */
+   char       *nspname = get_namespace_name(RelationGetNamespace(reln));
+
+   for (b = 0; b < numblocks; b++)
+   {
+       Buffer      buf = ReadBufferExtended(reln, forkNum, b, RBM_NORMAL, strategy);
+
+       /*
+        * Report to pgstat every 100 blocks (so as not to "spam").
+        * BlockNumber is unsigned, hence %u.
+        */
+       if ((b % 100) == 0)
+       {
+           snprintf(activity, sizeof(activity), "processing: %s.%s (%s block %u/%u)",
+                    nspname, RelationGetRelationName(reln),
+                    forkNames[forkNum], b, numblocks);
+           pgstat_report_activity(STATE_RUNNING, activity);
+       }
+
+       /* Need to get an exclusive lock before we can flag as dirty */
+       LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+       /*
+        * Mark the buffer as dirty and force a full page write.  We have to
+        * re-write the page to WAL even if the checksum hasn't changed,
+        * because if there is a replica it might have a slightly different
+        * version of the page with an invalid checksum, caused by unlogged
+        * changes (e.g. hintbits) on the master happening while checksums
+        * were off. This can happen if there was a valid checksum on the page
+        * at one point in the past, so only when checksums are first on, then
+        * off, and then turned on again.
+        */
+       START_CRIT_SECTION();
+       MarkBufferDirty(buf);
+       log_newpage_buffer(buf, false);
+       END_CRIT_SECTION();
+
+       UnlockReleaseBuffer(buf);
+
+       /*
+        * This is the only place where we check if we are asked to abort, the
+        * abortion will bubble up from here.
+        */
+       if (ChecksumHelperShmem->abort)
+           return false;
+
+       vacuum_delay_point();
+   }
+
+   return true;
+}
+
+/*
+ * ProcessSingleRelationByOid
+ *     Process a single relation based on oid.
+ *
+ * Runs in its own transaction: the relation is opened with AccessShareLock
+ * and every fork that exists for it is processed in turn.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual error
+ * is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
+{
+   Relation    rel;
+   ForkNumber  fnum;
+   bool        aborted = false;
+
+   StartTransactionCommand();
+
+   /* Oids are unsigned, so they are printed with %u */
+   elog(DEBUG2, "Checksumhelper starting to process relation %u", relationId);
+   rel = try_relation_open(relationId, AccessShareLock);
+   if (rel == NULL)
+   {
+       /*
+        * Relation no longer exists. We consider this a success, since there
+        * are no pages in it that need checksums, and thus return true.
+        */
+       elog(DEBUG1, "Checksumhelper skipping relation %u as it no longer exists", relationId);
+       CommitTransactionCommand();
+       pgstat_report_activity(STATE_IDLE, NULL);
+       return true;
+   }
+   RelationOpenSmgr(rel);
+
+   for (fnum = 0; fnum <= MAX_FORKNUM; fnum++)
+   {
+       if (smgrexists(rel->rd_smgr, fnum))
+       {
+           /* Stop at the first fork that reports an abort */
+           if (!ProcessSingleRelationFork(rel, fnum, strategy))
+           {
+               aborted = true;
+               break;
+           }
+       }
+   }
+   relation_close(rel, AccessShareLock);
+   elog(DEBUG2, "Checksumhelper done with relation %u: %s",
+        relationId, (aborted ? "aborted" : "finished"));
+
+   CommitTransactionCommand();
+
+   pgstat_report_activity(STATE_IDLE, NULL);
+
+   return !aborted;
+}
+
+/*
+ * ProcessDatabase
+ *     Enable checksums in a single database.
+ *
+ * We do this by launching a dynamic background worker into this database, and
+ * waiting for it to finish.  We have to do this in a separate worker, since
+ * each process can only be connected to one database during its lifetime.
+ *
+ * The shared result slot is preset to FAILED before the worker is started,
+ * so anything other than an explicit result written by the worker is
+ * reported as a failure.  Returns FAILED if the worker cannot be registered,
+ * started or waited for; otherwise returns whatever the worker left in
+ * ChecksumHelperShmem->success.
+ */
+static ChecksumHelperResult
+ProcessDatabase(ChecksumHelperDatabase * db)
+{
+   BackgroundWorker bgw;
+   BackgroundWorkerHandle *bgw_handle;
+   BgwHandleStatus status;
+   pid_t       pid;
+   char        activity[NAMEDATALEN + 64];
+
+   ChecksumHelperShmem->success = FAILED;
+
+   memset(&bgw, 0, sizeof(bgw));
+   bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+   bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+   snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+   snprintf(bgw.bgw_function_name, BGW_MAXLEN, "ChecksumHelperWorkerMain");
+   snprintf(bgw.bgw_name, BGW_MAXLEN, "checksumhelper worker");
+   snprintf(bgw.bgw_type, BGW_MAXLEN, "checksumhelper worker");
+   bgw.bgw_restart_time = BGW_NEVER_RESTART;
+   bgw.bgw_notify_pid = MyProcPid;
+   bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
+
+   if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+   {
+       ereport(LOG,
+               (errmsg("failed to start worker for checksumhelper in \"%s\"",
+                       db->dbname)));
+       return FAILED;
+   }
+
+   status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
+   if (status != BGWH_STARTED)
+   {
+       ereport(LOG,
+               (errmsg("failed to wait for worker startup for checksumhelper in \"%s\"",
+                       db->dbname)));
+       return FAILED;
+   }
+
+   ereport(DEBUG1,
+           (errmsg("started background worker for checksums in \"%s\"",
+                   db->dbname)));
+
+   /*
+    * snprintf() always NUL-terminates within the given size, so pass the
+    * full buffer size.  pid_t is not guaranteed to be int, so cast it to
+    * match the %d conversion.
+    */
+   snprintf(activity, sizeof(activity),
+            "Waiting for worker in database %s (pid %d)", db->dbname, (int) pid);
+   pgstat_report_activity(STATE_RUNNING, activity);
+
+
+   status = WaitForBackgroundWorkerShutdown(bgw_handle);
+   if (status != BGWH_STOPPED)
+   {
+       ereport(LOG,
+               (errmsg("failed to wait for worker shutdown for checksumhelper in \"%s\"",
+                       db->dbname)));
+       return FAILED;
+   }
+
+   if (ChecksumHelperShmem->success == ABORTED)
+       ereport(LOG,
+               (errmsg("checksumhelper was aborted during processing in \"%s\"",
+                       db->dbname)));
+
+   ereport(DEBUG1,
+           (errmsg("background worker for checksums in \"%s\" completed",
+                   db->dbname)));
+
+   pgstat_report_activity(STATE_IDLE, NULL);
+
+   return ChecksumHelperShmem->success;
+}
+
+/*
+ * launcher_exit
+ *     Shared-memory exit callback for the launcher process.
+ *
+ * Resets the shared abort request and clears the launcher_started flag.
+ * Both arguments are unused.
+ */
+static void
+launcher_exit(int code, Datum arg)
+{
+   ChecksumHelperShmem->abort = false;
+   pg_atomic_clear_flag(&ChecksumHelperShmem->launcher_started);
+}
+
+/*
+ * launcher_cancel_handler
+ *     Signal handler that records an abort request in shared memory.
+ *
+ * It only sets the flag; it does not itself stop any processing.
+ */
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+   ChecksumHelperShmem->abort = true;
+}
+
+/*
+ * WaitForAllTransactionsToFinish
+ *     Block until no transaction older than "now" is still running.
+ *
+ * The next xid to be assigned is sampled under XidGenLock.  We then poll
+ * GetOldestActiveTransactionId() every 5 seconds (or sooner if our latch
+ * is set) until the oldest active xid no longer precedes the sampled value.
+ * While waiting, pending interrupts are serviced and the process exits if
+ * the postmaster has died.
+ */
+static void
+WaitForAllTransactionsToFinish(void)
+{
+   TransactionId waitforxid;
+
+   LWLockAcquire(XidGenLock, LW_SHARED);
+   waitforxid = ShmemVariableCache->nextXid;
+   LWLockRelease(XidGenLock);
+
+   while (true)
+   {
+       TransactionId oldestxid;
+       char        activity[128];
+       int         rc;
+
+       /* Honour a pending die() request on every iteration */
+       CHECK_FOR_INTERRUPTS();
+
+       oldestxid = GetOldestActiveTransactionId();
+
+       elog(DEBUG1, "Checking old transactions");
+       if (!TransactionIdPrecedes(oldestxid, waitforxid))
+       {
+           pgstat_report_activity(STATE_IDLE, NULL);
+           return;
+       }
+
+       /*
+        * Oldest running xid is older than us, so wait.  TransactionId is
+        * unsigned and is therefore printed with %u; the buffer is sized so
+        * the full message fits without truncation.
+        */
+       snprintf(activity, sizeof(activity),
+                "Waiting for current transactions to finish (waiting for %u)",
+                waitforxid);
+       pgstat_report_activity(STATE_RUNNING, activity);
+
+       /* Retry every 5 seconds */
+       rc = WaitLatch(MyLatch,
+                      WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                      5000,
+                      WAIT_EVENT_PG_SLEEP);
+       ResetLatch(MyLatch);
+
+       /* Emergency bailout if the postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+   }
+}
+
+/*
+ * ChecksumHelperLauncherMain
+ *     Entry point of the checksumhelper launcher background worker.
+ *
+ * Builds the list of databases, runs ProcessDatabase() on each of them in
+ * turn, and then decides the outcome:
+ *  - if a worker reports ABORTED, return immediately;
+ *  - if a database failed and still exists, turn checksums off and raise
+ *    an ERROR;
+ *  - otherwise force a checkpoint and flag checksums as enabled.
+ *
+ * The Datum argument is unused.
+ */
+void
+ChecksumHelperLauncherMain(Datum arg)
+{
+   List       *DatabaseList;
+   List       *remaining = NIL;
+   ListCell   *lc,
+              *lc2;
+   List       *CurrentDatabases = NIL;
+   bool        found_failed = false;
+
+   on_shmem_exit(launcher_exit, 0);
+
+   ereport(DEBUG1,
+           (errmsg("checksumhelper launcher started")));
+
+   pqsignal(SIGTERM, die);
+   pqsignal(SIGINT, launcher_cancel_handler);
+
+   BackgroundWorkerUnblockSignals();
+
+   init_ps_display(pgstat_get_backend_desc(B_CHECKSUMHELPER_LAUNCHER), "", "", "");
+
+   /*
+    * Initialize a connection to shared catalogs only.
+    */
+   BackgroundWorkerInitializeConnection(NULL, NULL, 0);
+
+   /*
+    * Set up so first run processes shared catalogs, but not once in every
+    * db.
+    */
+   ChecksumHelperShmem->process_shared_catalogs = true;
+
+   /*
+    * Wait for all existing transactions to finish. This will make sure that
+    * we can see all tables in all databases, so we don't miss any. Anything
+    * created after this point is known to have checksums on all pages
+    * already, so we don't have to care about those.
+    */
+   WaitForAllTransactionsToFinish();
+
+   /*
+    * Create a database list.  We don't need to concern ourselves with
+    * rebuilding this list during runtime since any database created after
+    * this process started will be running with checksums turned on from the
+    * start.
+    */
+   DatabaseList = BuildDatabaseList();
+
+   /*
+    * If there are no databases at all to checksum, we can exit immediately
+    * as there is no work to do.
+    */
+   if (DatabaseList == NIL || list_length(DatabaseList) == 0)
+       return;
+
+   foreach(lc, DatabaseList)
+   {
+       ChecksumHelperDatabase *db = (ChecksumHelperDatabase *) lfirst(lc);
+       ChecksumHelperResult processing;
+
+       processing = ProcessDatabase(db);
+
+       if (processing == SUCCESSFUL)
+       {
+           /* Done with this entry; only the list cells remain to be freed */
+           pfree(db->dbname);
+           pfree(db);
+
+           if (ChecksumHelperShmem->process_shared_catalogs)
+
+               /*
+                * Now that one database has completed shared catalogs, we
+                * don't have to process them again.
+                */
+               ChecksumHelperShmem->process_shared_catalogs = false;
+       }
+       else if (processing == FAILED)
+       {
+           /*
+            * Put failed databases on the remaining list.
+            */
+           remaining = lappend(remaining, db);
+       }
+       else
+           /* aborted */
+           return;
+   }
+   list_free(DatabaseList);
+
+   /*
+    * remaining now has all databases not yet processed. This can be because
+    * they failed for some reason, or because the database was dropped
+    * between us getting the database list and trying to process it. Get a
+    * fresh list of databases to detect the second case where the database
+    * was dropped before we had started processing it. If a database still
+    * exists, but enabling checksums failed then we fail the entire
+    * checksumming process and exit with an error.
+    */
+   CurrentDatabases = BuildDatabaseList();
+
+   foreach(lc, remaining)
+   {
+       ChecksumHelperDatabase *db = (ChecksumHelperDatabase *) lfirst(lc);
+       bool        found = false;
+
+       /* Is the failed database still present in the fresh list? */
+       foreach(lc2, CurrentDatabases)
+       {
+           ChecksumHelperDatabase *db2 = (ChecksumHelperDatabase *) lfirst(lc2);
+
+           if (db->dboid == db2->dboid)
+           {
+               found = true;
+               ereport(WARNING,
+                       (errmsg("failed to enable checksums in \"%s\"",
+                               db->dbname)));
+               break;
+           }
+       }
+
+       if (found)
+           found_failed = true;
+       else
+       {
+           ereport(LOG,
+                   (errmsg("database \"%s\" has been dropped, skipping",
+                           db->dbname)));
+       }
+
+       pfree(db->dbname);
+       pfree(db);
+   }
+   list_free(remaining);
+
+   /* Free the extra list of databases */
+   foreach(lc, CurrentDatabases)
+   {
+       ChecksumHelperDatabase *db = (ChecksumHelperDatabase *) lfirst(lc);
+
+       pfree(db->dbname);
+       pfree(db);
+   }
+   list_free(CurrentDatabases);
+
+   if (found_failed)
+   {
+       /* Disable checksums on cluster, because we failed */
+       SetDataChecksumsOff();
+       ereport(ERROR,
+               (errmsg("checksumhelper failed to enable checksums in all databases, aborting")));
+   }
+
+   /*
+    * Force a checkpoint to get everything out to disk.
+    */
+   RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | CHECKPOINT_IMMEDIATE);
+
+   /*
+    * Everything has been processed, so flag checksums enabled.
+    */
+   SetDataChecksumsOn();
+
+   ereport(LOG,
+           (errmsg("checksums enabled, checksumhelper launcher shutting down")));
+}
+
+/*
+ * ChecksumHelperShmemSize
+ *     Report how much shared memory the checksumhelper needs: the size of
+ *     ChecksumHelperShmemStruct rounded up with MAXALIGN.
+ */
+Size
+ChecksumHelperShmemSize(void)
+{
+   return MAXALIGN(sizeof(ChecksumHelperShmemStruct));
+}
+
+/*
+ * ChecksumHelperShmemInit
+ *     Attach to the checksumhelper shared memory area, creating and
+ *     zero-initializing it if it does not exist yet.
+ */
+void
+ChecksumHelperShmemInit(void)
+{
+   Size        shmem_size = ChecksumHelperShmemSize();
+   bool        already_exists;
+
+   ChecksumHelperShmem = (ChecksumHelperShmemStruct *)
+       ShmemInitStruct("ChecksumHelper Data", shmem_size, &already_exists);
+
+   /* Nothing more to do when the struct was set up earlier */
+   if (already_exists)
+       return;
+
+   /* First-time setup: zero the struct, then initialize the flag */
+   MemSet(ChecksumHelperShmem, 0, shmem_size);
+   pg_atomic_init_flag(&ChecksumHelperShmem->launcher_started);
+}
+
+/*
+ * BuildDatabaseList
+ *     Return a List with one ChecksumHelperDatabase entry (oid and a copy
+ *     of the name) per row found in pg_database.
+ *
+ * The launcher uses this list to decide which databases the checksumhelper
+ * workers have to visit.  The scan runs in its own transaction; the entries
+ * are allocated in the memory context that was current when the function was
+ * called.
+ */
+static List *
+BuildDatabaseList(void)
+{
+   MemoryContext callerctx = CurrentMemoryContext;
+   List       *result = NIL;
+   Relation    dbrel;
+   HeapScanDesc scandesc;
+   HeapTuple   tuple;
+
+   StartTransactionCommand();
+
+   dbrel = heap_open(DatabaseRelationId, AccessShareLock);
+   scandesc = heap_beginscan_catalog(dbrel, 0, NULL);
+
+   for (tuple = heap_getnext(scandesc, ForwardScanDirection);
+        HeapTupleIsValid(tuple);
+        tuple = heap_getnext(scandesc, ForwardScanDirection))
+   {
+       Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);
+       ChecksumHelperDatabase *entry;
+       MemoryContext scanctx;
+
+       /* Build the entry in the caller's context, then switch back */
+       scanctx = MemoryContextSwitchTo(callerctx);
+
+       entry = (ChecksumHelperDatabase *) palloc(sizeof(ChecksumHelperDatabase));
+       entry->dboid = HeapTupleGetOid(tuple);
+       entry->dbname = pstrdup(NameStr(dbform->datname));
+       result = lappend(result, entry);
+
+       MemoryContextSwitchTo(scanctx);
+   }
+
+   heap_endscan(scandesc);
+   heap_close(dbrel, AccessShareLock);
+
+   CommitTransactionCommand();
+
+   return result;
+}
+
+/*
+ * BuildRelationList
+ *     Compile a list of all relations in the database
+ *
+ * If include_shared is true, both shared relations and local ones are
+ * returned, else only non-shared relations are returned.
+ * Temp tables and foreign tables are not included.
+ *
+ * Returns a List of palloc'd ChecksumHelperRelation entries (oid and
+ * relkind), allocated in the memory context that was current on entry.
+ */
+static List *
+BuildRelationList(bool include_shared)
+{
+   List       *RelationList = NIL;
+   Relation    rel;
+   HeapScanDesc scan;
+   HeapTuple   tup;
+   MemoryContext ctx = CurrentMemoryContext;
+   MemoryContext oldctx;
+
+   StartTransactionCommand();
+
+   rel = heap_open(RelationRelationId, AccessShareLock);
+   scan = heap_beginscan_catalog(rel, 0, NULL);
+
+   while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+   {
+       Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
+       ChecksumHelperRelation *relentry;
+
+       /* Temp relations are handled separately, see BuildTempTableList */
+       if (pgc->relpersistence == RELPERSISTENCE_TEMP)
+           continue;
+
+       if (pgc->relisshared && !include_shared)
+           continue;
+
+       /*
+        * Foreign tables have by definition no local storage that can be
+        * checksummed, so skip.
+        */
+       if (pgc->relkind == RELKIND_FOREIGN_TABLE)
+           continue;
+
+       /* Allocate the entry in the caller's context, then switch back */
+       oldctx = MemoryContextSwitchTo(ctx);
+       relentry = (ChecksumHelperRelation *) palloc(sizeof(ChecksumHelperRelation));
+
+       relentry->reloid = HeapTupleGetOid(tup);
+       relentry->relkind = pgc->relkind;
+
+       RelationList = lappend(RelationList, relentry);
+
+       MemoryContextSwitchTo(oldctx);
+   }
+
+   heap_endscan(scan);
+   heap_close(rel, AccessShareLock);
+
+   CommitTransactionCommand();
+
+   return RelationList;
+}
+
+/*
+ * BuildTempTableList
+ *     Compile a list of all temporary tables in database
+ *
+ * Scans pg_class in its own transaction and collects the oid of every row
+ * whose relpersistence marks it as temporary.
+ *
+ * Returns a List of oids, allocated in the memory context that was current
+ * on entry.
+ */
+static List *
+BuildTempTableList(void)
+{
+   List       *RelationList = NIL;
+   Relation    rel;
+   HeapScanDesc scan;
+   HeapTuple   tup;
+   MemoryContext ctx = CurrentMemoryContext;
+   MemoryContext oldctx;
+
+   StartTransactionCommand();
+
+   rel = heap_open(RelationRelationId, AccessShareLock);
+   scan = heap_beginscan_catalog(rel, 0, NULL);
+
+   while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+   {
+       Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
+
+       /* Only temporary relations are of interest here */
+       if (pgc->relpersistence != RELPERSISTENCE_TEMP)
+           continue;
+
+       oldctx = MemoryContextSwitchTo(ctx);
+       RelationList = lappend_oid(RelationList, HeapTupleGetOid(tup));
+       MemoryContextSwitchTo(oldctx);
+   }
+
+   heap_endscan(scan);
+   heap_close(rel, AccessShareLock);
+
+   CommitTransactionCommand();
+
+   return RelationList;
+}
+
+/*
+ * Main function for enabling checksums in a single database
+ *
+ * arg carries the oid of the database to connect to.  The worker processes
+ * every relation returned by BuildRelationList(), then waits until all temp
+ * tables that existed at startup are gone.  The outcome is reported to the
+ * launcher through ChecksumHelperShmem->success (ABORTED or SUCCESSFUL; the
+ * worker never writes any other value, so the launcher's preset stays in
+ * place if the worker exits early).
+ */
+void
+ChecksumHelperWorkerMain(Datum arg)
+{
+   Oid         dboid = DatumGetObjectId(arg);
+   List       *RelationList = NIL;
+   List       *InitialTempTableList = NIL;
+   ListCell   *lc;
+   BufferAccessStrategy strategy;
+   bool        aborted = false;
+
+   pqsignal(SIGTERM, die);
+
+   BackgroundWorkerUnblockSignals();
+
+   init_ps_display(pgstat_get_backend_desc(B_CHECKSUMHELPER_WORKER), "", "", "");
+
+   /* Oid is unsigned, hence %u in all messages below */
+   ereport(DEBUG1,
+           (errmsg("checksum worker starting for database oid %u", dboid)));
+
+   BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid, BGWORKER_BYPASS_ALLOWCONN);
+
+   /*
+    * Get a list of all temp tables present as we start in this database. We
+    * need to wait until they are all gone until we are done, since we cannot
+    * access those files and modify them.
+    */
+   InitialTempTableList = BuildTempTableList();
+
+   /*
+    * Enable vacuum cost delay, if any.
+    */
+   VacuumCostDelay = ChecksumHelperShmem->cost_delay;
+   VacuumCostLimit = ChecksumHelperShmem->cost_limit;
+   VacuumCostActive = (VacuumCostDelay > 0);
+   VacuumCostBalance = 0;
+   VacuumPageHit = 0;
+   VacuumPageMiss = 0;
+   VacuumPageDirty = 0;
+
+   /*
+    * Create and set the vacuum strategy as our buffer strategy.
+    */
+   strategy = GetAccessStrategy(BAS_VACUUM);
+
+   RelationList = BuildRelationList(ChecksumHelperShmem->process_shared_catalogs);
+   foreach(lc, RelationList)
+   {
+       ChecksumHelperRelation *rel = (ChecksumHelperRelation *) lfirst(lc);
+
+       if (!ProcessSingleRelationByOid(rel->reloid, strategy))
+       {
+           aborted = true;
+           break;
+       }
+   }
+   list_free_deep(RelationList);
+
+   if (aborted)
+   {
+       ChecksumHelperShmem->success = ABORTED;
+       ereport(DEBUG1,
+               (errmsg("checksum worker aborted in database oid %u", dboid)));
+       return;
+   }
+
+   /*
+    * Wait for all temp tables that existed when we started to go away. This
+    * is necessary since we cannot "reach" them to enable checksums. Any temp
+    * tables created after we started will already have checksums in them
+    * (due to the inprogress state), so those are safe.
+    */
+   while (true)
+   {
+       List       *CurrentTempTables;
+       int         numleft;
+       int         rc;
+       char        activity[64];
+
+       /* Count how many of the initial temp tables still exist */
+       CurrentTempTables = BuildTempTableList();
+       numleft = 0;
+       foreach(lc, InitialTempTableList)
+       {
+           if (list_member_oid(CurrentTempTables, lfirst_oid(lc)))
+               numleft++;
+       }
+       list_free(CurrentTempTables);
+
+       if (numleft == 0)
+           break;
+
+       /* At least one temp table left to wait for */
+       snprintf(activity, sizeof(activity), "Waiting for %d temp tables to be removed", numleft);
+       pgstat_report_activity(STATE_RUNNING, activity);
+
+       /* Retry every 5 seconds */
+       rc = WaitLatch(MyLatch,
+                      WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+                      5000,
+                      WAIT_EVENT_PG_SLEEP);
+       ResetLatch(MyLatch);
+
+       /* Emergency bailout if the postmaster has died */
+       if (rc & WL_POSTMASTER_DEATH)
+           proc_exit(1);
+
+       /* Service a pending die() request before polling again */
+       CHECK_FOR_INTERRUPTS();
+   }
+
+   list_free(InitialTempTableList);
+
+   ChecksumHelperShmem->success = SUCCESSFUL;
+   ereport(DEBUG1,
+           (errmsg("checksum worker completed in database oid %u", dboid)));
+}
index 96ba2163878ed31047aa4a95faf59773101fc4bb..83328a27662a0fbcb6ca619a1f7fee554472fa74 100644 (file)
@@ -4125,6 +4125,11 @@ pgstat_get_backend_desc(BackendType backendType)
        case B_WAL_WRITER:
            backendDesc = "walwriter";
            break;
+       case B_CHECKSUMHELPER_LAUNCHER:
+           backendDesc = "checksumhelper launcher";
+           break;
+       case B_CHECKSUMHELPER_WORKER:
+           backendDesc = "checksumhelper worker";
    }
 
    return backendDesc;
index 1a0bae4c15fb61003192c824c102500f64c8b4ce..8ba29453b91d33b8852ec63a65e7a73bd74036e6 100644 (file)
@@ -1383,7 +1383,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
 
    _tarWriteHeader(tarfilename, NULL, statbuf, false);
 
-   if (!noverify_checksums && DataChecksumsEnabled())
+   if (!noverify_checksums && DataChecksumsNeedVerify())
    {
        char       *filename;
 
index 6eb0d5527e0b8c1bd9d67b34deeeac921e9bb154..84183f82031340cfda828c8f700404f2227e3101 100644 (file)
@@ -198,6 +198,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
        case XLOG_FPW_CHANGE:
        case XLOG_FPI_FOR_HINT:
        case XLOG_FPI:
+       case XLOG_CHECKSUMS:
            break;
        default:
            elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
index 0c86a581c031d64154f0c99418562bfaac785875..853e1e472f6cd2b6ee8b816ac88720a9602a4e11 100644 (file)
@@ -27,6 +27,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checksumhelper.h"
 #include "postmaster/postmaster.h"
 #include "replication/logicallauncher.h"
 #include "replication/slot.h"
@@ -261,6 +262,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
    WalSndShmemInit();
    WalRcvShmemInit();
    ApplyLauncherShmemInit();
+   ChecksumHelperShmemInit();
 
    /*
     * Set up other modules that need some shared memory space
index 5127d98da37ccdf681dd96ea6f08febf9662a34f..f873fb0eea102fbd30f439cef690b7e76e32ab7a 100644 (file)
@@ -9,7 +9,8 @@ have a very low measured incidence according to research on large server farms,
 http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed
 2010/12/22 on -hackers list.
 
-Current implementation requires this be enabled system-wide at initdb time.
+Checksums can be enabled at initdb time, but can also be turned on and off
+using pg_enable_data_checksums()/pg_disable_data_checksums() at runtime.
 
 The checksum is not valid at all times on a data page!!
 The checksum is valid when the page leaves the shared pool and is checked
index dfbda5458fdb253bcb08dbc42437d50b55bebc1e..790e4b860adf054ee88b8adb3c55f7339c09ac26 100644 (file)
@@ -93,7 +93,7 @@ PageIsVerified(Page page, BlockNumber blkno)
     */
    if (!PageIsNew(page))
    {
-       if (DataChecksumsEnabled())
+       if (DataChecksumsNeedVerify())
        {
            checksum = pg_checksum_page((char *) page, blkno);
 
@@ -1168,7 +1168,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
    static char *pageCopy = NULL;
 
    /* If we don't need a checksum, just return the passed-in data */
-   if (PageIsNew(page) || !DataChecksumsEnabled())
+   if (PageIsNew(page) || !DataChecksumsNeedWrite())
        return (char *) page;
 
    /*
@@ -1195,7 +1195,7 @@ void
 PageSetChecksumInplace(Page page, BlockNumber blkno)
 {
    /* If we don't need a checksum, just return */
-   if (PageIsNew(page) || !DataChecksumsEnabled())
+   if (PageIsNew(page) || !DataChecksumsNeedWrite())
        return;
 
    ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
index 260ae264d88054a0d767430aaa056ce5d82dee37..71c2b4eff1613bbdb333ffd109307b21118d0009 100644 (file)
@@ -32,6 +32,7 @@
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
+#include "access/xlog.h"
 #include "access/xlog_internal.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_authid.h"
@@ -68,6 +69,7 @@
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
+#include "storage/checksum.h"
 #include "storage/dsm_impl.h"
 #include "storage/standby.h"
 #include "storage/fd.h"
@@ -419,6 +421,17 @@ static const struct config_enum_entry password_encryption_options[] = {
    {NULL, 0, false}
 };
 
+/*
+ * Accepted values for the data_checksums setting.
+ *
+ * data_checksums used to be a boolean, but was only set by initdb so there
+ * is no need to support variants of boolean input ("true", "yes", "1", ...).
+ * Only the three literal state names below are recognized.
+ */
+static const struct config_enum_entry data_checksum_options[] = {
+   {"on", DATA_CHECKSUMS_ON, true},
+   {"off", DATA_CHECKSUMS_OFF, true},
+   {"inprogress", DATA_CHECKSUMS_INPROGRESS, true},
+   {NULL, 0, false}
+};
+
 /*
  * Options for enum values stored in other modules
  */
@@ -514,7 +527,7 @@ static int  max_identifier_length;
 static int block_size;
 static int segment_size;
 static int wal_block_size;
-static bool data_checksums;
+static int data_checksums_tmp; /* only accessed locally! */
 static bool integer_datetimes;
 static bool assert_enabled;
 
@@ -1683,17 +1696,6 @@ static struct config_bool ConfigureNamesBool[] =
        NULL, NULL, NULL
    },
 
-   {
-       {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS,
-           gettext_noop("Shows whether data checksums are turned on for this cluster."),
-           NULL,
-           GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
-       },
-       &data_checksums,
-       false,
-       NULL, NULL, NULL
-   },
-
    {
        {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE,
            gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."),
@@ -4111,6 +4113,17 @@ static struct config_enum ConfigureNamesEnum[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS,
+           gettext_noop("Shows whether data checksums are turned on for this cluster."),
+           NULL,
+           GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
+       },
+       &data_checksums_tmp,
+       DATA_CHECKSUMS_OFF, data_checksum_options,
+       NULL, NULL, show_data_checksums
+   },
+
    /* End-of-list marker */
    {
        {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
index 3b35835abe3749c0a2a3fa864405ff9f755cac9c..8c11060a2f5c99e1997d37addc0d2a75d8764250 100644 (file)
@@ -26,6 +26,7 @@ SUBDIRS = \
    pg_test_fsync \
    pg_test_timing \
    pg_upgrade \
+   pg_verify_checksums \
    pg_waldump \
    pgbench \
    psql \
index 0fe98a550e15746452f92df2fef6c37f60cd48b3..4bb2b7e6ec3b1b101fbc524600383336ab39e457 100644 (file)
@@ -590,6 +590,15 @@ check_control_data(ControlData *oldctrl,
     * check_for_isn_and_int8_passing_mismatch().
     */
 
+   /*
+    * If checksums have been turned on in the old cluster, but the
+    * checksumhelper has yet to finish, then disallow upgrading. The user
+    * should either let the process finish, or turn off checksums, before
+    * retrying.
+    */
+   if (oldctrl->data_checksum_version == 2)
+       pg_fatal("transition to data checksums not completed in old cluster\n");
+
    /*
     * We might eventually allow upgrades from checksum to no-checksum
     * clusters.
index 7e5e97129471cddd6922b6e5f76b9dc0c4be5cc7..449a703c475c6e3a18e2467140fb01af661fbe37 100644 (file)
@@ -226,7 +226,7 @@ typedef struct
    uint32      large_object;
    bool        date_is_int;
    bool        float8_pass_by_value;
-   bool        data_checksum_version;
+   uint32      data_checksum_version;
 } ControlData;
 
 /*
diff --git a/src/bin/pg_verify_checksums/.gitignore b/src/bin/pg_verify_checksums/.gitignore
new file mode 100644 (file)
index 0000000..d1dcdaf
--- /dev/null
@@ -0,0 +1 @@
+/pg_verify_checksums
diff --git a/src/bin/pg_verify_checksums/Makefile b/src/bin/pg_verify_checksums/Makefile
new file mode 100644 (file)
index 0000000..d162615
--- /dev/null
@@ -0,0 +1,36 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/bin/pg_verify_checksums
+#
+# Builds, installs and cleans the pg_verify_checksums program.
+#
+# Copyright (c) 1998-2018, PostgreSQL Global Development Group
+#
+# src/bin/pg_verify_checksums/Makefile
+#
+#-------------------------------------------------------------------------
+
+PGFILEDESC = "pg_verify_checksums - verify data checksums in an offline cluster"
+PGAPPICON=win32
+
+subdir = src/bin/pg_verify_checksums
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS= pg_verify_checksums.o $(WIN32RES)
+
+all: pg_verify_checksums
+
+# submake-libpgport is an order-only prerequisite, so it is not part of $^
+pg_verify_checksums: $(OBJS) | submake-libpgport
+   $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
+
+install: all installdirs
+   $(INSTALL_PROGRAM) pg_verify_checksums$(X) '$(DESTDIR)$(bindir)/pg_verify_checksums$(X)'
+
+installdirs:
+   $(MKDIR_P) '$(DESTDIR)$(bindir)'
+
+uninstall:
+   rm -f '$(DESTDIR)$(bindir)/pg_verify_checksums$(X)'
+
+# Remove the binary, the object files and the tmp_check directory
+clean distclean maintainer-clean:
+   rm -f pg_verify_checksums$(X) $(OBJS)
+   rm -rf tmp_check
diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c
new file mode 100644 (file)
index 0000000..e37f39b
--- /dev/null
@@ -0,0 +1,315 @@
+/*
+ * pg_verify_checksums
+ *
+ * Verifies page level checksums in an offline cluster
+ *
+ * Copyright (c) 2010-2018, PostgreSQL Global Development Group
+ *
+ * src/bin/pg_verify_checksums/pg_verify_checksums.c
+ */
+
+#define FRONTEND 1
+
+#include "postgres.h"
+#include "catalog/pg_control.h"
+#include "common/controldata_utils.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/checksum_impl.h"
+
+#include <sys/stat.h>
+#include <dirent.h>
+#include <unistd.h>
+
+#include "pg_getopt.h"
+
+
+static int64 files = 0;
+static int64 blocks = 0;
+static int64 badblocks = 0;
+static ControlFileData *ControlFile;
+
+static char *only_relfilenode = NULL;
+static bool debug = false;
+
+static const char *progname;
+
+/*
+ * usage
+ *     Print the command-line help text to stdout.
+ *
+ * Declared with (void) so that the definition is a proper prototype; an
+ * empty parameter list in C leaves the parameters unspecified.
+ */
+static void
+usage(void)
+{
+   printf(_("%s verifies page level checksums in offline PostgreSQL database cluster.\n\n"), progname);
+   printf(_("Usage:\n"));
+   printf(_("  %s [OPTION] [DATADIR]\n"), progname);
+   printf(_("\nOptions:\n"));
+   printf(_(" [-D] DATADIR    data directory\n"));
+   printf(_("  -f             force check even if checksums are disabled\n"));
+   printf(_("  -r relfilenode check only relation with specified relfilenode\n"));
+   printf(_("  -d             debug output, listing all checked blocks\n"));
+   printf(_("  -V, --version  output version information, then exit\n"));
+   printf(_("  -?, --help     show this help, then exit\n"));
+   printf(_("\nIf no data directory (DATADIR) is specified, "
+            "the environment variable PGDATA\nis used.\n\n"));
+   printf(_("Report bugs to <[email protected]>.\n"));
+}
+
+/*
+ * File names that skipfile() rejects, so that scan_directory() never passes
+ * them to scan_file().  The list is terminated by a NULL entry.
+ */
+static const char *skip[] = {
+   "pg_control",
+   "pg_filenode.map",
+   "pg_internal.init",
+   "PG_VERSION",
+   NULL,
+};
+
+/*
+ * skipfile
+ *     Decide whether a directory entry should be left out of the scan.
+ *
+ * Returns true for "." and "..", and for any name listed in skip[];
+ * false otherwise.
+ */
+static bool
+skipfile(char *fn)
+{
+   int         i;
+
+   /* The directory self and parent links are never scanned */
+   if (strcmp(fn, ".") == 0)
+       return true;
+   if (strcmp(fn, "..") == 0)
+       return true;
+
+   /* Compare against each entry of the NULL-terminated skip list */
+   for (i = 0; skip[i] != NULL; i++)
+   {
+       if (strcmp(skip[i], fn) == 0)
+           return true;
+   }
+
+   return false;
+}
+
+/*
+ * scan_file
+ *     Verify the checksum of every block in one relation segment file.
+ *
+ * fn is the path of the file and segmentno its segment number, which is
+ * needed because the block number within the whole relation is mixed into
+ * the checksum.  Updates the global counters files, blocks and badblocks.
+ * Exits the program if the file cannot be opened or read, or if it ends
+ * with a partial block.
+ */
+static void
+scan_file(char *fn, int segmentno)
+{
+   /*
+    * A plain char array carries no alignment guarantee, yet the buffer is
+    * accessed through a PageHeader pointer below.  Wrapping it in a union
+    * with wider members forces suitable alignment.
+    */
+   union
+   {
+       char        data[BLCKSZ];
+       double      force_align_d;
+       int64       force_align_i64;
+   }           buf;
+   PageHeader  header = (PageHeader) buf.data;
+   int         f;
+   int         blockno;
+
+   /* PG_BINARY keeps platforms with text-mode files from altering the data */
+   f = open(fn, O_RDONLY | PG_BINARY, 0);
+   if (f < 0)
+   {
+       fprintf(stderr, _("%s: could not open file \"%s\": %m\n"), progname, fn);
+       exit(1);
+   }
+
+   files++;
+
+   for (blockno = 0;; blockno++)
+   {
+       uint16      csum;
+       int         r = read(f, buf.data, BLCKSZ);
+
+       if (r == 0)
+           break;
+       /* A failed read is an I/O error, not a short block */
+       if (r < 0)
+       {
+           fprintf(stderr, _("%s: could not read block %d in file \"%s\": %s\n"),
+                   progname, blockno, fn, strerror(errno));
+           exit(1);
+       }
+       if (r != BLCKSZ)
+       {
+           fprintf(stderr, _("%s: short read of block %d in file \"%s\", got only %d bytes\n"),
+                   progname, blockno, fn, r);
+           exit(1);
+       }
+       blocks++;
+
+       /* New pages have no checksum yet, so there is nothing to compare */
+       if (PageIsNew(header))
+           continue;
+
+       /* Do the arithmetic in BlockNumber so large segment numbers cannot overflow int */
+       csum = pg_checksum_page(buf.data,
+                               (BlockNumber) segmentno * RELSEG_SIZE + (BlockNumber) blockno);
+       if (csum != header->pd_checksum)
+       {
+           /* Mismatches are only reported when checksums are fully enabled */
+           if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
+               fprintf(stderr, _("%s: checksum verification failed in file \"%s\", block %d: calculated checksum %X but expected %X\n"),
+                       progname, fn, blockno, csum, header->pd_checksum);
+           badblocks++;
+       }
+       else if (debug)
+           fprintf(stderr, _("%s: checksum verified in file \"%s\", block %d: %X\n"),
+                   progname, fn, blockno, csum);
+   }
+
+   close(f);
+}
+
+/*
+ * scan_directory
+ *     Recursively scan basedir/subdir and verify every regular file in it.
+ *
+ * Entries rejected by skipfile() are ignored.  Regular files are passed to
+ * scan_file() together with the segment number parsed from the file name;
+ * directories and symbolic links are descended into.  Exits the program on
+ * any open/stat failure or on an unparsable segment suffix.
+ */
+static void
+scan_directory(char *basedir, char *subdir)
+{
+   char        path[MAXPGPATH];
+   DIR        *dir;
+   struct dirent *de;
+
+   snprintf(path, MAXPGPATH, "%s/%s", basedir, subdir);
+   dir = opendir(path);
+   if (!dir)
+   {
+       fprintf(stderr, _("%s: could not open directory \"%s\": %m\n"),
+               progname, path);
+       exit(1);
+   }
+   while ((de = readdir(dir)) != NULL)
+   {
+       char        fn[MAXPGPATH];
+       struct stat st;
+
+       if (skipfile(de->d_name))
+           continue;
+
+       /* Build the full path before de->d_name is cut up below */
+       snprintf(fn, MAXPGPATH, "%s/%s", path, de->d_name);
+       if (lstat(fn, &st) < 0)
+       {
+           fprintf(stderr, _("%s: could not stat file \"%s\": %m\n"),
+                   progname, fn);
+           exit(1);
+       }
+       if (S_ISREG(st.st_mode))
+       {
+           char       *forkpath,
+                      *segmentpath;
+           int         segmentno = 0;
+
+           /*
+            * Cut off at the segment boundary (".") to get the segment number
+            * in order to mix it into the checksum. Then also cut off at the
+            * fork boundary, to get the relfilenode the file belongs to for
+            * filtering.  Note that this modifies de->d_name in place.
+            */
+           segmentpath = strchr(de->d_name, '.');
+           if (segmentpath != NULL)
+           {
+               *segmentpath++ = '\0';
+               segmentno = atoi(segmentpath);
+               /* atoi() yields 0 for non-numeric text, so 0 is an error here */
+               if (segmentno == 0)
+               {
+                   fprintf(stderr, _("%s: invalid segment number %d in filename \"%s\"\n"),
+                           progname, segmentno, fn);
+                   exit(1);
+               }
+           }
+
+           forkpath = strchr(de->d_name, '_');
+           if (forkpath != NULL)
+               *forkpath++ = '\0';
+
+           /* What is left of de->d_name is compared with the -r argument */
+           if (only_relfilenode && strcmp(only_relfilenode, de->d_name) != 0)
+               /* Relfilenode not to be included */
+               continue;
+
+           scan_file(fn, segmentno);
+       }
+       else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
+           scan_directory(path, de->d_name);
+   }
+   closedir(dir);
+}
+
+/*
+ * main
+ *     Parse the command line, check that the cluster is shut down and has
+ *     checksums enabled (unless -f is given), scan the global, base and
+ *     pg_tblspc directories, and print a summary.
+ *
+ * Returns 1 if any block failed verification, 0 otherwise.  Usage errors
+ * and precondition failures exit with status 1.
+ */
+int
+main(int argc, char *argv[])
+{
+   char       *DataDir = NULL;
+   bool        force = false;
+   int         c;
+   bool        crc_ok;
+
+   set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_verify_checksums"));
+
+   progname = get_progname(argv[0]);
+
+   /* --help and --version are only recognized as the first argument */
+   if (argc > 1)
+   {
+       if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+       {
+           usage();
+           exit(0);
+       }
+       if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+       {
+           puts("pg_verify_checksums (PostgreSQL) " PG_VERSION);
+           exit(0);
+       }
+   }
+
+   while ((c = getopt(argc, argv, "D:fr:d")) != -1)
+   {
+       switch (c)
+       {
+           case 'd':
+               debug = true;
+               break;
+           case 'D':
+               DataDir = optarg;
+               break;
+           case 'f':
+               force = true;
+               break;
+           case 'r':
+               if (atoi(optarg) <= 0)
+               {
+                   fprintf(stderr, _("%s: invalid relfilenode: %s\n"), progname, optarg);
+                   exit(1);
+               }
+               only_relfilenode = pstrdup(optarg);
+               break;
+           default:
+               fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+               exit(1);
+       }
+   }
+
+   /* Without -D, take the first non-option argument, then fall back to PGDATA */
+   if (DataDir == NULL)
+   {
+       if (optind < argc)
+           DataDir = argv[optind++];
+       else
+           DataDir = getenv("PGDATA");
+
+       /* If no DataDir was specified, and none could be found, error out */
+       if (DataDir == NULL)
+       {
+           fprintf(stderr, _("%s: no data directory specified\n"), progname);
+           fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+           exit(1);
+       }
+   }
+
+   /* Complain if any arguments remain */
+   if (optind < argc)
+   {
+       fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"),
+               progname, argv[optind]);
+       fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+               progname);
+       exit(1);
+   }
+
+   /* Read the control file; it tells us whether the cluster is running */
+   ControlFile = get_controlfile(DataDir, progname, &crc_ok);
+   if (!crc_ok)
+   {
+       fprintf(stderr, _("%s: pg_control CRC value is incorrect.\n"), progname);
+       exit(1);
+   }
+
+   if (ControlFile->state != DB_SHUTDOWNED &&
+       ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
+   {
+       fprintf(stderr, _("%s: cluster must be shut down to verify checksums.\n"), progname);
+       exit(1);
+   }
+
+   if (ControlFile->data_checksum_version == 0 && !force)
+   {
+       fprintf(stderr, _("%s: data checksums are not enabled in cluster.\n"), progname);
+       exit(1);
+   }
+
+   /* Scan all files */
+   scan_directory(DataDir, "global");
+   scan_directory(DataDir, "base");
+   scan_directory(DataDir, "pg_tblspc");
+
+   printf(_("Checksum scan completed\n"));
+   /* data_checksum_version is unsigned, so print it with %u */
+   printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
+   printf(_("Files scanned:  %" INT64_MODIFIER "d\n"), files);
+   printf(_("Blocks scanned: %" INT64_MODIFIER "d\n"), blocks);
+   if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_VERSION)
+       printf(_("Blocks left in progress: %" INT64_MODIFIER "d\n"), badblocks);
+   else
+       printf(_("Bad checksums:  %" INT64_MODIFIER "d\n"), badblocks);
+
+   if (badblocks > 0)
+       return 1;
+
+   return 0;
+}
index 421ba6d7755605473333b95bc7ae308766f2b4c7..f21870c6443a1f18ebc07b11d398db048efafdf8 100644 (file)
@@ -154,7 +154,7 @@ extern PGDLLIMPORT int wal_level;
  * of the bits make it to disk, but the checksum wouldn't match.  Also WAL-log
  * them if forced by wal_log_hints=on.
  */
-#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+#define XLogHintBitIsNeeded() (DataChecksumsNeedWrite() || wal_log_hints)
 
 /* Do we need to WAL-log information required only for Hot Standby and logical replication? */
 #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
@@ -257,7 +257,13 @@ extern char *XLogFileNameP(TimeLineID tli, XLogSegNo segno);
 extern void UpdateControlFile(void);
 extern uint64 GetSystemIdentifier(void);
 extern char *GetMockAuthenticationNonce(void);
-extern bool DataChecksumsEnabled(void);
+extern bool DataChecksumsNeedWrite(void);
+extern bool DataChecksumsNeedVerify(void);
+extern bool DataChecksumsInProgress(void);
+extern void SetDataChecksumsInProgress(void);
+extern void SetDataChecksumsOn(void);
+extern void SetDataChecksumsOff(void);
+extern const char *show_data_checksums(void);
 extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
 extern Size XLOGShmemSize(void);
 extern void XLOGShmemInit(void);
index a5c074642f6891c957ba9bf955a8b8ac704f1365..0530fd1a43cd3cb5a9acba0862932ecca8737ee0 100644 (file)
@@ -25,6 +25,7 @@
 #include "lib/stringinfo.h"
 #include "pgtime.h"
 #include "storage/block.h"
+#include "storage/checksum.h"
 #include "storage/relfilenode.h"
 
 
@@ -240,6 +241,12 @@ typedef struct xl_restore_point
    char        rp_name[MAXFNAMELEN];
 } xl_restore_point;
 
+/* Information logged when the data checksum state is changed */
+typedef struct xl_checksum_state
+{
+   ChecksumType new_checksumtype;  /* state being switched to */
+}          xl_checksum_state;
+
 /* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
 typedef struct xl_end_of_recovery
 {
index 5f63efc35520c7ca7df09df6557463d5b4a9cafa..cbeca5776bd5525e6ae62190dc2abbc2d7013d17 100644 (file)
@@ -53,6 +53,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 201804051
+#define CATALOG_VERSION_NO 201804052
 
 #endif
index 773d9e6ebae967829f1a651bef17730b3984f149..33c59f9a630cdc318527e0aed073f0815e11e6d2 100644 (file)
@@ -76,6 +76,7 @@ typedef struct CheckPoint
 #define XLOG_END_OF_RECOVERY           0x90
 #define XLOG_FPI_FOR_HINT              0xA0
 #define XLOG_FPI                       0xB0
+#define XLOG_CHECKSUMS                 0xC0
 
 
 /*
index edf212fcf0f9a738ce63ea267404eee094853d25..02be8a5fbdf991f5b7361f55ac1e6dcaf35e6ef9 100644 (file)
@@ -5583,6 +5583,11 @@ DESCR("pg_controldata recovery state information as a function");
 DATA(insert OID = 3444 ( pg_control_init PGNSP PGUID 12 1 0 0 0 f f f t f v s 0 0 2249 "" "{23,23,23,23,23,23,23,23,23,16,16,23}" "{o,o,o,o,o,o,o,o,o,o,o,o}" "{max_data_alignment,database_block_size,blocks_per_segment,wal_block_size,bytes_per_wal_segment,max_identifier_length,max_index_columns,max_toast_chunk_size,large_object_chunk_size,float4_pass_by_value,float8_pass_by_value,data_page_checksum_version}" _null_ _null_ pg_control_init _null_ _null_ _null_ ));
 DESCR("pg_controldata init state information as a function");
 
+DATA(insert OID = 3996 ( pg_disable_data_checksums     PGNSP PGUID 12 1 0 0 0 f f f t f v s 0 0 2278 "" _null_ _null_ _null_ _null_ _null_ disable_data_checksums _null_ _null_ _null_ ));
+DESCR("disable data checksums");
+DATA(insert OID = 3998 ( pg_enable_data_checksums      PGNSP PGUID 12 1 0 0 0 f f f t f v s 2 0 2278 "23 23" _null_ _null_ "{cost_delay,cost_limit}" _null_ _null_ enable_data_checksums _null_ _null_ _null_ ));
+DESCR("enable data checksums");
+
 /* collation management functions */
 DATA(insert OID = 3445 ( pg_import_system_collations PGNSP PGUID 12 100 0 0 0 f f f t f v u 1 0 23 "4089" _null_ _null_ _null_ _null_ _null_ pg_import_system_collations _null_ _null_ _null_ ));
 DESCR("import collations from operating system");
index be2f59239bf9d7e7512cffedb6ab8e0f838da103..4ed9ed76cc283fa5d1eb9849017a93d0e6c7e31a 100644 (file)
@@ -710,7 +710,9 @@ typedef enum BackendType
    B_STARTUP,
    B_WAL_RECEIVER,
    B_WAL_SENDER,
-   B_WAL_WRITER
+   B_WAL_WRITER,
+   B_CHECKSUMHELPER_LAUNCHER,
+   B_CHECKSUMHELPER_WORKER
 } BackendType;
 
 
diff --git a/src/include/postmaster/checksumhelper.h b/src/include/postmaster/checksumhelper.h
new file mode 100644 (file)
index 0000000..289bf2a
--- /dev/null
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksumhelper.h
+ *   header file for checksum helper background worker
+ *
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/postmaster/checksumhelper.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHECKSUMHELPER_H
+#define CHECKSUMHELPER_H
+
+/* Shared memory */
+extern Size ChecksumHelperShmemSize(void);
+extern void ChecksumHelperShmemInit(void);
+
+/*
+ * Start the background processes for enabling checksums.
+ * NOTE(review): the meaning of the bool result and the units of cost_delay
+ * and cost_limit are not visible from this header -- confirm against
+ * checksumhelper.c and document them here.
+ */
+extern bool StartChecksumHelperLauncher(int cost_delay, int cost_limit);
+
+/* Shutdown the background processes, if any */
+extern void ShutdownChecksumHelperIfRunning(void);
+
+/* Background worker entrypoints */
+extern void ChecksumHelperLauncherMain(Datum arg);
+extern void ChecksumHelperWorkerMain(Datum arg);
+
+#endif                         /* CHECKSUMHELPER_H */
index 85dd10c45a3ebead868d77bb36da34037d9699b7..bd46bf2ce6e8fe3053c2f5bed7757484e0437573 100644 (file)
@@ -194,6 +194,7 @@ typedef PageHeaderData *PageHeader;
  */
 #define PG_PAGE_LAYOUT_VERSION     4
 #define PG_DATA_CHECKSUM_VERSION   1
+#define PG_DATA_CHECKSUM_INPROGRESS_VERSION        2
 
 /* ----------------------------------------------------------------
  *                     page support macros
index 433755e279fa4863c29b4eef8580cbbf0a954b69..902ec29e2a54335b20f19648ded10fd86724889d 100644 (file)
 
 #include "storage/block.h"
 
+/*
+ * Data checksum state.  Used as the payload type of xl_checksum_state.
+ *
+ * NOTE(review): ON and INPROGRESS evaluate to 1 and 2, the same numbers as
+ * PG_DATA_CHECKSUM_VERSION and PG_DATA_CHECKSUM_INPROGRESS_VERSION -- confirm
+ * whether callers rely on that before reordering the values.
+ */
+typedef enum ChecksumType
+{
+   DATA_CHECKSUMS_OFF = 0,     /* checksums disabled */
+   DATA_CHECKSUMS_ON,          /* checksums fully enabled */
+   DATA_CHECKSUMS_INPROGRESS   /* being enabled, not all pages done yet */
+}          ChecksumType;
+
 /*
  * Compute the checksum for a Postgres page.  The page must be aligned on a
  * 4-byte boundary.
index efb206aa75014d83d28da68e21fd29a63891ca33..6469ac94a4746b7bf7991565b828cf128ae76a19 100644 (file)
@@ -12,7 +12,8 @@ subdir = src/test
 top_builddir = ../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS = perl regress isolation modules authentication recovery subscription
+SUBDIRS = perl regress isolation modules authentication recovery subscription \
+           checksum
 
 # Test suites that are not safe by default but can be run if selected
 # by the user via the whitespace-separated list in variable
diff --git a/src/test/checksum/.gitignore b/src/test/checksum/.gitignore
new file mode 100644 (file)
index 0000000..871e943
--- /dev/null
@@ -0,0 +1,2 @@
+# Generated by test suite
+/tmp_check/
diff --git a/src/test/checksum/Makefile b/src/test/checksum/Makefile
new file mode 100644 (file)
index 0000000..f3ad9df
--- /dev/null
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/test/checksum
+#
+# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/test/checksum/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/test/checksum
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+check:
+   $(prove_check)
+
+installcheck:
+   $(prove_installcheck)
+
+clean distclean maintainer-clean:
+   rm -rf tmp_check
+
diff --git a/src/test/checksum/README b/src/test/checksum/README
new file mode 100644 (file)
index 0000000..e3fbd2b
--- /dev/null
@@ -0,0 +1,22 @@
+src/test/checksum/README
+
+Regression tests for data checksums
+===================================
+
+This directory contains a test suite for enabling data checksums
+in a running cluster with streaming replication.
+
+Running the tests
+=================
+
+    make check
+
+or
+
+    make installcheck
+
+NOTE: This creates a temporary installation (in the case of "check"),
+with multiple nodes, be they master or standby(s) for the purpose of
+the tests.
+
+NOTE: This requires the --enable-tap-tests argument to configure.
diff --git a/src/test/checksum/t/001_standby_checksum.pl b/src/test/checksum/t/001_standby_checksum.pl
new file mode 100644 (file)
index 0000000..6a45356
--- /dev/null
@@ -0,0 +1,101 @@
+# Test suite for testing enabling data checksums with streaming replication
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 10;
+
+my $MAX_TRIES = 30;
+
+# Initialize master node
+my $node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+$node_master->start;
+my $backup_name = 'my_backup';
+
+# Take backup
+$node_master->backup($backup_name);
+
+# Create streaming standby linking to master
+my $node_standby_1 = get_new_node('standby_1');
+$node_standby_1->init_from_backup($node_master, $backup_name,
+   has_streaming => 1);
+$node_standby_1->start;
+
+# Create some content on master to have un-checksummed data in the cluster
+$node_master->safe_psql('postgres',
+   "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Wait for standbys to catch up
+$node_master->wait_for_catchup($node_standby_1, 'replay',
+   $node_master->lsn('insert'));
+
+# Check that checksums are turned off
+my $result = $node_master->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+is($result, "off", 'ensure checksums are turned off on master');
+
+$result = $node_standby_1->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+is($result, "off", 'ensure checksums are turned off on standby_1');
+
+# Enable checksums for the cluster
+$node_master->safe_psql('postgres', "SELECT pg_enable_data_checksums();");
+
+# Ensure that the master has switched to inprogress immediately
+$result = $node_master->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+is($result, "inprogress", 'ensure checksums are in progress on master');
+
+# Wait for checksum enable to be replayed
+$node_master->wait_for_catchup($node_standby_1, 'replay');
+
+# Ensure that the standby has switched to inprogress
+$result = $node_standby_1->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+is($result, "inprogress", 'ensure checksums are in progress on standby_1');
+
+# Insert some more data which should be checksummed on INSERT
+$node_master->safe_psql('postgres',
+   "INSERT INTO t VALUES (generate_series(1,10000));");
+
+# Wait for checksums enabled on the master
+for (my $i = 0; $i < $MAX_TRIES; $i++)
+{
+   $result = $node_master->safe_psql('postgres',
+       "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+   last if ($result eq 'on');
+   sleep(1);
+}
+is ($result, "on", 'ensure checksums are enabled on master');
+
+# Wait for checksums enabled on the standby
+for (my $i = 0; $i < $MAX_TRIES; $i++)
+{
+   $result = $node_standby_1->safe_psql('postgres',
+       "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+   last if ($result eq 'on');
+   sleep(1);
+}
+is ($result, "on", 'ensure checksums are enabled on standby');
+
+$result = $node_master->safe_psql('postgres', "SELECT count(a) FROM t");
+is ($result, "20000", 'ensure we can safely read all data with checksums');
+
+# Disable checksums and ensure it's propagated to standby and that we can
+# still read all data
+$node_master->safe_psql('postgres', "SELECT pg_disable_data_checksums();");
+$result = $node_master->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+# The test name must describe the asserted state ("off"), otherwise a
+# failure report points at the wrong phase of the test.
+is($result, "off", 'ensure checksums are disabled on master');
+
+# Wait for checksum disable to be replayed
+$node_master->wait_for_catchup($node_standby_1, 'replay');
+
+# Ensure that the standby has switched to off
+$result = $node_standby_1->safe_psql('postgres',
+   "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';");
+is($result, "off", 'ensure checksums are disabled on standby_1');
+
+$result = $node_master->safe_psql('postgres', "SELECT count(a) FROM t");
+is ($result, "20000", 'ensure we can safely read all data without checksums');
diff --git a/src/test/isolation/expected/checksum_cancel.out b/src/test/isolation/expected/checksum_cancel.out
new file mode 100644 (file)
index 0000000..c449e7b
--- /dev/null
@@ -0,0 +1,27 @@
+Parsed test spec with 2 sessions
+
+starting permutation: c_verify_checksums_off r_seqread c_enable_checksums c_verify_checksums_inprogress c_disable_checksums c_wait_checksums_off
+step c_verify_checksums_off: SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';
+?column?       
+
+t              
+step r_seqread: SELECT * FROM reader_loop();
+reader_loop    
+
+t              
+step c_enable_checksums: SELECT pg_enable_data_checksums(1000);
+pg_enable_data_checksums
+
+               
+step c_verify_checksums_inprogress: SELECT setting = 'inprogress' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';
+?column?       
+
+t              
+step c_disable_checksums: SELECT pg_disable_data_checksums();
+pg_disable_data_checksums
+
+               
+step c_wait_checksums_off: SELECT test_checksums_off();
+test_checksums_off
+
+t              
diff --git a/src/test/isolation/expected/checksum_enable.out b/src/test/isolation/expected/checksum_enable.out
new file mode 100644 (file)
index 0000000..0a68f47
--- /dev/null
@@ -0,0 +1,27 @@
+Parsed test spec with 3 sessions
+
+starting permutation: c_verify_checksums_off w_insert100k r_seqread c_enable_checksums c_wait_for_checksums c_verify_checksums_on
+step c_verify_checksums_off: SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';
+?column?       
+
+t              
+step w_insert100k: SELECT insert_1k(100);
+insert_1k      
+
+t              
+step r_seqread: SELECT * FROM reader_loop();
+reader_loop    
+
+t              
+step c_enable_checksums: SELECT pg_enable_data_checksums();
+pg_enable_data_checksums
+
+               
+step c_wait_for_checksums: SELECT test_checksums_on();
+test_checksums_on
+
+t              
+step c_verify_checksums_on: SELECT setting = 'on' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';
+?column?       
+
+t              
index 99dd7c6bdbfb5ac3ad26599cbc461f0b85bb9bc8..31900cb920bd22b4d0071825775a201800540cc0 100644 (file)
@@ -72,3 +72,7 @@ test: timeouts
 test: vacuum-concurrent-drop
 test: predicate-gist
 test: predicate-gin
+# The checksum_enable suite enables checksums for the cluster, so it must not
+# run before any test that expects the cluster to have checksums turned off
+test: checksum_cancel
+test: checksum_enable
diff --git a/src/test/isolation/specs/checksum_cancel.spec b/src/test/isolation/specs/checksum_cancel.spec
new file mode 100644 (file)
index 0000000..3466a74
--- /dev/null
@@ -0,0 +1,47 @@
+setup
+{
+   CREATE TABLE t1 (a serial, b integer, c text);
+   INSERT INTO t1 (b, c) VALUES (generate_series(1,10000), 'starting values');
+
+   -- Sleeps for one second, then returns whether the data_checksums
+   -- setting reads 'off'.
+   CREATE OR REPLACE FUNCTION test_checksums_off() RETURNS boolean AS $$
+   DECLARE
+       is_off boolean;
+   BEGIN
+       PERFORM pg_sleep(1);
+
+       SELECT s.setting = 'off'
+         INTO is_off
+         FROM pg_catalog.pg_settings AS s
+        WHERE s.name = 'data_checksums';
+
+       RETURN is_off;
+   END;
+   $$ LANGUAGE plpgsql;
+   
+   -- Performs 100 consecutive count(a) scans over t1 and returns true.
+   CREATE OR REPLACE FUNCTION reader_loop() RETURNS boolean AS $$
+   DECLARE
+       counter integer;
+   BEGIN
+       FOR counter IN 1..100 LOOP
+           PERFORM count(a) FROM t1;
+       END LOOP;
+       RETURN True;
+   END;
+   $$ LANGUAGE plpgsql;
+}
+
+teardown
+{
+   DROP FUNCTION reader_loop();
+   DROP FUNCTION test_checksums_off();
+
+   DROP TABLE t1;
+}
+
+session "reader"
+step "r_seqread"                       { SELECT * FROM reader_loop(); }
+
+session "checksums"
+step "c_verify_checksums_off"          { SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums'; }
+step "c_enable_checksums"              { SELECT pg_enable_data_checksums(1000); }
+step "c_disable_checksums"             { SELECT pg_disable_data_checksums(); }
+step "c_verify_checksums_inprogress"   { SELECT setting = 'inprogress' FROM pg_catalog.pg_settings WHERE name = 'data_checksums'; }
+step "c_wait_checksums_off"                { SELECT test_checksums_off(); }
+
+permutation "c_verify_checksums_off" "r_seqread" "c_enable_checksums" "c_verify_checksums_inprogress" "c_disable_checksums" "c_wait_checksums_off"
diff --git a/src/test/isolation/specs/checksum_enable.spec b/src/test/isolation/specs/checksum_enable.spec
new file mode 100644 (file)
index 0000000..ba85dd6
--- /dev/null
@@ -0,0 +1,70 @@
+setup
+{
+   CREATE TABLE t1 (a serial, b integer, c text);
+   INSERT INTO t1 (b, c) VALUES (generate_series(1,10000), 'starting values');
+
+   -- Runs the requested number of rounds.  Each round inserts 1000 rows
+   -- into t1 and then sleeps for 0.1 seconds.  Always returns true.
+   CREATE OR REPLACE FUNCTION insert_1k(iterations int) RETURNS boolean AS $$
+   DECLARE
+       i integer;
+   BEGIN
+       FOR i IN 1..iterations LOOP
+           INSERT INTO t1 (b, c)
+           VALUES (
+               generate_series(1, 1000),
+               array_to_string(
+                   array(select chr(97 + (random() * 25)::int) from generate_series(1,250)),
+                   ''
+               )
+           );
+           PERFORM pg_sleep(0.1);
+       END LOOP;
+       RETURN true;
+   END;
+   $$ LANGUAGE plpgsql;
+   
+   -- Polls the data_checksums setting once per second until it reads 'on'
+   -- and then returns true.  Gives up and returns false after 300 polls, so
+   -- a checksum run that never completes fails the test instead of hanging
+   -- the isolation run forever.
+   CREATE OR REPLACE FUNCTION test_checksums_on() RETURNS boolean AS $$
+   DECLARE
+       attempt integer;
+       enabled boolean;
+   BEGIN
+       FOR attempt IN 1..300 LOOP
+           SELECT setting = 'on' INTO enabled FROM pg_catalog.pg_settings WHERE name = 'data_checksums';
+           IF enabled THEN
+               RETURN true;
+           END IF;
+           PERFORM pg_sleep(1);
+       END LOOP;
+       -- Timed out without ever observing 'on'
+       RETURN false;
+   END;
+   $$ LANGUAGE plpgsql;
+   
+   -- Performs 30 count(a) scans over t1, sleeping 0.2 seconds after each
+   -- one, and returns true.
+   CREATE OR REPLACE FUNCTION reader_loop() RETURNS boolean AS $$
+   DECLARE
+       remaining integer := 30;
+   BEGIN
+       WHILE remaining > 0 LOOP
+           PERFORM count(a) FROM t1;
+           PERFORM pg_sleep(0.2);
+           remaining := remaining - 1;
+       END LOOP;
+       RETURN true;
+   END;
+   $$ LANGUAGE plpgsql;
+}
+
+teardown
+{
+   DROP FUNCTION reader_loop();
+   DROP FUNCTION test_checksums_on();
+   DROP FUNCTION insert_1k(int);
+
+   DROP TABLE t1;
+}
+
+session "writer"
+step "w_insert100k"                { SELECT insert_1k(100); }
+
+session "reader"
+step "r_seqread"               { SELECT * FROM reader_loop(); }
+
+session "checksums"
+step "c_verify_checksums_off"  { SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums'; }
+step "c_enable_checksums"      { SELECT pg_enable_data_checksums(); }
+step "c_wait_for_checksums"        { SELECT test_checksums_on(); }
+step "c_verify_checksums_on"   { SELECT setting = 'on' FROM pg_catalog.pg_settings WHERE name = 'data_checksums'; }
+
+permutation "c_verify_checksums_off" "w_insert100k" "r_seqread" "c_enable_checksums" "c_wait_for_checksums" "c_verify_checksums_on"