aio: wal: padding of partial records.
author Andres Freund <[email protected]>
Thu, 31 Dec 2020 04:26:52 +0000 (20:26 -0800)
committer Andres Freund <[email protected]>
Mon, 11 Jan 2021 23:09:15 +0000 (15:09 -0800)
Author:
Reviewed-By:
Discussion: https://postgr.es/m/
Backpatch:

src/backend/access/rmgrdesc/xlogdesc.c
src/backend/access/transam/xlog.c
src/backend/storage/buffer/bufmgr.c
src/backend/utils/misc/guc.c
src/include/catalog/pg_control.h
src/include/storage/bufmgr.h

index 92cc7ea0735170692a9a36faf7423c23f3fb7ac4..a876915fa1fe52bb310c19a1e881e96be32f4c7f 100644 (file)
@@ -140,6 +140,10 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
                         xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
                         timestamptz_to_str(xlrec.end_time));
    }
+   else if (info == XLOG_WASTE)
+   {
+       appendStringInfo(buf, "waste");
+   }
 }
 
 const char *
index 37b0cf32a46ac909802ec401fc6c8be012312b80..cd7ad390de6f19be486a43d59d9a03385daaf475 100644 (file)
@@ -1513,6 +1513,104 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
    return true;
 }
 
+/*
+ * PadPartialPage
+ *     Insert a zero-filled padding record so that the WAL insert position
+ *     moves from its current location up to 'upto'.
+ *
+ * 'upto' must be a multiple of XLOG_BLCKSZ (asserted).  Nothing is inserted
+ * when the current insert position is not before 'upto'.  Otherwise an
+ * XLOG_NOOP record of RM_XLOG_ID is written whose payload is taken from
+ * XLogCtl->zerobuf.  The record is never smaller than a record header plus
+ * a long data header, so it may extend beyond 'upto'.
+ *
+ * Returns true and sets *final_pad to the end of the padding record when a
+ * record was inserted; returns false and sets *final_pad to
+ * InvalidXLogRecPtr otherwise.
+ *
+ * NOTE(review): the record is tagged XLOG_NOOP, not XLOG_WASTE -- confirm
+ * which one is intended.
+ */
+static bool
+PadPartialPage(XLogRecPtr upto, XLogRecPtr *final_pad)
+{
+   XLogCtlInsert *Insert = &XLogCtl->Insert;
+   bool        padded = false;
+   uint64      startbytepos;
+   uint64      endbytepos = 0;
+   uint64      prevbytepos = 0;
+   size_t      pad_size = 0;
+   XLogRecord  rechdr = {0};
+   pg_crc32c   rdata_crc;
+
+   rechdr.xl_xid = InvalidTransactionId;
+   rechdr.xl_info = XLOG_NOOP;
+   rechdr.xl_rmid = RM_XLOG_ID;
+
+   Assert(upto % XLOG_BLCKSZ == 0);
+
+   START_CRIT_SECTION();
+
+   WALInsertLockAcquire();
+
+   /* Reserve the space for the padding record, if one is needed at all. */
+   SpinLockAcquire(&Insert->insertpos_lck);
+
+   startbytepos = Insert->CurrBytePos;
+   if (startbytepos < XLogRecPtrToBytePos(upto))
+   {
+       padded = true;
+
+       pad_size = upto - XLogBytePosToEndRecPtr(startbytepos);
+       /* The record must at least hold its own headers. */
+       pad_size = Max(pad_size, MAXALIGN64(SizeOfXLogRecord + SizeOfXLogRecordDataHeaderLong));
+
+       endbytepos = startbytepos + pad_size;
+       prevbytepos = Insert->PrevBytePos;
+       Insert->CurrBytePos = endbytepos;
+       Insert->PrevBytePos = startbytepos;
+   }
+
+   SpinLockRelease(&Insert->insertpos_lck);
+
+   if (padded)
+   {
+       XLogRecData recdata[4];
+       /* Exactly one byte is emitted for the id, so it must be a uint8. */
+       uint8       block_id = XLR_BLOCK_ID_DATA_LONG;
+       uint32      rdata_len;
+
+       rdata_len = pad_size - SizeOfXLogRecord - SizeOfXLogRecordDataHeaderLong;
+
+       rechdr.xl_prev = XLogBytePosToRecPtr(prevbytepos);
+       rechdr.xl_tot_len = pad_size;
+
+       /* Chain: record header, long data header (id + length), zeroes. */
+       recdata[0].data = (char *) &rechdr;
+       recdata[0].len = SizeOfXLogRecord;
+       recdata[0].next = &recdata[1];
+
+       recdata[1].data = (char *) &block_id;
+       recdata[1].len = sizeof(uint8);
+       recdata[1].next = &recdata[2];
+
+       recdata[2].data = (char *) &rdata_len;
+       recdata[2].len = sizeof(uint32);
+       recdata[2].next = &recdata[3];
+
+       recdata[3].data = XLogCtl->zerobuf;
+       recdata[3].len = rdata_len;
+       recdata[3].next = NULL;
+
+       /*
+        * CRC covers everything after the record header first, then the
+        * header itself up to (not including) xl_crc.
+        */
+       INIT_CRC32C(rdata_crc);
+       for (XLogRecData *rdt = &recdata[1]; rdt != NULL; rdt = rdt->next)
+           COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
+       COMP_CRC32C(rdata_crc, &rechdr, offsetof(XLogRecord, xl_crc));
+       FIN_CRC32C(rdata_crc);
+       rechdr.xl_crc = rdata_crc;
+
+       CopyXLogRecordToWAL(rechdr.xl_tot_len, false, recdata,
+                           XLogBytePosToRecPtr(startbytepos), XLogBytePosToEndRecPtr(endbytepos));
+   }
+
+   WALInsertLockRelease();
+
+   END_CRIT_SECTION();
+
+   if (padded)
+   {
+       pgWalUsage.wal_bytes += pad_size;
+
+       *final_pad = XLogBytePosToEndRecPtr(endbytepos);
+       return true;
+   }
+   else
+   {
+       *final_pad = InvalidXLogRecPtr;
+       return false;
+   }
+}
+
 /*
  * Checks whether the current buffer page and backup page stored in the
  * WAL record are consistent or not. Before comparing the two pages, a
@@ -3092,6 +3190,54 @@ XLogWriteIssueWrites(XLogWritePos *write_pos, bool flexible)
 #endif
            }
 
+           // FIXME: need to figure out how to not cause problems during
+           // shutdown checkpoints etc.
+           if (ispartialpage && io_wal_pad_partial && XLogInsertAllowed())
+           {
+               XLogRecPtr pad_upto;
+               XLogRecPtr final_pad;
+
+               //elog(DEBUG1, "WALWriteLock padd");
+               LWLockRelease(WALWriteLock);
+
+               pgaio_submit_pending(true);
+
+               pad_upto = newinsertpos - newinsertpos % XLOG_BLCKSZ + XLOG_BLCKSZ;
+
+               if (0)
+               {
+                   XLogRecPtr insert_lsn = XLogBytePosToRecPtr(XLogCtl->Insert.CurrBytePos);
+
+                   elog(DEBUG1, "for min %X/%X, pad %X/%X up to: %X/%X (%d bytes), insert %X/%X",
+                        (uint32)(write_pos->write_init_min >> 32), (uint32) write_pos->write_init_min,
+                        (uint32)(newinsertpos >> 32), (uint32) newinsertpos,
+                        (uint32)(pad_upto >> 32), (uint32) pad_upto,
+                        (int32)(pad_upto - newinsertpos),
+                        (uint32)(insert_lsn >> 32), (uint32) insert_lsn);
+               }
+
+               if (PadPartialPage(pad_upto, &final_pad))
+               {
+                   if (0)
+                   {
+                       elog(DEBUG1, "actually pad req %X/%X new %X/%X up to: %X/%X (%d/%d bytes), started at %X/%X",
+                            (uint32)(write_pos->write_init_min >> 32), (uint32) write_pos->write_init_min,
+                            (uint32)(newinsertpos >> 32), (uint32) newinsertpos,
+                            (uint32)(final_pad >> 32), (uint32) final_pad,
+                            (int32)(final_pad - write_pos->write_init_min),
+                            (int32)(final_pad - newinsertpos),
+                            (uint32)(startwrite_first >> 32), (uint32) startwrite_first);
+                   }
+                   write_pos->write_init_opt = WaitXLogInsertionsToFinish(EndRecPtr);
+               }
+               else
+               {
+                   //elog(LOG, "didn't need to pad");
+               }
+
+               goto write_out_wait;
+           }
+
            if ((write_pos->write_init_opt % XLOG_BLCKSZ) != 0)
                lastnonpartialidx = XLogRecPtrToBufIdx(write_pos->write_init_opt - XLOG_BLCKSZ);
            else
index 679d7e4b202e3026814da8b84a21be2a3aaed3f0..6bbcde40830c12bd7359f1d2f2ea3fe91df119b1 100644 (file)
@@ -163,6 +163,7 @@ bool        io_data_direct = 0;
 bool       io_data_force_async = 1;
 bool       io_wal_direct = 0;
 bool       io_wal_init_direct = 0;
+bool       io_wal_pad_partial = false; /* keep in sync with the GUC boot value */
 int            io_wal_concurrency = 32;
 int            io_wal_target_blocks = 8;
 
index 7f8369e939499c17cb780a92b6b7f5158475d7d1..b8152a080ff60886510451c3f6c4b300e6ad26db 100644 (file)
@@ -2084,6 +2084,15 @@ static struct config_bool ConfigureNamesBool[] =
        NULL, NULL, NULL
    },
 
+   {
+       {"io_wal_pad_partial", PGC_SIGHUP, RESOURCES_DISK,
+           gettext_noop("Pads partial WAL pages upon flush."),
+       },
+       &io_wal_pad_partial,
+       false,
+       NULL, NULL, NULL
+   },
+
    /* End-of-list marker */
    {
        {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
index e3f48158ce7a45a3ebf271043dd28a1b52f5a241..7997dc7bbaa576ea4c50a425a5cce0a425d161a1 100644 (file)
@@ -76,6 +76,7 @@ typedef struct CheckPoint
 #define XLOG_END_OF_RECOVERY           0x90
 #define XLOG_FPI_FOR_HINT              0xA0
 #define XLOG_FPI                       0xB0
+#define XLOG_WASTE                     0xC0
 
 
 /*
index bd868ea2735f81fbd09a16b9d4abe93ce789d5af..779ccd7df397bc5b76e422d3768486e50f7898ff 100644 (file)
@@ -80,6 +80,7 @@ extern bool io_data_direct;
 extern bool io_data_force_async;
 extern bool io_wal_direct;
 extern bool io_wal_init_direct;
+extern bool io_wal_pad_partial;
 extern int io_wal_concurrency;
 extern int io_wal_target_blocks;