#include "access/relation.h"
#include "fmgr.h"
#include "miscadmin.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/smgr.h"
#include "utils/acl.h"
#include "utils/builtins.h"
{
PREWARM_PREFETCH,
PREWARM_READ,
- PREWARM_BUFFER
+ PREWARM_READ_AIO,
+ PREWARM_BUFFER,
+ PREWARM_BUFFER_AIO
} PrewarmType;
static PGAlignedBlock blockbuffer;
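+/* State shared between pg_prewarm() and the streaming read callbacks below */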
+typedef struct prefetch
+{
+ Relation rel;
+ ForkNumber forkNumber;
+ int64 curblock;
+ int64 lastblock;
+ List *bbs;
+} prefetch;
+
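+/*
+ * Streaming read callback for PREWARM_BUFFER_AIO: start reading the next
+ * block into shared buffers, or report that it is already resident.
+ */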
+static PgStreamingReadNextStatus
+prewarm_buffer_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+ prefetch *p = (prefetch *) pgsr_private;
+ Buffer buf;
+ bool already_valid;
+ BlockNumber blockno;
+
+ if (p->curblock <= p->lastblock)
+ blockno = p->curblock++;
+ else
+ return PGSR_NEXT_END;
+
+ buf = ReadBufferAsync(p->rel, p->forkNumber, blockno,
+ RBM_NORMAL, NULL, &already_valid,
+ &aio);
+
+ *read_private = (uintptr_t) buf;
+
+ if (already_valid)
+ {
+ ereport(DEBUG3,
+ errmsg("pgsr %s: found block %d already in buf %d",
+ NameStr(p->rel->rd_rel->relname),
+ blockno, buf),
+ errhidestmt(true),
+ errhidecontext(true));
+ return PGSR_NEXT_NO_IO;
+ }
+ else
+ {
+ ereport(DEBUG3,
+ errmsg("pgsr %s: fetching block %d into buf %d",
+ NameStr(p->rel->rd_rel->relname),
+ blockno, buf),
+ errhidestmt(true),
+ errhidecontext(true));
+ return PGSR_NEXT_IO;
+ }
+}
+
+static void
+prewarm_buffer_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+ prefetch *p = (prefetch *) pgsr_private;
+ Buffer buf = (Buffer) read_private;
+
+ ereport(DEBUG2,
+ errmsg("pgsr %s: releasing buf %d",
+ NameStr(p->rel->rd_rel->relname),
+ buf),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ Assert(BufferIsValid(buf));
+ ReleaseBuffer(buf);
+}
+
+
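+/*
+ * Streaming read callback for PREWARM_READ_AIO: read the next block into a
+ * bounce buffer at the smgr level, bypassing shared buffers.
+ */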
+static PgStreamingReadNextStatus
+prewarm_smgr_next(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private)
+{
+ prefetch *p = (prefetch *) pgsr_private;
+ BlockNumber blockno;
+ PgAioBounceBuffer *bb;
+
+ if (p->curblock <= p->lastblock)
+ blockno = p->curblock++;
+ else
+ return PGSR_NEXT_END;
+
+ if (p->bbs != NIL)
+ {
+ bb = lfirst(list_tail(p->bbs));
+ p->bbs = list_delete_last(p->bbs);
+ }
+ else
+ bb = pgaio_bounce_buffer_get();
+
+ pgaio_assoc_bounce_buffer(aio, bb);
+
+ smgrstartread(aio, p->rel->rd_smgr, p->forkNumber, blockno,
+ pgaio_bounce_buffer_buffer(bb), InvalidBuffer, 0);
+
+ *read_private = (uintptr_t) bb;
+
+ return PGSR_NEXT_IO;
+}
+
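+/*
+ * Release callback for PREWARM_READ_AIO. Nothing to do here: the bounce
+ * buffers are reused and eventually released by the prewarming loop itself.
+ */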
+static void
+prewarm_smgr_release(uintptr_t pgsr_private, uintptr_t read_private)
+{
+}
+
/*
* pg_prewarm(regclass, mode text, fork text,
* first_block int8, last_block int8)
ptype = PREWARM_READ;
else if (strcmp(ttype, "buffer") == 0)
ptype = PREWARM_BUFFER;
+ else if (strcmp(ttype, "buffer_aio") == 0)
+ ptype = PREWARM_BUFFER_AIO;
+ else if (strcmp(ttype, "read_aio") == 0)
+ ptype = PREWARM_READ_AIO;
else
{
ereport(ERROR,
++blocks_done;
}
}
+ else if (ptype == PREWARM_BUFFER_AIO)
+ {
+ PgStreamingRead *pgsr;
+ prefetch p;
+
+ p.rel = rel;
+ p.forkNumber = forkNumber;
+ p.curblock = first_block;
+ p.lastblock = last_block;
+ p.bbs = NIL;
+
+ pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+ prewarm_buffer_next,
+ prewarm_buffer_release);
+
+ for (block = first_block; block <= last_block; ++block)
+ {
+ Buffer buf;
+
+ CHECK_FOR_INTERRUPTS();
+
+ buf = (Buffer) pg_streaming_read_get_next(pgsr);
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+ else
+ elog(ERROR, "prefetch ended early");
+
+ ++blocks_done;
+ }
+
+ if (BufferIsValid(pg_streaming_read_get_next(pgsr)))
+ elog(ERROR, "unexpected additional buffer");
+
+ pg_streaming_read_free(pgsr);
+ }
+ else if (ptype == PREWARM_READ_AIO)
+ {
+ PgStreamingRead *pgsr;
+ prefetch p;
+ ListCell *lc;
+
+ p.rel = rel;
+ p.forkNumber = forkNumber;
+ p.curblock = first_block;
+ p.lastblock = last_block;
+ p.bbs = NIL;
+
+ pgsr = pg_streaming_read_alloc(512, (uintptr_t) &p,
+ prewarm_smgr_next,
+ prewarm_smgr_release);
+
+ for (block = first_block; block <= last_block; ++block)
+ {
+ PgAioBounceBuffer *bb;
+
+ CHECK_FOR_INTERRUPTS();
+
+ bb = (PgAioBounceBuffer *) pg_streaming_read_get_next(pgsr);
+ if (bb == NULL)
+ elog(ERROR, "prefetch ended early");
+
+ p.bbs = lappend(p.bbs, (void *) bb);
+
+ ++blocks_done;
+ }
+
+ if (pg_streaming_read_get_next(pgsr) != 0)
+ elog(ERROR, "unexpected additional buffer");
+
+ pg_streaming_read_free(pgsr);
+
+ foreach(lc, p.bbs)
+ {
+ pgaio_bounce_buffer_release(lfirst(lc));
+ }
+ }
/* Close relation, release lock. */
relation_close(rel, AccessShareLock);
return buffer;
}
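+/*
+ * Extend the relation by one or more blocks and return an exclusively locked,
+ * initialized buffer for the first newly added block. When the FSM is usable,
+ * extend in bulk and record the remaining new blocks as free space.
+ */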
+static Buffer
+ExtendRelation(Relation relation, BulkInsertState bistate, bool use_fsm)
+{
+ Buffer buf;
+ Page page;
+
+ /* FIXME: There should be a better approach to both of these exceptions */
+ if (RELATION_IS_LOCAL(relation) || !use_fsm)
+ {
+ LockRelationForExtension(relation, ExclusiveLock);
+ buf = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+ UnlockRelationForExtension(relation, ExclusiveLock);
+ }
+ else
+ {
+ int extendby;
+ int newblockno;
+
+ /*
+ * Determine number of pages to extend relation by.
+ */
+ {
+ BlockNumber start_nblocks;
+
+ RelationOpenSmgr(relation);
+ start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+ extendby = Max(Min(start_nblocks / 16 * BLCKSZ, (16 * 1024 * 1024)) / BLCKSZ, 1);
+ extendby = Max(Min(extendby, NBuffers / 128), 1);
+ }
+
+ /*
+ * FIXME: There should probably be some chunking, either from here, or
+ * inside BulkExtendBuffered.
+ */
+ buf = BulkExtendBuffered(relation, MAIN_FORKNUM, extendby,
+ bistate ? bistate->strategy : NULL);
+
+ newblockno = BufferGetBlockNumber(buf);
+
+ for (int i = newblockno + 1; i < newblockno + extendby; i++)
+ {
+ RecordPageWithFreeSpace(relation, i, BLCKSZ - SizeOfPageHeaderData);
+ }
+
+ if (use_fsm && extendby > 1)
+ {
+ FreeSpaceMapVacuumRange(relation, newblockno, newblockno + extendby);
+ }
+ }
+
+ /*
+ * We need to initialize the empty new page. Double-check that it really
+ * is empty (this should never happen, but if it does we don't want to
+ * risk wiping out valid data).
+ */
+ page = BufferGetPage(buf);
+
+ if (!PageIsNew(page))
+ elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
+ BufferGetBlockNumber(buf),
+ RelationGetRelationName(relation));
+
+ PageInit(page, BufferGetPageSize(buf), 0);
+ MarkBufferDirty(buf);
+
+ return buf;
+}
+
+
/*
* For each heap page which is all-visible, acquire a pin on the appropriate
* visibility map page, if we haven't already got one.
}
}
-/*
- * Extend a relation by multiple blocks to avoid future contention on the
- * relation extension lock. Our goal is to pre-extend the relation by an
- * amount which ramps up as the degree of contention ramps up, but limiting
- * the result to some sane overall value.
- */
-static void
-RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
-{
- BlockNumber blockNum,
- firstBlock = InvalidBlockNumber;
- int extraBlocks;
- int lockWaiters;
-
- /* Use the length of the lock wait queue to judge how much to extend. */
- lockWaiters = RelationExtensionLockWaiterCount(relation);
- if (lockWaiters <= 0)
- return;
-
- /*
- * It might seem like multiplying the number of lock waiters by as much as
- * 20 is too aggressive, but benchmarking revealed that smaller numbers
- * were insufficient. 512 is just an arbitrary cap to prevent
- * pathological results.
- */
- extraBlocks = Min(512, lockWaiters * 20);
-
- do
- {
- Buffer buffer;
- Page page;
- Size freespace;
-
- /*
- * Extend by one page. This should generally match the main-line
- * extension code in RelationGetBufferForTuple, except that we hold
- * the relation extension lock throughout, and we don't immediately
- * initialize the page (see below).
- */
- buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
- page = BufferGetPage(buffer);
-
- if (!PageIsNew(page))
- elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
- BufferGetBlockNumber(buffer),
- RelationGetRelationName(relation));
-
- /*
- * Add the page to the FSM without initializing. If we were to
- * initialize here, the page would potentially get flushed out to disk
- * before we add any useful content. There's no guarantee that that'd
- * happen before a potential crash, so we need to deal with
- * uninitialized pages anyway, thus avoid the potential for
- * unnecessary writes.
- */
-
- /* we'll need this info below */
- blockNum = BufferGetBlockNumber(buffer);
- freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;
-
- UnlockReleaseBuffer(buffer);
-
- /* Remember first block number thus added. */
- if (firstBlock == InvalidBlockNumber)
- firstBlock = blockNum;
-
- /*
- * Immediately update the bottom level of the FSM. This has a good
- * chance of making this page visible to other concurrently inserting
- * backends, and we want that to happen without delay.
- */
- RecordPageWithFreeSpace(relation, blockNum, freespace);
- }
- while (--extraBlocks > 0);
-
- /*
- * Updating the upper levels of the free space map is too expensive to do
- * for every block, but it's worth doing once at the end to make sure that
- * subsequent insertion activity sees all of those nifty free pages we
- * just inserted.
- */
- FreeSpaceMapVacuumRange(relation, firstBlock, blockNum + 1);
-}
-
/*
* RelationGetBufferForTuple
*
saveFreeSpace = 0;
BlockNumber targetBlock,
otherBlock;
- bool needLock;
len = MAXALIGN(len); /* be conservative */
ReleaseBuffer(buffer);
}
+ /*
+ * FIXME: definitely needs a better solution.
+ */
+ if (!use_fsm && bistate && bistate->current_buf != InvalidBuffer)
+ {
+ BlockNumber blocknum = BufferGetBlockNumber(bistate->current_buf) + 1;
+
+ RelationOpenSmgr(relation);
+
+ if (blocknum < smgrnblocks(relation->rd_smgr, MAIN_FORKNUM))
+ {
+ targetBlock = blocknum;
+
+ goto loop;
+ }
+ }
+
/* Without FSM, always fall out of the loop and extend */
if (!use_fsm)
break;
len + saveFreeSpace);
}
- /*
- * Have to extend the relation.
- *
- * We have to use a lock to ensure no one else is extending the rel at the
- * same time, else we will both try to initialize the same new page. We
- * can skip locking for new or temp relations, however, since no one else
- * could be accessing them.
- */
- needLock = !RELATION_IS_LOCAL(relation);
-
- /*
- * If we need the lock but are not able to acquire it immediately, we'll
- * consider extending the relation by multiple blocks at a time to manage
- * contention on the relation extension lock. However, this only makes
- * sense if we're using the FSM; otherwise, there's no point.
- */
- if (needLock)
- {
- if (!use_fsm)
- LockRelationForExtension(relation, ExclusiveLock);
- else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock))
- {
- /* Couldn't get the lock immediately; wait for it. */
- LockRelationForExtension(relation, ExclusiveLock);
-
- /*
- * Check if some other backend has extended a block for us while
- * we were waiting on the lock.
- */
- targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
- /*
- * If some other waiter has already extended the relation, we
- * don't need to do so; just use the existing freespace.
- */
- if (targetBlock != InvalidBlockNumber)
- {
- UnlockRelationForExtension(relation, ExclusiveLock);
- goto loop;
- }
-
- /* Time to bulk-extend. */
- RelationAddExtraBlocks(relation, bistate);
- }
- }
-
- /*
- * In addition to whatever extension we performed above, we always add at
- * least one block to satisfy our own request.
- *
- * XXX This does an lseek - rather expensive - but at the moment it is the
- * only way to accurately determine how many blocks are in a relation. Is
- * it worth keeping an accurate file length in shared memory someplace,
- * rather than relying on the kernel to do it for us?
- */
- buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
+ buffer = ExtendRelation(relation, bistate, use_fsm);
- /*
- * We need to initialize the empty new page. Double-check that it really
- * is empty (this should never happen, but if it does we don't want to
- * risk wiping out valid data).
- */
page = BufferGetPage(buffer);
- if (!PageIsNew(page))
- elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
- BufferGetBlockNumber(buffer),
- RelationGetRelationName(relation));
-
- PageInit(page, BufferGetPageSize(buffer), 0);
- MarkBufferDirty(buffer);
-
- /*
- * Release the file-extension lock; it's now OK for someone else to extend
- * the relation some more.
- */
- if (needLock)
- UnlockRelationForExtension(relation, ExclusiveLock);
-
/*
* Lock the other buffer. It's guaranteed to be of a lower page number
* than the new page. To conform with the deadlock prevent rules, we ought
#include "catalog/namespace.h"
#include "catalog/pg_enum.h"
#include "catalog/storage.h"
+#include "storage/aio.h"
#include "commands/async.h"
#include "commands/tablecmds.h"
#include "commands/trigger.h"
*/
ProcArrayEndTransaction(MyProc, latestXid);
+ pgaio_at_commit();
+
/*
* This is all post-commit cleanup. Note that if an error is raised here,
* it's too late to abort the transaction. This should be just
AtAbort_Memory();
AtAbort_ResourceOwner();
+ pgaio_at_abort();
+
/*
* Release any LW locks we might be holding as quickly as possible.
* (Regular locks, however, must be held till we finish aborting.)
#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
return false;
}
+
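+/*
+ * Completion callbacks for asynchronous WAL writes and flushes; currently
+ * no-ops.
+ */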
+void
+XLogWriteComplete(PgAioInProgress *aio, uint32 write_no)
+{
+}
+
+void
+XLogFlushComplete(struct PgAioInProgress *aio, uint32 flush_no)
+{
+}
+
/*
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
return true;
}
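+/*
+ * Streaming write completion callback used while initializing a new WAL
+ * segment; currently a no-op.
+ */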
+static void
+XLogFileInitComplete(void *pgsw_private, void *write_private)
+{
+}
+
/*
* Create a new XLOG file segment, or open a pre-existing one.
*
XLogSegNo max_segno;
int fd;
int save_errno;
+ int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
unlink(tmppath);
+ if (io_wal_init_direct)
+ open_flags |= PG_O_DIRECT;
+
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
- fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ fd = BasicOpenFile(tmppath, open_flags);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
- if (!XLogIsNeeded())
+ if (!XLogIsNeeded() && !io_wal_direct)
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
{
int o_direct_flag = 0;
+ /* make O_DIRECT setting only depend on GUC */
+ if (io_wal_direct)
+ o_direct_flag |= PG_O_DIRECT;
+
+#if 0
/* If fsync is disabled, never open in sync mode */
if (!enableFsync)
return 0;
+#endif
+#if 0
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* read if we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
* reason.
- *
+ */
+ if (!XLogIsNeeded())
+ o_direct_flag = PG_O_DIRECT;
+#endif
+
+ /*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* written by walreceiver is normally read by the startup process soon
* after it's written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
- if (!XLogIsNeeded() && !AmWalReceiverProcess())
- o_direct_flag = PG_O_DIRECT;
+ if (AmWalReceiverProcess())
+ return 0;
switch (method)
{
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
- return 0;
+ return o_direct_flag;
#ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN:
return OPEN_SYNC_FLAG | o_direct_flag;
*/
SetProcessingMode(BootstrapProcessing);
bootstrap_signals();
+ InitProcess();
BootStrapXLOG();
BootstrapModeMain();
proc_exit(1); /* should never return */
if (pg_link_canary_is_frontend())
elog(ERROR, "backend is incorrectly linked to frontend functions");
- /*
- * Do backend-like initialization for bootstrap mode
- */
- InitProcess();
-
InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
/* Initialize stuff for bootstrap-file processing */
w.stats_reset
FROM pg_stat_get_wal() w;
+CREATE VIEW pg_stat_aio_backends AS
+ /* FIXME: easier to maintain without column names during development */
+ SELECT s.*
+ FROM pg_stat_get_aio_backends() s;
+
+CREATE VIEW pg_stat_aios AS
+ /* FIXME: easier to maintain without column names during development */
+ SELECT s.*
+ FROM pg_stat_get_aios() s;
+
CREATE VIEW pg_stat_progress_analyze AS
SELECT
S.pid AS pid, S.datid AS datid, D.datname AS datname,
#include "postmaster/bgwriter.h"
#include "postmaster/interrupt.h"
#include "replication/syncrep.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/fd.h"
*/
pgstat_send_bgwriter();
+ /*
+ * Ensure all pending IO is submitted to avoid unnecessary delays
+ * for other processes.
+ */
+ pgaio_submit_pending(true);
+
/*
* This sleep used to be connected to bgwriter_delay, typically 200ms.
* That resulted in more frequent wakeups if not much work to do.
* Checkpointer and bgwriter are no longer related so take the Big
* Sleep.
*/
- pg_usleep(100000L);
+ if (IsCheckpointOnSchedule(progress))
+ pg_usleep(100000L);
}
else if (--absorb_counter <= 0)
{
case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
event_name = "WALSyncMethodAssign";
break;
+ case WAIT_EVENT_WAL_WAIT_FLUSH:
+ event_name = "WALWaitFlush";
+ break;
case WAIT_EVENT_WAL_WAIT_INSERT:
event_name = "WALWaitInsert";
break;
+ case WAIT_EVENT_WAL_WAIT_WRITE:
+ event_name = "WALWaitWrite";
+ break;
case WAIT_EVENT_WAL_WRITE:
event_name = "WALWrite";
break;
case WAIT_EVENT_LOGICAL_SUBXACT_WRITE:
event_name = "LogicalSubxactWrite";
break;
+ case WAIT_EVENT_AIO_SUBMIT:
+ event_name = "AIOSubmit";
+ break;
+ case WAIT_EVENT_AIO_IO_COMPLETE_ANY:
+ event_name = "AIOCompleteAny";
+ break;
+ case WAIT_EVENT_AIO_IO_COMPLETE_ONE:
+ event_name = "AIOCompleteOne";
+ break;
+ case WAIT_EVENT_AIO_REFS:
+ event_name = "AIORefs";
+ break;
+ case WAIT_EVENT_AIO_BACKPRESSURE:
+ event_name = "AIOBackpressure";
+ break;
/* no default case, so that compiler will warn */
}
#include "postmaster/syslogger.h"
#include "replication/logicallauncher.h"
#include "replication/walsender.h"
+#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
*/
InitializeMaxBackends();
+ /*
+ * As AIO might create internal FDs and will trigger shared memory
+ * allocations, this needs to happen before reset_shared() and
+ * set_max_safe_fds().
+ */
+ pgaio_postmaster_init();
+
/*
* Set up shared memory and semaphores.
*/
buf->buf_id = i;
+ pgaio_io_ref_clear(&buf->io_in_progress);
+
/*
* Initially link all the buffers together as unused. Subsequent
* management of this list is done by freelist.c.
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
+#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
int bgwriter_flush_after = 0;
int backend_flush_after = 0;
+
+/*
+ * GUC variables related to the AIO subsystem.
+ *
+ * XXX: It's not entirely clear where these best belong; the WAL-related
+ * ones in particular should probably move.
+ */
+bool io_data_direct = false;
+bool io_data_force_async = true;
+bool io_wal_direct = false;
+bool io_wal_init_direct = false;
+
/* local state for StartBufferIO and related functions */
static BufferDesc *InProgressBuf = NULL;
static bool IsForInput;
ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode, BufferAccessStrategy strategy,
bool *hit);
+static BufferDesc *ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool *hit, bool isLocalBuf);
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
static void PinBuffer_Locked(BufferDesc *buf);
static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
static void BufferSync(int flags);
static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
-static int SyncOneBuffer(int buf_id, bool skip_recently_used,
- WritebackContext *wb_context);
+static int BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+ pg_streaming_write *pgsw);
static void WaitIO(BufferDesc *buf);
static bool StartBufferIO(BufferDesc *buf, bool forInput);
-static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
- uint32 set_flag_bits);
+static void TerminateBufferIO(BufferDesc *buf, bool local, bool syncio,
+ bool clear_dirty, uint32 set_flag_bits);
+static void TerminateSharedBufferIO(BufferDesc *buf,
+ bool sync_io,
+ bool clear_dirty,
+ uint32 set_flag_bits);
static void shared_buffer_write_error_callback(void *arg);
static void local_buffer_write_error_callback(void *arg);
static BufferDesc *BufferAlloc(SMgrRelation smgr,
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static bool AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
* Implementation of PrefetchBuffer() for shared buffers.
*/
PrefetchBufferResult
-PrefetchSharedBuffer(SMgrRelation smgr_reln,
+PrefetchSharedBuffer(Relation reln,
+ SMgrRelation smgr_reln,
ForkNumber forkNum,
BlockNumber blockNum)
{
PrefetchBufferResult result = {InvalidBuffer, false};
- BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
- int buf_id;
-
- Assert(BlockNumberIsValid(blockNum));
-
- /* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
- forkNum, blockNum);
-
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
- /* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
-
- /* If not in buffers, initiate prefetch */
- if (buf_id < 0)
- {
-#ifdef USE_PREFETCH
- /*
- * Try to initiate an asynchronous read. This returns false in
- * recovery if the relation file doesn't exist.
- */
- if (smgrprefetch(smgr_reln, forkNum, blockNum))
- result.initiated_io = true;
-#endif /* USE_PREFETCH */
- }
- else
- {
- /*
- * Report the buffer it was in at that time. The caller may be able
- * to avoid a buffer table lookup, but it's not pinned and it must be
- * rechecked!
- */
- result.recent_buffer = buf_id + 1;
- }
+ bool already_valid;
/*
- * If the block *is* in buffers, we do nothing. This is not really ideal:
- * the block might be just about to be evicted, which would be stupid
- * since we know we are going to need it soon. But the only easy answer
- * is to bump the usage_count, which does not seem like a great solution:
- * when the caller does ultimately touch the block, usage_count would get
- * bumped again, resulting in too much favoritism for blocks that are
- * involved in a prefetch sequence. A real fix would involve some
- * additional per-buffer state, and it's not clear that there's enough of
- * a problem to justify that.
+ * Report the buffer the block is in. The caller may be able to avoid a
+ * buffer table lookup, but the buffer is not kept pinned here and must be
+ * rechecked!
*/
+ result.recent_buffer = ReadBufferAsync(reln, forkNum, blockNum, RBM_NORMAL,
+ NULL, &already_valid, NULL);
+ result.initiated_io = !already_valid;
+
+ if (already_valid)
+ ReleaseBuffer(result.recent_buffer);
+#if 0
+ else
+ pgaio_submit_pending(true);
+#endif
return result;
}
else
{
/* pass it to the shared buffer version */
- return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
+ return PrefetchSharedBuffer(reln, reln->rd_smgr, forkNum, blockNum);
}
}
mode, strategy, &hit);
}
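+/*
+ * Start the actual asynchronous smgr read for a buffer whose contents are not
+ * yet valid, associating the AIO handle with the buffer descriptor.
+ */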
+static void
+ReadBufferInitRead(PgAioInProgress *aio,
+ SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
+ Buffer buf, BufferDesc *bufHdr, int mode)
+{
+ Block bufBlock;
+
+ if (BufferIsLocal(buf))
+ bufBlock = LocalBufHdrGetBlock(bufHdr);
+ else
+ bufBlock = BufHdrGetBlock(bufHdr);
+
+ /*
+ * if we have gotten to this point, we have allocated a buffer for the
+ * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
+ * if it's a shared buffer.
+ */
+ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
+
+ /* FIXME: improve */
+ InProgressBuf = NULL;
+
+ pgaio_io_ref(aio, &bufHdr->io_in_progress);
+
+ smgrstartread(aio, smgr, forkNum, blockNum,
+ bufBlock, buf, mode);
+}
+
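+/*
+ * ReadBufferAsync -- variant of ReadBufferExtended() that only starts a read
+ * instead of waiting for it to complete.
+ *
+ * *already_valid is set to true if the block was already present, in which
+ * case no IO is started. If aiop is non-NULL, the AIO handle used for the
+ * read is returned (or reused) through it and the returned buffer stays
+ * pinned; otherwise the read is issued fire-and-forget and the local pin is
+ * given up.
+ */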
+Buffer
+ReadBufferAsync(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy,
+ bool *already_valid, PgAioInProgress **aiop)
+{
+ Buffer buf;
+ BufferDesc *bufHdr;
+ bool hit;
+ bool release_io;
+ PgAioInProgress *aio;
+
+ if (mode != RBM_NORMAL || blockNum == P_NEW)
+ elog(ERROR, "unsupported");
+
+ /*
+ * AIO is not supported for local buffers yet, so just fall back to
+ * operating synchronously. This is important because otherwise every
+ * caller would need its own non-prefetching fallback implementation.
+ */
+ if (RelationUsesLocalBuffers(reln))
+ {
+ *already_valid = true;
+ return ReadBufferExtended(reln, forkNum, blockNum, mode, strategy);
+ }
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(reln);
+
+ pgstat_count_buffer_read(reln);
+
+ bufHdr = ReadBuffer_start(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum,
+ blockNum, mode, strategy, &hit, false);
+ buf = BufferDescriptorGetBuffer(bufHdr);
+
+ if (hit)
+ {
+ pgstat_count_buffer_hit(reln);
+
+ //Assert(BufferIsPinned(buf));
+ *already_valid = true;
+ return buf;
+ }
+
+ *already_valid = false;
+
+ if (aiop == NULL)
+ {
+ release_io = true;
+ aio = pgaio_io_get();
+ }
+ else if (*aiop == NULL)
+ {
+ release_io = false;
+ *aiop = aio = pgaio_io_get();
+ }
+ else
+ {
+ release_io = false;
+ aio = *aiop;
+ }
+
+ /*
+ * FIXME: Not accurate anymore.
+ * Decrement local pin, but keep shared pin. The latter will be released
+ * upon completion of the IO. Otherwise the buffer could be recycled while
+ * the IO is ongoing.
+ *
+ * FIXME: Make this optional? It's only useful for fire-and-forget style
+ * IO.
+ */
+ if (!release_io)
+ {
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(bufHdr);
+ buf_state += BUF_REFCOUNT_ONE;
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+ else
+ {
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(buf, false);
+ Assert(ref != NULL);
+ Assert(ref->refcount > 0);
+
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, buf);
+ ref->refcount--;
+ ForgetPrivateRefCountEntry(ref);
+ }
+
+ ReadBufferInitRead(aio, reln->rd_smgr, forkNum, blockNum, buf, bufHdr, mode);
+
+ if (release_io)
+ pgaio_io_release(aio);
+
+ return buf;
+}
-/*
- * ReadBuffer_common -- common logic for all ReadBuffer variants
- *
- * *hit is set to true if the request was satisfied from shared buffer cache.
- */
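+/*
+ * ReadBuffer_extend -- the relation-extension case (blockNum == P_NEW),
+ * split out of ReadBuffer_common().
+ */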
static Buffer
-ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ReadBuffer_extend(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
- BufferAccessStrategy strategy, bool *hit)
+ BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
{
+ bool found;
BufferDesc *bufHdr;
Block bufBlock;
- bool found;
- bool isExtend;
- bool isLocalBuf = SmgrIsTemp(smgr);
*hit = false;
- /* Make sure we will have room to remember the buffer pin */
- ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
-
- isExtend = (blockNum == P_NEW);
-
TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
- isExtend);
+ true);
+
+ /* Make sure we will have room to remember the buffer pin */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
- /* Substitute proper block number if caller asked for P_NEW */
- if (isExtend)
- blockNum = smgrnblocks(smgr, forkNum);
+ /* Substitute proper block number */
+ blockNum = smgrnblocks(smgr, forkNum);
if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
- if (found)
- pgBufferUsage.local_blks_hit++;
- else if (isExtend)
- pgBufferUsage.local_blks_written++;
- else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
- mode == RBM_ZERO_ON_ERROR)
- pgBufferUsage.local_blks_read++;
+ pgBufferUsage.local_blks_written++;
}
else
{
- /*
- * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
- * not currently in memory.
- */
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
- if (found)
- pgBufferUsage.shared_blks_hit++;
- else if (isExtend)
- pgBufferUsage.shared_blks_written++;
- else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
- mode == RBM_ZERO_ON_ERROR)
- pgBufferUsage.shared_blks_read++;
+ pgBufferUsage.shared_blks_written++;
}
- /* At this point we do NOT hold any locks. */
-
- /* if it was already in the buffer pool, we're done */
if (found)
{
- if (!isExtend)
- {
- /* Just need to update stats before we exit */
- *hit = true;
- VacuumPageHit++;
-
- if (VacuumCostActive)
- VacuumCostBalance += VacuumCostPageHit;
-
- TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
- smgr->smgr_rnode.node.spcNode,
- smgr->smgr_rnode.node.dbNode,
- smgr->smgr_rnode.node.relNode,
- smgr->smgr_rnode.backend,
- isExtend,
- found);
-
- /*
- * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
- * locked on return.
- */
- if (!isLocalBuf)
- {
- if (mode == RBM_ZERO_AND_LOCK)
- LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
- LW_EXCLUSIVE);
- else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
- LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
- }
-
- return BufferDescriptorGetBuffer(bufHdr);
- }
+ Block bufBlock;
/*
* We get here only in the corner case where we are trying to extend
UnlockBufHdr(bufHdr, buf_state);
} while (!StartBufferIO(bufHdr, true));
}
+
}
/*
- * if we have gotten to this point, we have allocated a buffer for the
- * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
- * if it's a shared buffer.
- *
* Note: if smgrextend fails, we will end up with a buffer that is
* allocated but not marked BM_VALID. P_NEW will still select the same
* block number (because the relation didn't get any longer on disk) and
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
- if (isExtend)
- {
- /* new buffers are zero-filled */
- MemSet((char *) bufBlock, 0, BLCKSZ);
- /* don't set checksum for all-zero page */
- smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
-
- /*
- * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
- * although we're essentially performing a write. At least on linux
- * doing so defeats the 'delayed allocation' mechanism, leading to
- * increased file fragmentation.
- */
- }
- else
- {
- /*
- * Read in the page, unless the caller intends to overwrite it and
- * just wants us to allocate a buffer.
- */
- if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
- MemSet((char *) bufBlock, 0, BLCKSZ);
- else
- {
- instr_time io_start,
- io_time;
-
- if (track_io_timing)
- INSTR_TIME_SET_CURRENT(io_start);
-
- smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+ /* new buffers are zero-filled */
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ /* don't set checksum for all-zero page */
+ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
- if (track_io_timing)
- {
- INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_SUBTRACT(io_time, io_start);
- pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
- INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
- }
-
- /* check for garbage data */
- if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
- PIV_LOG_WARNING | PIV_REPORT_STAT))
- {
- if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
- {
- ereport(WARNING,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("invalid page in block %u of relation %s; zeroing out page",
- blockNum,
- relpath(smgr->smgr_rnode, forkNum))));
- MemSet((char *) bufBlock, 0, BLCKSZ);
- }
- else
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("invalid page in block %u of relation %s",
- blockNum,
- relpath(smgr->smgr_rnode, forkNum))));
- }
- }
- }
+ /*
+ * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
+ * although we're essentially performing a write. At least on linux
+ * doing so defeats the 'delayed allocation' mechanism, leading to
+ * increased file fragmentation.
+ */
/*
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
}
+
+ TerminateBufferIO(bufHdr, isLocalBuf,
+ /* syncio = */ true, /* clear_dirty = */ false,
+ BM_VALID);
+
+ return BufferDescriptorGetBuffer(bufHdr);
+}
+
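+/*
+ * Shared first phase of the ReadBuffer variants: look up or allocate the
+ * buffer. *hit is set if the page is already valid (or was zero-filled for
+ * the RBM_ZERO_* modes); otherwise the caller still has to perform the
+ * actual read, with IO_IN_PROGRESS set for shared buffers.
+ */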
+static BufferDesc *
+ReadBuffer_start(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool *hit, bool isLocalBuf)
+{
+ BufferDesc *bufHdr;
+ bool found;
+
+ Assert(blockNum != P_NEW);
+
+ *hit = false;
+
+ /* Make sure we will have room to remember the buffer pin */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode,
+ smgr->smgr_rnode.backend,
+ false);
+
if (isLocalBuf)
{
- /* Only need to adjust flags */
- uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
+ bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+ if (found)
+ pgBufferUsage.local_blks_hit++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.local_blks_read++;
+ }
+ else
+ {
+ /*
+ * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
+ * not currently in memory.
+ */
+ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
+ strategy, &found);
+ if (found)
+ pgBufferUsage.shared_blks_hit++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.shared_blks_read++;
+ }
- buf_state |= BM_VALID;
- pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ /* At this point we do NOT hold any locks. */
+
+ /* if it was already in the buffer pool, we're done */
+ if (found)
+ {
+ /* Just need to update stats before we exit */
+ *hit = true;
+ VacuumPageHit++;
+
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+ smgr->smgr_rnode.node.spcNode,
+ smgr->smgr_rnode.node.dbNode,
+ smgr->smgr_rnode.node.relNode,
+ smgr->smgr_rnode.backend,
+ false,
+ found);
+
+ /*
+ * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
+ * locked on return.
+ */
+ if (!isLocalBuf)
+ {
+ if (mode == RBM_ZERO_AND_LOCK)
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE);
+ else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+ }
+
+ return bufHdr;
+ }
+ else if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ {
+ /*
+ * The caller intends to overwrite the page and just wants us to
+ * allocate a buffer. Finish IO here, so sync/async don't have to
+ * duplicate the logic.
+ */
+
+ Block bufBlock;
+
+ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+
+ /*
+ * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+ * the page as valid, to make sure that no other backend sees the zeroed
+ * page before the caller has had a chance to initialize it.
+ *
+ * Since no-one else can be looking at the page contents yet, there is no
+ * difference between an exclusive lock and a cleanup-strength lock. (Note
+ * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
+ * they assert that the buffer is already valid.)
+ */
+ if (!isLocalBuf)
+ {
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+ }
+
+ TerminateBufferIO(bufHdr, isLocalBuf,
+ /* syncio = */ true, /* clear_dirty = */ false,
+ BM_VALID);
+
+ *hit = true;
+ }
+
+ return bufHdr;
+}
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * *hit is set to true if the request was satisfied from shared buffer cache.
+ */
+static Buffer
+ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool *hit)
+{
+ Block bufBlock;
+ bool isLocalBuf = SmgrIsTemp(smgr);
+ BufferDesc *bufHdr;
+ Buffer buf;
+ instr_time io_start, io_time;
+
+ if (blockNum == P_NEW)
+ return ReadBuffer_extend(smgr, relpersistence, forkNum,
+ blockNum, mode, strategy,
+ hit, isLocalBuf);
+
+
+ bufHdr = ReadBuffer_start(smgr, relpersistence, forkNum,
+ blockNum, mode, strategy,
+ hit, isLocalBuf);
+
+ if (*hit)
+ return BufferDescriptorGetBuffer(bufHdr);
+
+ /*
+ * if we have gotten to this point, we have allocated a buffer for the
+ * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
+ * if it's a shared buffer.
+ */
+ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
+
+ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+ buf = BufferDescriptorGetBuffer(bufHdr);
+
+ if (isLocalBuf || !io_data_force_async)
+ {
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+
+ smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
+ }
+
+ /* check for garbage data */
+ if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+ PIV_LOG_WARNING | PIV_REPORT_STAT))
+ {
+ if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s; zeroing out page",
+ blockNum,
+ relpath(smgr->smgr_rnode, forkNum))));
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s",
+ blockNum,
+ relpath(smgr->smgr_rnode, forkNum))));
+ }
+
+ TerminateBufferIO(bufHdr, isLocalBuf,
+ /* syncio = */ true, /* clear_dirty = */ false,
+ BM_VALID);
}
else
{
- /* Set BM_VALID, terminate IO, and wake up any waiters */
- TerminateBufferIO(bufHdr, false, BM_VALID);
+ PgAioInProgress* aio = pgaio_io_get();
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(bufHdr);
+ buf_state += BUF_REFCOUNT_ONE;
+ UnlockBufHdr(bufHdr, buf_state);
+
+ ReadBufferInitRead(aio, smgr, forkNum, blockNum, buf, bufHdr, mode);
+ pgaio_io_wait(aio);
+ pgaio_io_release(aio);
}
VacuumPageMiss++;
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
- isExtend,
- found);
+ false,
+ false);
return BufferDescriptorGetBuffer(bufHdr);
}
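+/*
+ * Completion callback for an asynchronous buffer read: verify the page and,
+ * if a shared or local buffer is associated, mark it valid or flag an IO
+ * error.
+ */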
+void
+ReadBufferCompleteRead(Buffer buffer, const AioBufferTag *tag, char *bufdata, int mode, bool failed)
+{
+ /* FIXME: implement track_io_timing */
+
+ if (!failed)
+ {
+ Block bufBlock = (Block) bufdata;
+ BlockNumber blockNum = tag->blockNum;
+
+ /* check for garbage data */
+ if (!PageIsVerified((Page) bufdata, blockNum))
+ {
+ RelFileNode rnode = tag->rnode.node;
+ ForkNumber forkNum = tag->forkNum;
+
+ failed = true;
+
+ if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s; zeroing out page",
+ blockNum,
+ relpathperm(rnode, forkNum))));
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ }
+ else
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s",
+ blockNum,
+ relpathperm(rnode, forkNum))));
+ }
+ }
+ }
+
+ if (BufferIsValid(buffer))
+ {
+ bool islocal = BufferIsLocal(buffer);
+ BufferDesc *bufHdr;
+
+ if (islocal)
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ TerminateBufferIO(bufHdr, islocal,
+ /* syncio = */ false, /* clear_dirty = */ false,
+ failed ? BM_IO_ERROR : BM_VALID);
+ }
+}
+
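+/*
+ * Completion callback for an asynchronous buffer write: clear the buffer's
+ * dirty/IO flags and, if requested, release the content lock held on behalf
+ * of the IO's initiator.
+ */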
+void
+ReadBufferCompleteWrite(Buffer buffer, bool failed, bool release_lock)
+{
+ BufferDesc *bufHdr;
+ bool islocal = BufferIsLocal(buffer);
+
+ if (islocal)
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* FIXME: implement track_io_timing */
+
+ TerminateBufferIO(bufHdr, islocal,
+ /* syncio = */ false, /* clear_dirty = */ true,
+ failed ? BM_IO_ERROR : 0);
+
+ /*
+ * The initiator of the IO is not managing the content lock (it called
+ * LWLockReleaseOwnership()), so we have to release it here.
+ */
+ if (release_lock)
+ LWLockReleaseUnowned(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+}
+
/*
* BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
* buffer. If no buffer exists already, selects a replacement
bool valid;
uint32 buf_state;
+ Assert(blockNum != P_NEW);
+
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
goto retry;
}
- /*
- * Clear out the buffer's tag and flags. We must do this to ensure that
- * linear scans of the buffer array don't think the buffer is valid.
- */
- oldFlags = buf_state & BUF_FLAG_MASK;
- CLEAR_BUFFERTAG(buf->tag);
- buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
- UnlockBufHdr(buf, buf_state);
+ /*
+ * Clear out the buffer's tag and flags. We must do this to ensure that
+ * linear scans of the buffer array don't think the buffer is valid.
+ */
+ oldFlags = buf_state & BUF_FLAG_MASK;
+ CLEAR_BUFFERTAG(buf->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Remove the buffer from the lookup hashtable, if it was in there.
+ */
+ if (oldFlags & BM_TAG_VALID)
+ BufTableDelete(&oldTag, oldHash);
+
+ /*
+ * Done with mapping lock.
+ */
+ LWLockRelease(oldPartitionLock);
+
+ /*
+ * Insert the buffer at the head of the list of free buffers.
+ */
+ StrategyFreeBuffer(buf);
+}
+
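+/*
+ * Try to dissociate a pinned victim buffer from its old identity so it can be
+ * reused for relation extension. Returns false if the buffer was concurrently
+ * pinned or dirtied and therefore cannot be used.
+ */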
+static bool
+bulk_extend_buffer_inval(BufferDesc *buf_hdr)
+{
+ uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) > 0);
+
+ /* can't change while we're holding the pin */
+ if (buf_state & BM_TAG_VALID)
+ {
+ uint32 hash;
+ LWLock *partition_lock;
+ BufferTag tag;
+ uint32 old_flags;
+
+ /* have buffer pinned, so it's safe to read tag without lock */
+ tag = buf_hdr->tag;
+ hash = BufTableHashCode(&tag);
+ partition_lock = BufMappingPartitionLock(hash);
+
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+ /* lock the buffer header */
+ buf_state = LockBufHdr(buf_hdr);
+ old_flags = buf_state & BUF_FLAG_MASK;
+
+ /*
+ * If somebody else pinned the buffer since, or even worse, dirtied it,
+ * give up on this buffer: It's clearly in use.
+ */
+ if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
+ {
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+ UnlockBufHdr(buf_hdr, buf_state);
+ LWLockRelease(partition_lock);
+
+ return false;
+ }
+
+ /*
+ * Clear out the buffer's tag and flags. We must do this to ensure that
+ * linear scans of the buffer array don't think the buffer is valid.
+ */
+ CLEAR_BUFFERTAG(buf_hdr->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ UnlockBufHdr(buf_hdr, buf_state);
+
+ if (old_flags & BM_TAG_VALID)
+ BufTableDelete(&tag, hash);
+
+ LWLockRelease(partition_lock);
+ }
+
+ return true;
+}
+
+
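+/* A single victim buffer being prepared for use in a bulk relation extension */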
+typedef struct BulkExtendOneBuffer
+{
+ BufferDesc *buf_hdr;
+ dlist_node node;
+} BulkExtendOneBuffer;
+
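+/*
+ * State for BulkExtendBuffered(): victim buffers that are fully acquired,
+ * buffers whose clean-out write is still in flight, and entries that are
+ * currently unused.
+ */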
+typedef struct BulkExtendBufferedState
+{
+ int acquired_buffers_count;
+ int pending_buffers_count;
+
+ dlist_head acquired_buffers;
+ dlist_head pending_buffers;
+
+ dlist_head allocated_buffers;
+
+ pg_streaming_write *pgsw;
+
+ BulkExtendOneBuffer ios[];
+} BulkExtendBufferedState;
+
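+/*
+ * Streaming write completion callback: once a victim buffer's write has
+ * finished, try to invalidate it and move it to the acquired list, otherwise
+ * return it to the pool of unused entries.
+ */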
+static void
+bulk_extend_undirty_complete(void *pgsw_private, void *write_private)
+{
+ BulkExtendBufferedState *be_state = (BulkExtendBufferedState * ) pgsw_private;
+ BulkExtendOneBuffer *ex_buf = (BulkExtendOneBuffer *) write_private;
+ BufferTag tag;
+
+ /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+ tag = ex_buf->buf_hdr->tag;
+ be_state->pending_buffers_count--;
+ dlist_delete_from(&be_state->pending_buffers, &ex_buf->node);
+
+ if (bulk_extend_buffer_inval(ex_buf->buf_hdr))
+ {
+ dlist_push_head(&be_state->acquired_buffers, &ex_buf->node);
+ be_state->acquired_buffers_count++;
+ }
+ else
+ {
+ dlist_push_tail(&be_state->allocated_buffers, &ex_buf->node);
+ UnpinBuffer(ex_buf->buf_hdr, true);
+ }
+
+ ScheduleBufferTagForWriteback(&BackendWritebackContext, &tag);
+}
+
+/*
+ * WIP interface to more efficient relation extension.
+ *
+ * Todo:
+ *
+ * - Write initialized buffers - otherwise we'll waste a lot of time doing
+ * another set of memsets at PageInit(), as well as making PageIsVerified()
+ * a lot more expensive (verifying all-zeroes).
+ * - De-duplication of work between concurrent extensions?
+ * - Chunking, to avoid pinning quite as many buffers at once
+ * - Strategy integration
+ * - cleanup
+ *
+ */
+extern Buffer
+BulkExtendBuffered(Relation relation, ForkNumber forkNum, int extendby, BufferAccessStrategy strategy)
+{
+ BulkExtendBufferedState *be_state;
+ bool need_extension_lock = !RELATION_IS_LOCAL(relation);
+ BlockNumber start_nblocks;
+ SMgrRelation smgr;
+ BufferDesc *return_buf_hdr = NULL;
+ char relpersistence = relation->rd_rel->relpersistence;
+ dlist_iter iter;
+ BlockNumber extendto;
+ bool first;
+
+ RelationOpenSmgr(relation);
+ smgr = relation->rd_smgr;
+
+ be_state = palloc0(offsetof(BulkExtendBufferedState, ios) + sizeof(BulkExtendOneBuffer) * extendby);
+
+ dlist_init(&be_state->acquired_buffers);
+ dlist_init(&be_state->pending_buffers);
+ dlist_init(&be_state->allocated_buffers);
+
+ for (int i = 0; i < extendby; i++)
+ {
+ dlist_push_tail(&be_state->allocated_buffers, &be_state->ios[i].node);
+ }
+
+ be_state->pgsw = pg_streaming_write_alloc(128, be_state, bulk_extend_undirty_complete);
+
+ while (be_state->acquired_buffers_count < extendby)
+ {
+ uint32 cur_old_flags;
+ uint32 cur_buf_state;
+ BufferDesc *cur_buf_hdr;
+ BulkExtendOneBuffer *cur_ex_buf = NULL;
+ bool buffer_usable;
+ bool buffer_io;
+
+ /*
+ * If there's buffers being written out that might or might not be
+ * available, depending on whether they've concurrently been pinned,
+ * wait for all of those to finish, and check again.
+ */
+ if ((be_state->acquired_buffers_count + be_state->pending_buffers_count) >= extendby)
+ {
+ pg_streaming_write_wait_all(be_state->pgsw);
+ continue;
+ }
+
+ Assert(!dlist_is_empty(&be_state->allocated_buffers));
+ cur_ex_buf = dlist_container(BulkExtendOneBuffer, node,
+ dlist_pop_head_node(&be_state->allocated_buffers));
+
+ ReservePrivateRefCountEntry();
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ cur_buf_hdr = StrategyGetBuffer(NULL, &cur_buf_state);
+ cur_ex_buf->buf_hdr = cur_buf_hdr;
+
+ Assert(BUF_STATE_GET_REFCOUNT(cur_buf_state) == 0);
+
+ /* Must copy buffer flags while we still hold the spinlock */
+ cur_old_flags = cur_buf_state & BUF_FLAG_MASK;
+
+ /* Pin the buffer and then release the buffer spinlock */
+ PinBuffer_Locked(cur_buf_hdr);
+
+ if (cur_old_flags & BM_DIRTY)
+ {
+ LWLock *content_lock;
+ PgAioInProgress *aio;
+
+ content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+ /*
+ * NB: this protects against deadlocks due to holding multiple
+ * buffer locks, and also avoids unnecessary blocking (see
+ * BufferAlloc() for the latter).
+ */
+ if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+ {
+ aio = pg_streaming_write_get_io(be_state->pgsw);
+
+ /* XXX: could use strategy reject logic here too */
+ if (AsyncFlushBuffer(aio, cur_buf_hdr, NULL))
+ {
+ buffer_usable = true;
+ buffer_io = true;
+
+ pg_streaming_write_write(be_state->pgsw, aio, cur_ex_buf);
+ }
+ else
+ {
+ buffer_io = false;
+
+ if (!bulk_extend_buffer_inval(cur_buf_hdr))
+ buffer_usable = false;
+ else
+ buffer_usable = true;
+
+ LWLockRelease(content_lock);
+ }
+ }
+ else
+ {
+ /*
+ * Someone else has locked the buffer, so give it up and loop
+ * back to get another one.
+ */
+ buffer_usable = false;
+ buffer_io = false;
+ }
+ }
+ else
+ {
+ /*
+ * This buffer can be used, unless it's getting pinned
+ * concurrently.
+ */
+ buffer_io = false;
+
+ if (!bulk_extend_buffer_inval(cur_buf_hdr))
+ buffer_usable = false;
+ else
+ buffer_usable = true;
+ }
+
+ if (buffer_io)
+ {
+ dlist_push_tail(&be_state->pending_buffers, &cur_ex_buf->node);
+ be_state->pending_buffers_count++;
+ }
+ else if (buffer_usable)
+ {
+ dlist_push_head(&be_state->acquired_buffers, &cur_ex_buf->node);
+ be_state->acquired_buffers_count++;
+ }
+ else
+ {
+ UnpinBuffer(cur_ex_buf->buf_hdr, true);
+ dlist_push_tail(&be_state->allocated_buffers, &cur_ex_buf->node);
+ }
+ }
+
+ /* FIXME: shouldn't be needed */
+ pg_streaming_write_wait_all(be_state->pgsw);
+ pg_streaming_write_free(be_state->pgsw);
/*
- * Remove the buffer from the lookup hashtable, if it was in there.
+ * We now have our hands on N buffers that are guaranteed to be clean
+ * (since they are pinned they cannot be reused by other backends).
+ *
+ * Next, acquire the extension lock, extend the relation, and point the
+ * victim buffers acquired above at the newly extended part of the relation.
*/
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
+ if (need_extension_lock)
+ LockRelationForExtension(relation, ExclusiveLock);
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ start_nblocks = smgrnblocks(relation->rd_smgr, MAIN_FORKNUM);
+ extendto = start_nblocks;
/*
- * Insert the buffer at the head of the list of free buffers.
+ * Set up identities of all the new buffers. This way there cannot be race
+ * conditions where other backends lock the returned page first.
*/
- StrategyFreeBuffer(buf);
+ first = true;
+ dlist_foreach(iter, &be_state->acquired_buffers)
+ {
+ BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+ BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+ BufferTag new_tag;
+ uint32 new_hash;
+ int existing_buf;
+ uint32 buf_state;
+ LWLock *partition_lock;
+
+ Assert(extendto < start_nblocks + extendby);
+ Assert(ex_buf->buf_hdr);
+
+ INIT_BUFFERTAG(new_tag, smgr->smgr_rnode.node, forkNum, extendto);
+ new_hash = BufTableHashCode(&new_tag);
+
+ partition_lock = BufMappingPartitionLock(new_hash);
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+ existing_buf = BufTableInsert(&new_tag, new_hash, new_buf_hdr->buf_id);
+ if (existing_buf >= 0)
+ {
+ /* FIXME: This is probably possible when extension fails due to ENOSPC or such */
+ elog(ERROR, "buffer beyond EOF");
+ }
+
+ /* lock to install new identity */
+ buf_state = LockBufHdr(new_buf_hdr);
+
+ buf_state |= BM_TAG_VALID | BM_IO_IN_PROGRESS | BUF_USAGECOUNT_ONE * BM_MAX_USAGE_COUNT;
+
+ if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
+ buf_state |= BM_PERMANENT;
+
+ new_buf_hdr->tag = new_tag;
+ UnlockBufHdr(new_buf_hdr, buf_state);
+
+ LWLockRelease(partition_lock);
+
+ if (first)
+ {
+ return_buf_hdr = ex_buf->buf_hdr;
+ first = false;
+ }
+
+ extendto++;
+ }
+ Assert(extendto == start_nblocks + extendby);
+
+ /* finally extend the relation */
+ smgrzeroextend(relation->rd_smgr, forkNum, start_nblocks,
+ extendby, false);
+
+ /* Ensure that the returned buffer cannot be reached by another backend first */
+ LWLockAcquire(BufferDescriptorGetContentLock(return_buf_hdr), LW_EXCLUSIVE);
+
+ /* Mark all buffers as having completed */
+ dlist_foreach(iter, &be_state->acquired_buffers)
+ {
+ BulkExtendOneBuffer *ex_buf = dlist_container(BulkExtendOneBuffer, node, iter.cur);
+ BufferDesc *new_buf_hdr = ex_buf->buf_hdr;
+ Block new_buf_block;
+ uint32 buf_state;
+
+ new_buf_block = BufHdrGetBlock(new_buf_hdr);
+
+ /* new buffers are zero-filled */
+ memset((char *) __builtin_assume_aligned(new_buf_block, 4096), 0, BLCKSZ);
+
+ buf_state = LockBufHdr(new_buf_hdr);
+ buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+ buf_state |= BM_VALID;
+ UnlockBufHdr(new_buf_hdr, buf_state);
+ ConditionVariableBroadcast(BufferDescriptorGetIOCV(new_buf_hdr));
+
+ if (new_buf_hdr != return_buf_hdr)
+ UnpinBuffer(new_buf_hdr, true);
+ }
+
+ if (need_extension_lock)
+ UnlockRelationForExtension(relation, ExclusiveLock);
+
+ return BufferDescriptorGetBuffer(return_buf_hdr);
}
/*
}
}
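+/*
+ * Completion callback for buffer writes issued by BufferSync() and
+ * BgBufferSync(): unpin the buffer and schedule its tag for writeback.
+ */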
+static void
+buffer_sync_complete(void *pgsw_private, void *write_private)
+{
+ WritebackContext *wb_context = (WritebackContext *) pgsw_private;
+ BufferDesc *bufHdr = (BufferDesc *) write_private;
+ BufferTag tag;
+
+ /* the buffer lock has already been released by ReadBufferCompleteWrite */
+
+ tag = bufHdr->tag;
+ UnpinBuffer(bufHdr, true);
+
+ if (wb_context)
+ ScheduleBufferTagForWriteback(wb_context, &tag);
+}
+
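+/*
+ * Write out a single to-be-checkpointed buffer using the streaming write
+ * interface, if it is still valid and dirty. Returns true if a write was
+ * initiated.
+ */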
+static bool
+BufferSyncWriteOne(pg_streaming_write *pgsw, BufferDesc *bufHdr)
+{
+ uint32 buf_state;
+ bool did_write = false;
+
+ ReservePrivateRefCountEntry();
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ buf_state = LockBufHdr(bufHdr);
+
+ if ((buf_state & BM_VALID) && (buf_state & BM_DIRTY))
+ {
+ LWLock *content_lock;
+ PgAioInProgress *aio;
+
+ PinBuffer_Locked(bufHdr);
+
+ content_lock = BufferDescriptorGetContentLock(bufHdr);
+
+ aio = pg_streaming_write_get_io(pgsw);
+
+ /*
+ * If there are pre-existing IOs in-flight, we can't block on the
+ * content lock, as that could lead to a deadlock. So first wait for
+ * the outstanding IO, and only then block on acquiring the lock.
+ */
+ if (pg_streaming_write_inflight(pgsw) == 0 ||
+ !LWLockConditionalAcquire(content_lock, LW_SHARED))
+ {
+ pg_streaming_write_wait_all(pgsw);
+ LWLockAcquire(content_lock, LW_SHARED);
+ }
+
+ if (AsyncFlushBuffer(aio, bufHdr, NULL))
+ {
+ pg_streaming_write_write(pgsw, aio, bufHdr);
+ did_write = true;
+ }
+ else
+ {
+ LWLockRelease(content_lock);
+ UnpinBuffer(bufHdr, true);
+ }
+ }
+ else
+ {
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ return did_write;
+}
+
/*
* BufferSync -- Write out all dirty buffers in the pool.
*
binaryheap *ts_heap;
int i;
int mask = BM_DIRTY;
+ pg_streaming_write *pgsw;
WritebackContext wb_context;
/* Make sure we can handle the pin inside SyncOneBuffer */
CHECKPOINT_FLUSH_ALL))))
mask |= BM_PERMANENT;
+ elog(DEBUG1, "checkpoint looking at buffers");
+
/*
* Loop over all buffers, and mark the ones that need to be written with
* BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
WritebackContextInit(&wb_context, &checkpoint_flush_after);
+ pgsw = pg_streaming_write_alloc(128, &wb_context, buffer_sync_complete);
+
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
+ elog(DEBUG1, "checkpoint predicts to write %u buffers", num_to_scan);
+
/*
* Sort buffers that need to be written to reduce the likelihood of random
* IO. The sorting is also important for the implementation of balancing
num_spaces = 0;
+ elog(DEBUG1, "checkpoint done sorting");
+
/*
* Allocate progress status for each tablespace with buffers that need to
* be flushed. This requires the to-be-flushed array to be sorted.
binaryheap_build(ts_heap);
+ elog(DEBUG1, "checkpoint done heaping");
+
/*
* Iterate through to-be-checkpointed buffers and write the ones (still)
* marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
*/
if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
{
- if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+ if (BufferSyncWriteOne(pgsw, bufHdr))
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
}
+ pg_streaming_write_wait_all(pgsw);
+ pg_streaming_write_free(pgsw);
+
/* issue all pending flushes */
IssuePendingWritebacks(&wb_context);
long new_strategy_delta;
uint32 new_recent_alloc;
+ pg_streaming_write *pgsw;
+
/*
* Find out where the freelist clock sweep currently is, and how many
* buffer allocations have happened since our last call.
upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
}
+ pgsw = pg_streaming_write_alloc(128, wb_context, buffer_sync_complete);
+
/*
* Now write out dirty reusable buffers, working forward from the
* next_to_clean point, until we have lapped the strategy scan, or cleaned
/* Execute the LRU scan */
while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
{
- int sync_state = SyncOneBuffer(next_to_clean, true,
- wb_context);
+ int sync_state =
+ BgBufferSyncWriteOne(next_to_clean, true, pgsw);
if (++next_to_clean >= NBuffers)
{
reusable_buffers++;
}
+ pg_streaming_write_wait_all(pgsw);
+ pg_streaming_write_free(pgsw);
+
BgWriterStats.m_buf_written_clean += num_written;
#ifdef BGW_DEBUG
- * Note: caller must have done ResourceOwnerEnlargeBuffers.
+ * Note: this function calls ResourceOwnerEnlargeBuffers() itself.
*/
static int
-SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+BgBufferSyncWriteOne(int buf_id, bool skip_recently_used,
+ pg_streaming_write *pgsw)
{
BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
int result = 0;
uint32 buf_state;
- BufferTag tag;
+ LWLock *content_lock;
+ PgAioInProgress *aio;
ReservePrivateRefCountEntry();
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
* Check whether buffer needs writing.
return result;
}
- /*
- * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
- * buffer is clean by the time we've locked it.)
- */
PinBuffer_Locked(bufHdr);
- LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
-
- FlushBuffer(bufHdr, NULL);
- LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ aio = pg_streaming_write_get_io(pgsw);
- tag = bufHdr->tag;
+ content_lock = BufferDescriptorGetContentLock(bufHdr);
- UnpinBuffer(bufHdr, true);
+ /*
+ * If there are pre-existing IOs in-flight, we can't block on the
+ * content lock, as that could lead to a deadlock. Try to acquire the
+ * lock conditionally; if that fails, first wait for the outstanding
+ * IO and only then block on acquiring the lock.
+ */
+ if (pg_streaming_write_inflight(pgsw) > 0 &&
+ LWLockConditionalAcquire(content_lock, LW_SHARED))
+ {
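+ /* acquired the content lock without blocking */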
+ }
+ else
+ {
+ pg_streaming_write_wait_all(pgsw);
+ LWLockAcquire(content_lock, LW_SHARED);
+ }
- ScheduleBufferTagForWriteback(wb_context, &tag);
+ if (AsyncFlushBuffer(aio, bufHdr, NULL))
+ {
+ pg_streaming_write_write(pgsw, aio, bufHdr);
+ result |= BUF_WRITTEN;
+ }
+ else
+ {
+ LWLockRelease(content_lock);
+ UnpinBuffer(bufHdr, true);
+ }
- return result | BUF_WRITTEN;
+ return result;
}
PrintBufferLeakWarning(res->buffer);
RefCountErrors++;
}
-
}
Assert(RefCountErrors == 0);
Block bufBlock;
char *bufToWrite;
uint32 buf_state;
+ PgAioBounceBuffer *bb;
/*
* Try to start an I/O operation. If StartBufferIO returns false, then
if (buf_state & BM_PERMANENT)
XLogFlush(recptr);
+ pgBufferUsage.shared_blks_written++;
+
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing because
* buffer, other processes might be updating hint bits in it, so we must
* copy the page to private storage if we do checksumming.
*/
- bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum,
+ io_data_force_async ? &bb : NULL);
- if (track_io_timing)
- INSTR_TIME_SET_CURRENT(io_start);
+ if (!io_data_force_async)
+ {
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
- /*
- * bufToWrite is either the shared buffer or a copy, as appropriate.
- */
- smgrwrite(reln,
- buf->tag.forkNum,
- buf->tag.blockNum,
- bufToWrite,
- false);
+ /*
+ * bufToWrite is either the shared buffer or a copy, as appropriate.
+ */
+ smgrwrite(reln,
+ buf->tag.forkNum,
+ buf->tag.blockNum,
+ bufToWrite,
+ false);
- if (track_io_timing)
- {
- INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_SUBTRACT(io_time, io_start);
- pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
- INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_SUBTRACT(io_time, io_start);
+ pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
+ INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
+ }
+
+ /*
+ * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
+ * end the BM_IO_IN_PROGRESS state.
+ */
+ TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ true, 0);
}
+ else
+ {
+ PgAioInProgress *aio = pgaio_io_get();
+ uint32 buf_state;
- pgBufferUsage.shared_blks_written++;
+ buf_state = LockBufHdr(buf);
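+ /* ownership while in AIO subsystem, released when the IO completes */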
+ buf_state += BUF_REFCOUNT_ONE;
+ UnlockBufHdr(buf, buf_state);
- /*
- * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
- * end the BM_IO_IN_PROGRESS state.
- */
- TerminateBufferIO(buf, true, 0);
+ /* FIXME: improve */
+ InProgressBuf = NULL;
+
+ if (bb)
+ {
+ pgaio_assoc_bounce_buffer(aio, bb);
+ pgaio_bounce_buffer_release(bb);
+ }
+
+ pgaio_io_ref(aio, &buf->io_in_progress);
+
+ smgrstartwrite(aio,
+ reln,
+ buf->tag.forkNum,
+ buf->tag.blockNum,
+ bufToWrite,
+ BufferDescriptorGetBuffer(buf),
+ /* skipFsync = */ false,
+ /* release_lock = */ false);
+ pgaio_io_wait(aio);
+ pgaio_io_release(aio);
+ }
TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
buf->tag.blockNum,
error_context_stack = errcallback.previous;
}
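+
+/*
+ * AsyncFlushBuffer -- like FlushBuffer(), but start the write asynchronously
+ * using the passed-in AIO handle.
+ *
+ * The caller must have pinned the buffer and must hold at least a share lock
+ * on its content lock. On success the content lock ownership and an
+ * additional pin are handed over to the AIO machinery and true is returned.
+ * Returns false if somebody else already flushed the buffer.
+ */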
+static bool
+AsyncFlushBuffer(PgAioInProgress *aio, BufferDesc *buf, SMgrRelation reln)
+{
+ XLogRecPtr recptr;
+ Block bufBlock;
+ char *bufToWrite;
+ uint32 buf_state;
+ PgAioBounceBuffer *bb;
+
+ Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_SHARED));
+
+ /*
+ * Try to start an I/O operation. If StartBufferIO returns false, then
+ * someone else flushed the buffer before we could, so we need not do
+ * anything.
+ */
+ if (!StartBufferIO(buf, false))
+ return false;
+
+ /* Find smgr relation for buffer */
+ if (reln == NULL)
+ reln = smgropen(buf->tag.rnode, InvalidBackendId);
+
+ buf_state = LockBufHdr(buf);
+
+ /*
+ * Run PageGetLSN while holding header lock, since we don't have the
+ * buffer locked exclusively in all cases.
+ */
+ recptr = BufferGetLSN(buf);
+
+ /* To check if block content changes while flushing. - vadim 01/17/97 */
+ buf_state &= ~BM_JUST_DIRTIED;
+
+ /* ownership while in AIO subsystem */
+ buf_state += BUF_REFCOUNT_ONE;
+
+ UnlockBufHdr(buf, buf_state);
+
+ if (buf_state & BM_PERMANENT)
+ XLogFlush(recptr);
+
+ bufBlock = BufHdrGetBlock(buf);
+
+ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum, &bb);
+
+ /* FIXME: improve */
+ InProgressBuf = NULL;
+
+ pgBufferUsage.shared_blks_written++;
+
+ if (bb)
+ {
+ pgaio_assoc_bounce_buffer(aio, bb);
+ pgaio_bounce_buffer_release(bb);
+ }
+
+ pgaio_io_ref(aio, &buf->io_in_progress);
+
+ /*
+ * Ask the completion routine to release the lock for us. That's important
+ * for two reasons:
+ *
+ * 1) It allows other backends to move the block into a modifiable state
+ * by completing the IO, avoiding some deadlock risks. Otherwise this
+ * process would need to ensure the IO is completed before being
+ * allowed to sleep.
+ *
+ * 2) For processes doing lots of streaming writes (e.g. checkpointer) the
+ * lwlock ownership management turns out to be very expensive, because
+ * lwlocks for locks acquired earlier are also likely to be released
+ * earlier, leading to held_lwlocks needing to be shifted around.
+ *
+ * This is safe because we only release the lock ownership once the AIO
+ * subsystem successfully started tracking the IO.
+ */
+
+ smgrstartwrite(aio, reln,
+ buf->tag.forkNum, buf->tag.blockNum,
+ bufToWrite,
+ BufferDescriptorGetBuffer(buf),
+ /* skipFsync = */ false,
+ /* release_lock = */ true);
+
+ /*
+ * XXX: The lock ownership release should probably be moved into the AIO
+ * layer, so we correctly handle errors happening during IO submission.
+ */
+ LWLockReleaseOwnership(BufferDescriptorGetContentLock(buf));
+ RESUME_INTERRUPTS();
+
+ return true;
+}
+
/*
* RelationGetNumberOfBlocksInFork
* Determines the current number of pages in the specified relation fork.
for (;;)
{
uint32 buf_state;
+ PgAioIoRef aio_ref;
/*
* It may not be necessary to acquire the spinlock to check the flag
* play it safe.
*/
buf_state = LockBufHdr(buf);
+ aio_ref = buf->io_in_progress;
UnlockBufHdr(buf, buf_state);
if (!(buf_state & BM_IO_IN_PROGRESS))
break;
+
+ if (pgaio_io_ref_valid(&aio_ref))
+ {
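+ /* IO is being performed asynchronously; wait for it through the AIO subsystem */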
+ pgaio_io_wait_ref(&aio_ref, true);
+ ConditionVariablePrepareToSleep(cv);
+ continue;
+ }
+
ConditionVariableSleep(cv, WAIT_EVENT_BUFFILE_WAITIO);
}
ConditionVariableCancelSleep();
return true;
}
+static void
+TerminateBufferIO(BufferDesc *bufHdr, bool local, bool syncio,
+ bool clear_dirty, uint32 set_flag_bits)
+{
+ if (local)
+ {
+ /* Only need to adjust flags */
+ uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ buf_state |= set_flag_bits;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ else
+ {
+ TerminateSharedBufferIO(bufHdr, syncio, clear_dirty, set_flag_bits);
+ }
+}
+
/*
- * TerminateBufferIO: release a buffer we were doing I/O on
+ * TerminateSharedBufferIO: release a buffer we were doing I/O on
* (Assumptions)
* My process is executing IO for the buffer
* BM_IO_IN_PROGRESS bit is set for the buffer
* be 0, or BM_VALID if we just finished reading in the page.
*/
static void
-TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+TerminateSharedBufferIO(BufferDesc *buf, bool syncio, bool clear_dirty, uint32 set_flag_bits)
{
uint32 buf_state;
- Assert(buf == InProgressBuf);
+ if (syncio)
+ Assert(buf == InProgressBuf);
buf_state = LockBufHdr(buf);
buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
buf_state |= set_flag_bits;
+
+ if (!syncio)
+ {
+ buf_state -= BUF_REFCOUNT_ONE;
+ pgaio_io_ref_clear(&buf->io_in_progress);
+ }
+
UnlockBufHdr(buf, buf_state);
- InProgressBuf = NULL;
+ if (syncio)
+ InProgressBuf = NULL;
ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
+
+ /* Support LockBufferForCleanup() */
+ if (!syncio && buf_state & BM_PIN_COUNT_WAITER)
+ {
+ /*
+ * Acquire the buffer header lock, re-check that there's a waiter.
+ * Another backend could have unpinned this buffer, and already
+ * woken up the waiter. There's no danger of the buffer being
+ * replaced after we unpinned it above, as it's pinned by the
+ * waiter.
+ */
+ buf_state = LockBufHdr(buf);
+
+ if ((buf_state & BM_PIN_COUNT_WAITER) &&
+ BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* we just released the last pin other than the waiter's */
+ int wait_backend_pid = buf->wait_backend_pid;
+
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(buf, buf_state);
+ ProcSendSignal(wait_backend_pid);
+ }
+ else
+ UnlockBufHdr(buf, buf_state);
+ }
}
/*
{
BufferDesc *buf = InProgressBuf;
+ pgaio_at_abort();
+
if (buf)
{
uint32 buf_state;
pfree(path);
}
}
- TerminateBufferIO(buf, false, BM_IO_ERROR);
+ TerminateSharedBufferIO(buf, /* syncio = */ true, /* clear_dirty = */ false, BM_IO_ERROR);
}
}
{
PendingWriteback *pending;
+ if (io_data_direct)
+ return;
+
/*
* Add buffer to the pending writeback array, unless writeback control is
* disabled.
}
context->nr_pending = 0;
+
+ if (i > 0)
+ pgaio_submit_pending(true);
}
bool found;
uint32 buf_state;
+ Assert(blockNum != P_NEW);
+
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
/* Initialize local buffers if first request in this session */
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/mem.h"
+#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
* We compile all alternatives that are supported on the current platform,
* to find portability problems more easily.
*/
+#if USE_LIBURING
+ {
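+ /* hand the flush-range hint to the AIO subsystem rather than issuing it synchronously */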
+ PgAioInProgress *aio = pgaio_io_get();
+
+ pgaio_io_start_flush_range(aio, fd, offset, nbytes);
+ pgaio_io_release(aio);
+ return;
+ }
+#endif
#if defined(HAVE_SYNC_FILE_RANGE)
{
int rc;
fd = open(fileName, fileFlags, fileMode);
if (fd >= 0)
+ {
+ //elog(DEBUG1, "opening file %s fd %d", fileName, fd);
+
return fd; /* success! */
+ }
if (errno == EMFILE || errno == ENFILE)
{
vfdP = &VfdCache[file];
+ pgaio_submit_pending(false);
+
/*
* Close the file. We aren't expecting this to fail; if it does, better
* to leak the FD than to mess up our internal state.
if (!FileIsNotOpen(file))
{
+ pgaio_submit_pending(false);
+
/* close the file */
if (close(vfdP->fd) != 0)
{
if (nbytes <= 0)
return;
+ /* no point in a writeback hint for O_DIRECT files, they bypass the page cache */
+ if (VfdCache[file].fileFlags & O_DIRECT)
+ return;
+
returnCode = FileAccess(file);
if (returnCode < 0)
return;
return returnCode;
}
+bool
+FileStartRead(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, int mode)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileStartRead: %d (%s) " INT64_FORMAT " %d %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return false;
+
+ vfdP = &VfdCache[file];
+
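+ /*
+ * The read is handed to the AIO subsystem here; any still-pending request
+ * is flushed before the underlying fd is closed (see FileClose()).
+ */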
+ pgaio_io_start_read_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, mode);
+
+ return true;
+}
+
int
FileWrite(File file, char *buffer, int amount, off_t offset,
uint32 wait_event_info)
return returnCode;
}
+bool
+FileStartWrite(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const AioBufferTag *tag, int bufid, bool release_lock)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileStartWrite: %d (%s) " INT64_FORMAT " %d %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return false;
+
+ vfdP = &VfdCache[file];
+
+ pgaio_io_start_write_buffer(io, tag, vfdP->fd, offset, amount, buffer, bufid, release_lock);
+
+ return true;
+}
+
int
FileSync(File file, uint32 wait_event_info)
{
int
FileGetRawDesc(File file)
{
+ int returnCode;
+
Assert(FileIsValid(file));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
return VfdCache[file].fd;
}
result = closedir(desc->desc.dir);
break;
case AllocateDescRawFD:
+ pgaio_submit_pending(false);
result = close(desc->desc.fd);
break;
default:
/* Only get here if someone passes us a file not in allocatedDescs */
elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
+ pgaio_submit_pending(false);
+
return close(fd);
}
include $(top_builddir)/src/Makefile.global
OBJS = \
+ aio.o \
+ aio_util.o \
barrier.o \
dsm.o \
dsm_impl.o \
sinvaladt.o \
standby.o
+aio.o: override CPPFLAGS += $(LIBURING_CFLAGS)
+aio.bc: override CPPFLAGS += $(LIBURING_CFLAGS)
+
include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+/*
+ * Big picture changes:
+ * - backend local recycleable IOs
+ * - merging of IOs when submitting individual IOs, not when submitting all pending IOs
+ * - reorganization of shared callback system, so there's an underlying
+ * "write" operation that's used both by WAL, generic, ... writes.
+ * - Consider not exposing PgAioInProgress* at all, instead expose a PgAioReference { uint32 io; uint64 generation; }
+ * which would make it a lot less problematic to immediate reuse IOs.
+ * - Shrink size of PgAioInProgress
+ * - refcount bounce buffers / redesign
+ * - get rid of the current backpressure logic
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <liburing.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "fmgr.h"
+#include "funcapi.h"
+#include "lib/ilist.h"
+#include "lib/stringinfo.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "nodes/memnodes.h"
+#include "pgstat.h"
+#include "access/xlog_internal.h"
+#include "storage/aio.h"
+#include "storage/buf.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+#include "utils/fmgrprotos.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+
+#define PGAIO_VERBOSE
+
+
+/*
+ * FIXME: This is just so large because merging happens when submitting
+ * pending requests, rather than when staging them.
+ */
+#define PGAIO_SUBMIT_BATCH_SIZE 256
+#define PGAIO_MAX_LOCAL_REAPED 128
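+/* maximum number of neighboring IOs that may be merged into one submission */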
+#define PGAIO_MAX_COMBINE 16
+
+#define PGAIO_NUM_CONTEXTS 8
+
+/*
+ * The type of AIO.
+ *
+ * To keep PgAioInProgress small, try to tell the compiler to use only the
+ * minimal space for the enum. We could alternatively just use a uint8, but
+ * then we'd need
+ * casts in more places...
+ */
+typedef enum
+#ifdef pg_attribute_packed
+pg_attribute_packed()
+#endif
+ PgAioAction
+{
+ /* intentionally the zero value, to help catch zeroed memory etc */
+ PGAIO_INVALID = 0,
+
+ PGAIO_NOP,
+ /* FIXME: unify */
+ PGAIO_FSYNC,
+ PGAIO_FSYNC_WAL,
+ PGAIO_FLUSH_RANGE,
+
+ PGAIO_READ_BUFFER,
+ /* FIXME: unify */
+ PGAIO_WRITE_BUFFER,
+ PGAIO_WRITE_WAL,
+ PGAIO_WRITE_GENERIC,
+} PgAioAction;
+
+typedef enum PgAioInProgressFlags
+{
+ /* request in the ->unused list */
+ PGAIOIP_UNUSED = 1 << 0,
+
+ /* request handed out to backend code, but no IO operation started yet */
+ PGAIOIP_IDLE = 1 << 1,
+
+ /* an IO operation has been started and has not finished yet */
+ PGAIOIP_IN_PROGRESS = 1 << 2,
+
+ /* request in the owning backend's pending list, not yet submitted */
+ PGAIOIP_PENDING = 1 << 3,
+
+ /* request in kernel */
+ PGAIOIP_INFLIGHT = 1 << 4,
+
+ /* request reaped */
+ PGAIOIP_REAPED = 1 << 5,
+
+ /* shared completion callback was called */
+ PGAIOIP_SHARED_CALLBACK_CALLED = 1 << 6,
+
+ /* completed */
+ PGAIOIP_DONE = 1 << 7,
+
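+ /* done, and the completion was processed by a backend other than the owner */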
+ PGAIOIP_FOREIGN_DONE = 1 << 8,
+
+ /* IO is merged with others */
+ PGAIOIP_MERGE = 1 << 9,
+
+ PGAIOIP_RETRY = 1 << 10,
+
+ /* request failed completely */
+ PGAIOIP_HARD_FAILURE = 1 << 11,
+
+ /* request failed partly, e.g. a short write */
+ PGAIOIP_SOFT_FAILURE = 1 << 12,
+
+ PGAIOIP_SHARED_FAILED = 1 << 13,
+
+ /* local completion callback was called */
+ PGAIOIP_LOCAL_CALLBACK_CALLED = 1 << 14,
+
+} PgAioInProgressFlags;
+
+/* IO completion callback */
+typedef bool (*PgAioCompletedCB)(PgAioInProgress *io);
+
+typedef uint16 PgAioIPFlags;
+
+struct PgAioInProgress
+{
+ /* PgAioAction, indexes PgAioCompletionCallbacks */
+ PgAioAction type;
+
+ PgAioIPFlags flags;
+
+ bool user_referenced;
+ bool system_referenced;
+
+ /* which AIO ring is this entry active for */
+ uint8 ring;
+
+ /* index into allProcs, or PG_UINT32_MAX for process local IO */
+ uint32 owner_id;
+
+ /* the IOs result, depends on operation. E.g. the length of a read */
+ int32 result;
+
+ /*
+ * Single callback that can be registered on an IO to be called upon
+ * completion. Note that this is reset whenever an IO is recycled.
+ */
+ PgAioOnCompletionLocalContext *on_completion_local;
+
+ /*
+ * Membership in one of
+ * PgAioCtl->unused,
+ * PgAioPerBackend->unused,
+ * PgAioPerBackend->outstanding,
+ * PgAioPerBackend->issued,
+ */
+ dlist_node owner_node;
+
+ /*
+ * Membership in
+ * PgAioPerBackend->pending,
+ * PgAioPerBackend->reaped,
+ * local_recycle_requests
+ * PgAioPerBackend->foreign_completed,
+ * PgAioPerBackend->local_completed
+ */
+ dlist_node io_node;
+
+ ConditionVariable cv;
+
+ /* index into context->iovec, or -1 */
+ int32 used_iovec;
+
+ PgAioBounceBuffer *bb;
+
+ PgAioInProgress *merge_with;
+
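+ /*
+ * Incremented whenever this entry is recycled. Holders of a PgAioIoRef
+ * compare this with the generation stored in the reference to detect
+ * that the IO they were interested in has completed and been reused.
+ */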
+ uint64 generation;
+
+ /*
+ * NB: fds in here may *not* be relied upon for re-issuing requests
+ * (e.g. for partial reads/writes) - the fd might be from another
+ * process, or have been closed since. For IOs that are still waiting to
+ * be issued this is not a problem, only because the pending queue is
+ * flushed whenever an fd is closed.
+ */
+ union {
+ struct
+ {
+ int fd;
+ bool barrier;
+ bool datasync;
+ } fsync;
+
+ struct
+ {
+ int fd;
+ bool barrier;
+ bool datasync;
+ uint32 flush_no;
+ } fsync_wal;
+
+ struct
+ {
+ int fd;
+ uint64 nbytes;
+ uint32 offset;
+ } flush_range;
+
+ struct
+ {
+ uint32 offset;
+ uint32 nbytes;
+ uint32 already_done;
+ int fd;
+ char *bufdata;
+ Buffer buf;
+ AioBufferTag tag;
+ int mode;
+ } read_buffer;
+
+ struct
+ {
+ uint32 offset;
+ uint32 nbytes;
+ uint32 already_done;
+ int fd;
+ char *bufdata;
+ Buffer buf;
+ bool release_lock;
+ AioBufferTag tag;
+ } write_buffer;
+
+ struct
+ {
+ int fd;
+ uint32 offset;
+ uint32 nbytes;
+ uint32 already_done;
+ char *bufdata;
+ bool no_reorder;
+ uint32 write_no;
+ } write_wal;
+
+ struct
+ {
+ int fd;
+ uint64 offset;
+ uint32 nbytes;
+ uint32 already_done;
+ char *bufdata;
+ bool no_reorder;
+ } write_generic;
+ } d;
+};
+
+/* typedef in header */
+struct PgAioBounceBuffer
+{
+ pg_atomic_uint32 refcount;
+ dlist_node node;
+ char *buffer;
+};
+
+/*
+ * An iovec that can represent the biggest possible iovec (due to combining)
+ * we may need for a single IO submission.
+ */
+typedef struct PgAioIovec
+{
+ slist_node node;
+ struct iovec iovec[PGAIO_MAX_COMBINE];
+} PgAioIovec;
+
+
+/*
+ * XXX: Really want a proclist like structure that works with integer
+ * offsets. Given the limited number of IOs ever existing, using full pointers
+ * is completely unnecessary.
+ */
+
+typedef struct PgAioPerBackend
+{
+ uint32 last_context;
+
+ /*
+ * Local unused IOs. There's only a limited number of these. Used to
+ * reduce overhead of the central unused list.
+ *
+ * FIXME: Actually use.
+ *
+ * Could be singly linked list.
+ *
+ * PgAioInProgress->owner_node
+ */
+ dlist_head unused;
+ uint32 unused_count;
+
+ /*
+ * IOs handed out to code within the backend.
+ *
+ * PgAioInProgress->owner_node
+ */
+ dlist_head outstanding;
+ uint32 outstanding_count;
+
+ /*
+ * Requests waiting to be issued to the kernel. They are submitted to the
+ * kernel in batches, for efficiency (local merging of IOs, and better
+ * kernel side queue processing).
+ *
+ * Could be singly linked list.
+ *
+ * PgAioInProgress->io_node
+ */
+ dlist_head pending;
+ uint32 pending_count;
+
+ /*
+ * Requests issued by this backend that have not yet completed (but may be
+ * foreign_completed) and that are still referenced by backend code (see
+ * issued_abandoned for those that no longer are).
+ *
+ * PgAioInProgress->owner_node
+ */
+ dlist_head issued;
+ uint32 issued_count;
+
+ /*
+ * Requests issued by this backend that have not yet completed (but may be
+ * foreign_completed) and that are no longer referenced by backend code (see
+ * issued for those that still are).
+ *
+ * PgAioInProgress->owner_node
+ */
+ dlist_head issued_abandoned;
+ uint32 issued_abandoned_count;
+
+ /*
+ * Number of PgAioInProgress issued to the kernel ring by this backend
+ * that have not yet been reaped (they may already have completed in the
+ * kernel without their completions having been processed).
+ */
+ pg_atomic_uint32 inflight_count;
+
+ /*
+ * Requests where we've received a kernel completion, but haven't yet
+ * processed them. This is needed to handle failing callbacks.
+ *
+ * Could be singly linked list.
+ *
+ * PgAioInProgress->io_node
+ */
+ dlist_head reaped;
+
+ /*
+ * IOs that were completed, but not yet recycled.
+ *
+ * PgAioInProgress->io_node
+ */
+ dlist_head local_completed;
+ uint32 local_completed_count;
+
+ /*
+ * IOs where the completion was received in another backend.
+ *
+ * Could be singly linked list.
+ *
+ * PgAioInProgress->io_node
+ */
+ slock_t foreign_completed_lock;
+ uint32 foreign_completed_count;
+ dlist_head foreign_completed;
+
+ /*
+ * Stats.
+ */
+ uint64 executed_total_count; /* un-merged */
+ uint64 issued_total_count; /* merged */
+ uint64 submissions_total_count; /* number of submission syscalls */
+ uint64 foreign_completed_total_count;
+ uint64 retry_total_count;
+
+} PgAioPerBackend;
+
+typedef struct PgAioContext
+{
+ LWLock submission_lock;
+ LWLock completion_lock;
+
+ struct io_uring io_uring_ring;
+
+ /*
+ * For many versions of io_uring, iovecs need to be in shared memory. The
+ * lists of available iovecs are split between the submission and
+ * completion locks - that avoids additional lock acquisitions in
+ * the common cases.
+ */
+ PgAioIovec *iovecs;
+
+ /* locked by submission lock */
+ slist_head unused_iovecs;
+ uint32 unused_iovecs_count;
+
+ /* locked by completion lock */
+ slist_head reaped_iovecs;
+ uint32 reaped_iovecs_count;
+
+ /* XXX: probably worth padding to a cacheline boundary here */
+} PgAioContext;
+
+typedef struct PgAioCtl
+{
+ /* PgAioInProgress that are not used */
+ dlist_head unused_ios;
+
+ /*
+ * Number of PgAioInProgress entries that are in use. This includes pending
+ * requests, as well as requests actually issued to the queue.
+ *
+ * Protected by SharedAIOCtlLock.
+ */
+ uint32 used_count;
+
+ /*
+ * Protected by SharedAIOCtlLock.
+ */
+ dlist_head reaped_uncompleted;
+
+ PgAioBounceBuffer *bounce_buffers;
+ dlist_head unused_bounce_buffers;
+ uint32 unused_bounce_buffers_count;
+
+ int backend_state_count;
+ PgAioPerBackend *backend_state;
+
+ uint32 num_contexts;
+ PgAioContext *contexts;
+
+ PgAioInProgress in_progress_io[FLEXIBLE_ARRAY_MEMBER];
+} PgAioCtl;
+
+/* general pgaio helper functions */
+static void pgaio_complete_ios(bool in_error);
+static void pgaio_apply_backend_limit(void);
+static void pgaio_prepare_io(PgAioInProgress *io, PgAioAction action);
+static void pgaio_finish_io(PgAioInProgress *io);
+static void pgaio_bounce_buffer_release_internal(PgAioBounceBuffer *bb, bool holding_lock, bool release_resowner);
+static void pgaio_io_ref_internal(PgAioInProgress *io, PgAioIoRef *ref);
+
+/* io_uring related functions */
+static int pgaio_uring_submit(int max_submit, bool drain);
+static int pgaio_uring_drain(PgAioContext *context);
+static void pgaio_uring_wait_one(PgAioContext *context, PgAioInProgress *io, uint64 generation, uint32 wait_event_info);
+
+static void pgaio_uring_sq_from_io(PgAioContext *context, PgAioInProgress *io, struct io_uring_sqe *sqe);
+static void pgaio_uring_io_from_cqe(PgAioContext *context, struct io_uring_cqe *cqe);
+static void pgaio_uring_iovec_transfer(PgAioContext *context);
+
+static int __sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+ unsigned flags, sigset_t *sig);
+
+/* io completions */
+static bool pgaio_complete_nop(PgAioInProgress *io);
+static bool pgaio_complete_fsync(PgAioInProgress *io);
+static bool pgaio_complete_fsync_wal(PgAioInProgress *io);
+static bool pgaio_complete_flush_range(PgAioInProgress *io);
+static bool pgaio_complete_read_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_buffer(PgAioInProgress *io);
+static bool pgaio_complete_write_wal(PgAioInProgress *io);
+static bool pgaio_complete_write_generic(PgAioInProgress *io);
+
+
+static MemoryContext aio_retry_context;
+
+/*
+ * To support EXEC_BACKEND environments, where we cannot rely on callback
+ * addresses being equivalent across processes, completion actions are just
+ * indices into a process local array of callbacks, indexed by the type of
+ * action. Also makes the shared memory entries a bit smaller, but that's not
+ * a huge win.
+ */
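+/*
+ * The shared completion callback for a reaped IO is looked up as
+ * completion_callbacks[io->type] and invoked in pgaio_complete_ios().
+ */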
+static const PgAioCompletedCB completion_callbacks[] =
+{
+ [PGAIO_NOP] = pgaio_complete_nop,
+ [PGAIO_FSYNC] = pgaio_complete_fsync,
+ [PGAIO_FSYNC_WAL] = pgaio_complete_fsync_wal,
+ [PGAIO_FLUSH_RANGE] = pgaio_complete_flush_range,
+ [PGAIO_READ_BUFFER] = pgaio_complete_read_buffer,
+ [PGAIO_WRITE_BUFFER] = pgaio_complete_write_buffer,
+ [PGAIO_WRITE_WAL] = pgaio_complete_write_wal,
+ [PGAIO_WRITE_GENERIC] = pgaio_complete_write_generic,
+};
+
+
+/* (future) GUC controlling global MAX number of in-progress IO entries */
+/* FIXME: find a good naming pattern */
+extern int max_aio_in_progress;
+/* FIXME: this is per context right now */
+extern int max_aio_in_flight;
+extern int max_aio_bounce_buffers;
+
+/* max per backend concurrency */
+extern int io_max_concurrency;
+
+int max_aio_in_progress = 32768; /* XXX: Multiple of MaxBackends instead? */
+int max_aio_in_flight = 4096;
+int max_aio_bounce_buffers = 1024;
+int io_max_concurrency = 128;
+
+/* global list of in-progress IO */
+static PgAioCtl *aio_ctl;
+
+/* current backend's per-backend-state */
+static PgAioPerBackend *my_aio;
+static int my_aio_id;
+
+/* FIXME: move into PgAioPerBackend / subsume into ->reaped */
+static dlist_head local_recycle_requests;
+
+
+/* io_uring local state */
+struct io_uring local_ring;
+
+static Size
+AioCtlShmemSize(void)
+{
+ Size sz;
+
+ /* aio_ctl itself */
+ sz = offsetof(PgAioCtl, in_progress_io);
+
+ /* ios */
+ sz = add_size(sz, mul_size(max_aio_in_progress, sizeof(PgAioInProgress)));
+
+ return sz;
+}
+
+static Size
+AioCtlBackendShmemSize(void)
+{
+ uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+ return mul_size(TotalProcs, sizeof(PgAioPerBackend));
+}
+
+static Size
+AioBounceShmemSize(void)
+{
+ Size sz;
+
+ /* PgAioBounceBuffer itself */
+ sz = mul_size(sizeof(PgAioBounceBuffer), max_aio_bounce_buffers);
+
+ /* and the associated buffer */
+ sz = add_size(sz,
+ mul_size(BLCKSZ, add_size(max_aio_bounce_buffers, 1)));
+
+ return sz;
+}
+
+static Size
+AioContextShmemSize(void)
+{
+ return mul_size(PGAIO_NUM_CONTEXTS, sizeof(PgAioContext));
+}
+
+static Size
+AioContextIovecsShmemSize(void)
+{
+ return mul_size(PGAIO_NUM_CONTEXTS,
+ mul_size(sizeof(PgAioIovec), max_aio_in_flight));
+}
+
+Size
+AioShmemSize(void)
+{
+ Size sz = 0;
+
+ sz = add_size(sz, AioCtlShmemSize());
+ sz = add_size(sz, AioBounceShmemSize());
+ sz = add_size(sz, AioCtlBackendShmemSize());
+ sz = add_size(sz, AioContextShmemSize());
+ sz = add_size(sz, AioContextIovecsShmemSize());
+
+ return sz;
+}
+
+void
+AioShmemInit(void)
+{
+ bool found;
+ uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS;
+
+ aio_ctl = (PgAioCtl *)
+ ShmemInitStruct("PgAio", AioCtlShmemSize(), &found);
+
+ if (!found)
+ {
+ memset(aio_ctl, 0, AioCtlShmemSize());
+
+ dlist_init(&aio_ctl->unused_ios);
+ dlist_init(&aio_ctl->reaped_uncompleted);
+
+ for (int i = 0; i < max_aio_in_progress; i++)
+ {
+ PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+
+ ConditionVariableInit(&io->cv);
+ dlist_push_tail(&aio_ctl->unused_ios, &io->owner_node);
+ io->flags = PGAIOIP_UNUSED;
+ io->system_referenced = true;
+ io->generation = 1;
+ }
+
+ aio_ctl->backend_state_count = TotalProcs;
+ aio_ctl->backend_state = (PgAioPerBackend *)
+ ShmemInitStruct("PgAioBackend", AioCtlBackendShmemSize(), &found);
+ memset(aio_ctl->backend_state, 0, AioCtlBackendShmemSize());
+
+ for (int procno = 0; procno < TotalProcs; procno++)
+ {
+ PgAioPerBackend *bs = &aio_ctl->backend_state[procno];
+
+ dlist_init(&bs->unused);
+ dlist_init(&bs->outstanding);
+ dlist_init(&bs->pending);
+ dlist_init(&bs->issued);
+ dlist_init(&bs->issued_abandoned);
+ pg_atomic_init_u32(&bs->inflight_count, 0);
+ dlist_init(&bs->reaped);
+
+ dlist_init(&bs->foreign_completed);
+ SpinLockInit(&bs->foreign_completed_lock);
+ }
+
+ {
+ char *p;
+ char *blocks;
+
+ dlist_init(&aio_ctl->unused_bounce_buffers);
+ aio_ctl->bounce_buffers =
+ ShmemInitStruct("PgAioBounceBuffers",
+ sizeof(PgAioBounceBuffer) * max_aio_bounce_buffers,
+ &found);
+ Assert(!found);
+
+ p = ShmemInitStruct("PgAioBounceBufferBlocks",
+ BLCKSZ * (max_aio_bounce_buffers + 1),
+ &found);
+ Assert(!found);
+ blocks = (char *) TYPEALIGN(BLCKSZ, (uintptr_t) p);
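+ /* an extra block was allocated above so the start can be aligned to BLCKSZ */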
+
+ for (int i = 0; i < max_aio_bounce_buffers; i++)
+ {
+ PgAioBounceBuffer *bb = &aio_ctl->bounce_buffers[i];
+
+ bb->buffer = blocks + i * BLCKSZ;
+ memset(bb->buffer, 0, BLCKSZ);
+ pg_atomic_init_u32(&bb->refcount, 0);
+ dlist_push_tail(&aio_ctl->unused_bounce_buffers, &bb->node);
+ aio_ctl->unused_bounce_buffers_count++;
+ }
+ }
+
+ {
+ PgAioIovec *iovecs;
+
+ aio_ctl->num_contexts = PGAIO_NUM_CONTEXTS;
+ aio_ctl->contexts = ShmemInitStruct("PgAioContexts", AioContextShmemSize(), &found);
+ Assert(!found);
+
+ iovecs = (PgAioIovec *)
+ ShmemInitStruct("PgAioContextsIovecs", AioContextIovecsShmemSize(), &found);
+ Assert(!found);
+ memset(iovecs, 0, AioContextIovecsShmemSize());
+
+ for (int contextno = 0; contextno < aio_ctl->num_contexts; contextno++)
+ {
+ PgAioContext *context = &aio_ctl->contexts[contextno];
+ int ret;
+
+ LWLockInitialize(&context->submission_lock, LWTRANCHE_AIO_CONTEXT_SUBMISSION);
+ LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_CONTEXT_COMPLETION);
+
+ slist_init(&context->unused_iovecs);
+ slist_init(&context->reaped_iovecs);
+
+ context->iovecs = iovecs;
+ iovecs += max_aio_in_flight;
+
+ for (uint32 i = 0; i < max_aio_in_flight; i++)
+ {
+ slist_push_head(&context->unused_iovecs, &context->iovecs[i].node);
+ context->unused_iovecs_count++;
+ }
+
+ /*
+ * XXX: Probably worth sharing the WQ between the different
+ * rings, when supported by the kernel. Could also cause
+ * additional contention, I guess?
+ */
+ if (!AcquireExternalFD())
+ elog(ERROR, "could not acquire external file descriptor for io_uring: %m");
+ ret = io_uring_queue_init(max_aio_in_flight, &context->io_uring_ring, 0);
+ if (ret < 0)
+ elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+ }
+ }
+
+ }
+}
+
+void
+pgaio_postmaster_init(void)
+{
+ /* FIXME: should also be allowed to use AIO */
+ dlist_init(&local_recycle_requests);
+
+ // XXX: could create a local queue here.
+
+ /*
+ * Need to be allowed to re-open files during retries. Those can happen,
+ * e.g. when fsyncing WAL, within a critical section. Reopening files
+ * currently requires memory. So create a context with small reservation
+ * that's allowed to be used within a critical section.
+ */
+ aio_retry_context = AllocSetContextCreate(TopMemoryContext,
+ "aio retry context",
+ 1024,
+ 1024,
+ 1024);
+ MemoryContextAllowInCriticalSection(aio_retry_context, true);
+}
+
+void
+pgaio_postmaster_child_init_local(void)
+{
+ /*
+ * Set up this process' local io_uring instance.
+ */
+ {
+ int ret;
+
+ ret = io_uring_queue_init(32, &local_ring, 0);
+ if (ret < 0)
+ {
+ elog(ERROR, "io_uring_queue_init failed: %s", strerror(-ret));
+ }
+ }
+}
+
+static void
+pgaio_postmaster_before_child_exit(int code, Datum arg)
+{
+ elog(DEBUG2, "aio before shmem exit: start");
+
+ /*
+ * Need to wait for in-progress IOs initiated by this backend to
+ * finish. Some operating systems, like Linux w/ io_uring, cancel IOs that
+ * are still in progress when exiting. Others don't provide access to the
+ * results of such IOs.
+ */
+ while (!dlist_is_empty(&my_aio->issued))
+ {
+ PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+ pgaio_io_release(io);
+ }
+
+ Assert(my_aio->issued_count == 0);
+ Assert(dlist_is_empty(&my_aio->issued));
+
+ while (!dlist_is_empty(&my_aio->issued_abandoned))
+ {
+ PgAioInProgress *io = NULL;
+ PgAioIoRef ref;
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ if (!dlist_is_empty(&my_aio->issued_abandoned))
+ {
+ io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued_abandoned);
+ pgaio_io_ref_internal(io, &ref);
+ }
+ LWLockRelease(SharedAIOCtlLock);
+
+ if (!io)
+ {
+ elog(LOG, "skipped exit wait for abandoned IO");
+ break;
+ }
+
+ elog(LOG, "exit wait for abandoned IO %zu", io - aio_ctl->in_progress_io);
+ pgaio_io_print(io, NULL);
+ pgaio_io_wait_ref(&ref, false);
+ }
+
+ elog(DEBUG2, "aio before shmem exit: end");
+}
+
+static void
+pgaio_postmaster_child_exit(int code, Datum arg)
+{
+ /* FIXME: handle unused */
+ Assert(my_aio->outstanding_count == 0);
+ Assert(dlist_is_empty(&my_aio->outstanding));
+
+ Assert(my_aio->pending_count == 0);
+ Assert(dlist_is_empty(&my_aio->pending));
+
+ Assert(my_aio->issued_count == 0);
+ Assert(dlist_is_empty(&my_aio->issued));
+
+ Assert(my_aio->issued_abandoned_count == 0);
+ Assert(dlist_is_empty(&my_aio->issued_abandoned));
+
+ Assert(pg_atomic_read_u32(&my_aio->inflight_count) == 0);
+
+ Assert(dlist_is_empty(&my_aio->reaped));
+
+ Assert(my_aio->local_completed_count == 0);
+ Assert(dlist_is_empty(&my_aio->local_completed));
+
+ Assert(my_aio->foreign_completed_count == 0);
+ Assert(dlist_is_empty(&my_aio->foreign_completed));
+}
+
+void
+pgaio_postmaster_child_init(void)
+{
+ /* no locking needed here, only affects this process */
+ for (int i = 0; i < aio_ctl->num_contexts; i++)
+ io_uring_ring_dontfork(&aio_ctl->contexts[i].io_uring_ring);
+
+ my_aio_id = MyProc->pgprocno;
+ my_aio = &aio_ctl->backend_state[my_aio_id];
+
+ dlist_init(&local_recycle_requests);
+
+ before_shmem_exit(pgaio_postmaster_before_child_exit, 0);
+ on_shmem_exit(pgaio_postmaster_child_exit, 0);
+
+ Assert(my_aio->unused_count == 0);
+ Assert(my_aio->outstanding_count == 0);
+ Assert(my_aio->issued_count == 0);
+ Assert(my_aio->issued_abandoned_count == 0);
+ Assert(my_aio->pending_count == 0);
+ Assert(my_aio->local_completed_count == 0);
+ Assert(my_aio->foreign_completed_count == 0);
+
+ /* try to spread out a bit from the start */
+ my_aio->last_context = MyProcPid % PGAIO_NUM_CONTEXTS;
+
+ my_aio->executed_total_count = 0;
+ my_aio->issued_total_count = 0;
+ my_aio->submissions_total_count = 0;
+ my_aio->foreign_completed_total_count = 0;
+ my_aio->retry_total_count = 0;
+}
+
+void
+pgaio_at_abort(void)
+{
+ pgaio_complete_ios(/* in_error = */ true);
+
+ pgaio_submit_pending(false);
+
+ while (!dlist_is_empty(&my_aio->outstanding))
+ {
+ PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+ pgaio_io_release(io);
+ }
+
+ while (!dlist_is_empty(&my_aio->issued))
+ {
+ PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+ pgaio_io_release(io);
+ }
+}
+
+void
+pgaio_at_commit(void)
+{
+ Assert(dlist_is_empty(&local_recycle_requests));
+
+ if (my_aio->pending_count != 0)
+ {
+ elog(WARNING, "unsubmitted IOs %d", my_aio->pending_count);
+ pgaio_submit_pending(false);
+ }
+
+ while (!dlist_is_empty(&my_aio->outstanding))
+ {
+ PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->outstanding);
+
+ elog(WARNING, "leaked outstanding io %zu", io - aio_ctl->in_progress_io);
+
+ pgaio_io_release(io);
+ }
+
+ while (!dlist_is_empty(&my_aio->issued))
+ {
+ PgAioInProgress *io = dlist_head_element(PgAioInProgress, owner_node, &my_aio->issued);
+
+ elog(WARNING, "leaked issued io %zu", io - aio_ctl->in_progress_io);
+
+ pgaio_io_release(io);
+ }
+}
+
+static int
+pgaio_uncombine_one(PgAioInProgress *io)
+{
+ int orig_result = io->result;
+ int running_result = orig_result;
+ PgAioInProgress *cur = io;
+ PgAioInProgress *last = NULL;
+ int extracted = 0;
+
+ while (cur)
+ {
+ PgAioInProgress *next = cur->merge_with;
+
+ Assert(!(cur->flags & PGAIOIP_SHARED_CALLBACK_CALLED));
+ Assert(cur->merge_with || cur != io);
+
+ switch (cur->type)
+ {
+ case PGAIO_READ_BUFFER:
+ Assert(cur->d.read_buffer.already_done == 0);
+
+ if (orig_result < 0)
+ {
+ cur->result = io->result;
+ }
+ else if (running_result >= cur->d.read_buffer.nbytes)
+ {
+ cur->result = cur->d.read_buffer.nbytes;
+ running_result -= cur->result;
+ }
+ else if (running_result < cur->d.read_buffer.nbytes)
+ {
+ cur->result = running_result;
+ running_result = 0;
+ }
+
+ break;
+
+ case PGAIO_WRITE_BUFFER:
+ Assert(cur->d.write_buffer.already_done == 0);
+
+ if (orig_result < 0)
+ {
+ cur->result = io->result;
+ }
+ else if (running_result >= cur->d.write_buffer.nbytes)
+ {
+ cur->result = cur->d.write_buffer.nbytes;
+ running_result -= cur->result;
+ }
+ else if (running_result < cur->d.write_buffer.nbytes)
+ {
+ cur->result = running_result;
+ running_result = 0;
+ }
+ break;
+
+ case PGAIO_WRITE_WAL:
+ Assert(cur->d.write_wal.already_done == 0);
+
+ if (orig_result < 0)
+ {
+ cur->result = io->result;
+ }
+ else if (running_result >= cur->d.write_wal.nbytes)
+ {
+ cur->result = cur->d.write_wal.nbytes;
+ running_result -= cur->result;
+ }
+ else if (running_result < cur->d.write_wal.nbytes)
+ {
+ cur->result = running_result;
+ running_result = 0;
+ }
+ break;
+
+ case PGAIO_WRITE_GENERIC:
+ Assert(cur->d.write_generic.already_done == 0);
+
+ if (orig_result < 0)
+ {
+ cur->result = io->result;
+ }
+ else if (running_result >= cur->d.write_generic.nbytes)
+ {
+ cur->result = cur->d.write_generic.nbytes;
+ running_result -= cur->result;
+ }
+ else if (running_result < cur->d.write_generic.nbytes)
+ {
+ cur->result = running_result;
+ running_result = 0;
+ }
+ break;
+
+ default:
+ elog(PANIC, "merge for %d not supported yet", cur->type);
+ }
+
+ cur->merge_with = NULL;
+
+ if (last)
+ {
+ cur->flags =
+ (cur->flags & ~(PGAIOIP_INFLIGHT |
+ PGAIOIP_MERGE)) |
+ PGAIOIP_REAPED;
+
+ Assert(dlist_is_member(&my_aio->reaped, &last->io_node));
+ dlist_insert_after(&last->io_node, &cur->io_node);
+ extracted++;
+ }
+ else
+ {
+ cur->flags &= ~PGAIOIP_MERGE;
+ }
+
+ last = cur;
+ cur = next;
+ }
+
+ return extracted;
+}
+
+static void
+pgaio_uncombine(void)
+{
+ dlist_mutable_iter iter;
+
+ /* "unmerge" merged IOs, so they can be treated uniformly */
+ dlist_foreach_modify(iter, &my_aio->reaped)
+ {
+ PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, iter.cur);
+ uint32 extracted = 1;
+
+ if (io->flags & PGAIOIP_MERGE)
+ extracted += pgaio_uncombine_one(io);
+
+ pg_atomic_fetch_sub_u32(&aio_ctl->backend_state[io->owner_id].inflight_count, 1);
+ }
+}
+
+static void __attribute__((noinline))
+pgaio_complete_ios(bool in_error)
+{
+ int pending_count_before = my_aio->pending_count;
+
+ Assert(!LWLockHeldByMe(SharedAIOCtlLock));
+
+ /* call all callbacks, without holding lock */
+ while (!dlist_is_empty(&my_aio->reaped))
+ {
+ dlist_node *node = dlist_head_node(&my_aio->reaped);
+ PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+ Assert(dlist_is_member(&my_aio->reaped, &io->io_node));
+
+ Assert(node != NULL);
+
+ if (!(io->flags & PGAIOIP_SHARED_CALLBACK_CALLED))
+ {
+ PgAioCompletedCB cb;
+ bool finished;
+
+ /*
+ * Set flag before calling callback, otherwise we could easily end
+ * up looping forever.
+ */
+ *(volatile PgAioIPFlags*) &io->flags |= PGAIOIP_SHARED_CALLBACK_CALLED;
+
+ cb = completion_callbacks[io->type];
+ finished = cb(io);
+
+ dlist_delete_from(&my_aio->reaped, node);
+
+ if (finished)
+ {
+ dlist_push_tail(&local_recycle_requests, &io->io_node);
+ }
+ else
+ {
+ Assert((*(volatile PgAioIPFlags*) &io->flags) & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE));
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ *(volatile PgAioIPFlags*) &io->flags =
+ (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+ PGAIOIP_DONE |
+ PGAIOIP_SHARED_FAILED;
+ dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+ LWLockRelease(SharedAIOCtlLock);
+
+ /* signal state change */
+ if (IsUnderPostmaster)
+ ConditionVariableBroadcast(&io->cv);
+ }
+ }
+ else
+ {
+ Assert(in_error);
+
+ dlist_delete_from(&my_aio->reaped, node);
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ *(volatile PgAioIPFlags*) &io->flags =
+ (io->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+ PGAIOIP_DONE |
+ PGAIOIP_HARD_FAILURE |
+ PGAIOIP_SHARED_FAILED;
+ dlist_push_tail(&aio_ctl->reaped_uncompleted, &io->io_node);
+ LWLockRelease(SharedAIOCtlLock);
+ }
+ }
+
+ /* if any IOs weren't fully done, re-submit them */
+ if (pending_count_before != my_aio->pending_count)
+ pgaio_submit_pending(false);
+
+ /*
+ * Next, under lock, process all the still pending requests. This entails
+ * releasing the "system" reference on the IO and checking which callbacks
+ * need to be called.
+ */
+ START_CRIT_SECTION();
+
+ while (!dlist_is_empty(&local_recycle_requests))
+ {
+ dlist_mutable_iter iter;
+ PgAioInProgress* signal_ios[32];
+ int to_signal = 0;
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(iter, &local_recycle_requests)
+ {
+ PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+ dlist_delete_from(&local_recycle_requests, iter.cur);
+ signal_ios[to_signal++] = cur;
+
+ Assert(cur->system_referenced);
+ Assert(cur->flags & PGAIOIP_REAPED);
+ Assert(!(cur->flags & PGAIOIP_DONE));
+ Assert(!(cur->flags & PGAIOIP_INFLIGHT));
+ Assert(!(cur->flags & PGAIOIP_MERGE));
+ Assert(!(cur->flags & (PGAIOIP_SHARED_FAILED)));
+ Assert(!(cur->flags & (PGAIOIP_SOFT_FAILURE)));
+ Assert(cur->merge_with == NULL);
+
+ if (cur->user_referenced)
+ {
+ cur->system_referenced = false;
+
+ if (cur->owner_id != my_aio_id)
+ {
+ PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+ SpinLockAcquire(&other->foreign_completed_lock);
+
+ dlist_push_tail(&other->foreign_completed, &cur->io_node);
+ other->foreign_completed_count++;
+ other->foreign_completed_total_count++;
+
+ pg_write_barrier();
+
+ *(volatile PgAioIPFlags*) &cur->flags =
+ (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+ PGAIOIP_DONE |
+ PGAIOIP_FOREIGN_DONE;
+
+ SpinLockRelease(&other->foreign_completed_lock);
+ }
+ else
+ {
+ *(volatile PgAioIPFlags*) &cur->flags =
+ (cur->flags & ~(PGAIOIP_REAPED | PGAIOIP_IN_PROGRESS)) |
+ PGAIOIP_DONE;
+
+ dlist_push_tail(&my_aio->local_completed, &cur->io_node);
+ my_aio->local_completed_count++;
+ }
+ }
+ else
+ {
+ PgAioPerBackend *other = &aio_ctl->backend_state[cur->owner_id];
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG4,
+ errmsg("removing aio %zu from issued_abandoned complete_ios",
+ cur - aio_ctl->in_progress_io),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+
+ dlist_delete_from(&other->issued_abandoned, &cur->owner_node);
+ Assert(other->issued_abandoned_count > 0);
+ other->issued_abandoned_count--;
+
+ cur->generation++;
+ pg_write_barrier();
+
+ cur->flags = PGAIOIP_UNUSED;
+
+ if (cur->bb)
+ {
+ pgaio_bounce_buffer_release_internal(cur->bb,
+ /* holding_lock = */ true,
+ /* release_resowner = */ false);
+ cur->bb = NULL;
+ }
+
+ cur->type = 0;
+ cur->owner_id = INVALID_PGPROCNO;
+ cur->result = 0;
+ cur->system_referenced = true;
+ cur->on_completion_local = NULL;
+
+ dlist_push_head(&aio_ctl->unused_ios, &cur->owner_node);
+ aio_ctl->used_count--;
+ }
+
+ if (to_signal >= lengthof(signal_ios))
+ break;
+ }
+ LWLockRelease(SharedAIOCtlLock);
+
+ if (IsUnderPostmaster)
+ {
+ for (int i = 0; i < to_signal; i++)
+ {
+ ConditionVariableBroadcast(&signal_ios[i]->cv);
+ }
+ }
+ }
+
+ END_CRIT_SECTION();
+}
+
+static void
+pgaio_io_call_local_callback(PgAioInProgress *io, bool in_error)
+{
+ Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+ Assert(io->user_referenced);
+
+ dlist_delete_from(&my_aio->issued, &io->owner_node);
+ my_aio->issued_count--;
+ dlist_push_tail(&my_aio->outstanding, &io->owner_node);
+ my_aio->outstanding_count++;
+
+ io->flags |= PGAIOIP_LOCAL_CALLBACK_CALLED;
+
+ if (!io->on_completion_local)
+ return;
+
+ if (!in_error)
+ io->on_completion_local->callback(io->on_completion_local, io);
+
+}
+
+/*
+ * Call all pending local callbacks.
+ */
+static void
+pgaio_call_local_callbacks(bool in_error)
+{
+ if (my_aio->local_completed_count != 0 &&
+ CritSectionCount == 0)
+ {
+ /* FIXME: this isn't safe against errors */
+ static int local_callback_depth = 0;
+
+ if (local_callback_depth == 0)
+ {
+ local_callback_depth++;
+
+ while (!dlist_is_empty(&my_aio->local_completed))
+ {
+ dlist_node *node = dlist_pop_head_node(&my_aio->local_completed);
+ PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+ Assert(my_aio->local_completed_count > 0);
+ my_aio->local_completed_count--;
+
+ pgaio_io_call_local_callback(io, in_error);
+ }
+
+ local_callback_depth--;
+ }
+ }
+}
+
+/*
+ * Receive completions in ring.
+ */
+static int __attribute__((noinline))
+pgaio_drain(PgAioContext *context, bool in_error, bool call_local)
+{
+ int ndrained = 0;
+
+ ndrained = pgaio_uring_drain(context);
+
+ /*
+ * Transfer all the foreign completions into the local queue.
+ */
+ if (my_aio->foreign_completed_count != 0)
+ {
+ SpinLockAcquire(&my_aio->foreign_completed_lock);
+
+ while (!dlist_is_empty(&my_aio->foreign_completed))
+ {
+ dlist_node *node = dlist_pop_head_node(&my_aio->foreign_completed);
+ PgAioInProgress *io = dlist_container(PgAioInProgress, io_node, node);
+
+ Assert(!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED));
+
+ dlist_push_tail(&my_aio->local_completed, &io->io_node);
+ io->flags &= ~PGAIOIP_FOREIGN_DONE;
+ my_aio->foreign_completed_count--;
+ my_aio->local_completed_count++;
+ }
+ SpinLockRelease(&my_aio->foreign_completed_lock);
+ }
+
+ /*
+ * Call all pending local callbacks.
+ */
+ if (call_local && CritSectionCount == 0)
+ pgaio_call_local_callbacks(in_error);
+
+ return ndrained;
+}
+
+static bool
+pgaio_can_be_combined(PgAioInProgress *last, PgAioInProgress *cur)
+{
+ if (last->type != cur->type)
+ return false;
+
+ if (last->flags & PGAIOIP_RETRY ||
+ cur->flags & PGAIOIP_RETRY)
+ return false;
+
+ switch (last->type)
+ {
+ case PGAIO_INVALID:
+ elog(ERROR, "unexpected");
+ break;
+
+ case PGAIO_READ_BUFFER:
+ if (last->d.read_buffer.fd != cur->d.read_buffer.fd)
+ return false;
+ if ((last->d.read_buffer.offset + last->d.read_buffer.nbytes) != cur->d.read_buffer.offset)
+ return false;
+ if (last->d.read_buffer.mode != cur->d.read_buffer.mode)
+ return false;
+ if (last->d.read_buffer.already_done != 0 || cur->d.read_buffer.already_done != 0)
+ return false;
+
+ return true;
+
+ case PGAIO_NOP:
+ case PGAIO_FLUSH_RANGE:
+ case PGAIO_FSYNC:
+ case PGAIO_FSYNC_WAL:
+ return false;
+
+ case PGAIO_WRITE_BUFFER:
+ if (last->d.write_buffer.fd != cur->d.write_buffer.fd)
+ return false;
+ if ((last->d.write_buffer.offset + last->d.write_buffer.nbytes) != cur->d.write_buffer.offset)
+ return false;
+ if (last->d.write_buffer.already_done != 0 || cur->d.write_buffer.already_done != 0)
+ return false;
+ return true;
+
+ case PGAIO_WRITE_WAL:
+ if (last->d.write_wal.fd != cur->d.write_wal.fd)
+ return false;
+ if ((last->d.write_wal.offset + last->d.write_wal.nbytes) != cur->d.write_wal.offset)
+ return false;
+ if (last->d.write_wal.already_done != 0 || cur->d.write_wal.already_done != 0)
+ return false;
+ if (last->d.write_wal.no_reorder || cur->d.write_wal.no_reorder)
+ return false;
+ return true;
+
+ case PGAIO_WRITE_GENERIC:
+ if (last->d.write_generic.fd != cur->d.write_generic.fd)
+ return false;
+ if ((last->d.write_generic.offset + last->d.write_generic.nbytes) != cur->d.write_generic.offset)
+ return false;
+ if (last->d.write_generic.already_done != 0 || cur->d.write_generic.already_done != 0)
+ return false;
+ if (last->d.write_generic.no_reorder || cur->d.write_generic.no_reorder)
+ return false;
+ return true;
+ }
+
+ pg_unreachable();
+}
+
+static void
+pgaio_io_merge(PgAioInProgress *into, PgAioInProgress *tomerge)
+{
+ ereport(DEBUG3,
+ errmsg("merging %zu to %zu",
+ tomerge - aio_ctl->in_progress_io,
+ into - aio_ctl->in_progress_io),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ into->merge_with = tomerge;
+ into->flags |= PGAIOIP_MERGE;
+}
+
+static void
+pgaio_combine_pending(void)
+{
+ dlist_iter iter;
+ PgAioInProgress *last = NULL;
+ int combined = 1;
+
+ Assert(my_aio->pending_count > 1);
+
+ dlist_foreach(iter, &my_aio->pending)
+ {
+ PgAioInProgress *cur = dlist_container(PgAioInProgress, io_node, iter.cur);
+
+ /* can happen when failing partway through io submission */
+ if (cur->merge_with)
+ {
+ elog(DEBUG1, "already merged request (%zu), giving up on merging",
+ cur - aio_ctl->in_progress_io);
+ return;
+ }
+
+ Assert(cur->merge_with == NULL);
+ Assert(!(cur->flags & PGAIOIP_MERGE));
+
+ if (last == NULL)
+ {
+ last = cur;
+ continue;
+ }
+
+ if (pgaio_can_be_combined(last, cur))
+ {
+ combined++;
+
+ pgaio_io_merge(last, cur);
+ }
+ else
+ {
+ combined = 1;
+ }
+
+ if (combined >= PGAIO_MAX_COMBINE)
+ {
+ ereport(DEBUG3,
+ errmsg("max combine at %d", combined),
+ errhidestmt(true),
+ errhidecontext(true));
+ last = NULL;
+ combined = 1;
+ }
+ else
+ last = cur;
+ }
+}
+
+void __attribute__((noinline))
+pgaio_submit_pending(bool drain)
+{
+ int total_submitted = 0;
+ uint32 orig_total;
+
+ if (!aio_ctl || !my_aio)
+ return;
+
+ if (my_aio->pending_count == 0)
+ {
+ Assert(dlist_is_empty(&my_aio->pending));
+ return;
+ }
+
+ HOLD_INTERRUPTS();
+
+ orig_total = my_aio->pending_count;
+
+#define COMBINE_ENABLED
+
+#ifdef COMBINE_ENABLED
+#if 0
+ ereport(LOG, errmsg("before combine"),
+ errhidestmt(true),
+ errhidecontext(true));
+ pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+ if (my_aio->pending_count > 1 && PGAIO_MAX_COMBINE > 1)
+ pgaio_combine_pending();
+
+#if 0
+ ereport(LOG, errmsg("after combine"),
+ errhidestmt(true),
+ errhidecontext(true));
+ pgaio_print_list(&my_aio->pending, NULL, offsetof(PgAioInProgress, io_node));
+#endif
+#endif /* COMBINE_ENABLED */
+
+ /*
+ * Loop until all pending IOs are submitted. Throttle max in-flight before
+ * calling into the IO implementation specific routine, so this code can
+ * be shared.
+ */
+ while (!dlist_is_empty(&my_aio->pending))
+ {
+ int max_submit;
+ int did_submit;
+
+ Assert(my_aio->pending_count > 0);
+ pgaio_apply_backend_limit();
+
+ if (my_aio->pending_count == 0)
+ break;
+
+ max_submit = Min(my_aio->pending_count, PGAIO_SUBMIT_BATCH_SIZE);
+ max_submit = Min(max_submit, io_max_concurrency - pg_atomic_read_u32(&my_aio->inflight_count));
+ Assert(max_submit > 0);
+
+ START_CRIT_SECTION();
+ did_submit = pgaio_uring_submit(max_submit, drain);
+ total_submitted += did_submit;
+ Assert(did_submit > 0 && did_submit <= max_submit);
+ END_CRIT_SECTION();
+ }
+
+ my_aio->executed_total_count += orig_total;
+ my_aio->issued_total_count += total_submitted;
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("submitted %d (orig %d)", total_submitted, orig_total),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ RESUME_INTERRUPTS();
+
+ if (drain)
+ pgaio_call_local_callbacks(/* in_error = */ false);
+}
+
+static void
+pgaio_io_prepare_submit(PgAioInProgress *io, uint32 ring)
+{
+ PgAioInProgress *cur;
+
+ cur = io;
+
+ while (cur)
+ {
+ Assert(cur->flags & PGAIOIP_PENDING);
+
+ cur->ring = ring;
+
+ pg_write_barrier();
+
+ *(volatile PgAioIPFlags*) &cur->flags =
+ (cur->flags & ~PGAIOIP_PENDING) | PGAIOIP_INFLIGHT;
+
+ dlist_delete_from(&my_aio->pending, &cur->io_node);
+ my_aio->pending_count--;
+
+ if (cur->flags & PGAIOIP_RETRY)
+ {
+ /* XXX: more error checks */
+ }
+ else if (cur->user_referenced)
+ {
+ Assert(my_aio_id == cur->owner_id);
+ Assert(my_aio->outstanding_count > 0);
+ dlist_delete_from(&my_aio->outstanding, &cur->owner_node);
+ my_aio->outstanding_count--;
+
+ dlist_push_tail(&my_aio->issued, &cur->owner_node);
+ my_aio->issued_count++;
+ }
+ else
+ {
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG4,
+ errmsg("putting aio %zu onto issued_abandoned during submit",
+ cur - aio_ctl->in_progress_io),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ dlist_push_tail(&my_aio->issued_abandoned, &cur->owner_node);
+ my_aio->issued_abandoned_count++;
+ LWLockRelease(SharedAIOCtlLock);
+ }
+
+ ereport(DEBUG5,
+ errmsg("readied %zu/%llu for submit",
+ cur - aio_ctl->in_progress_io,
+ (unsigned long long ) cur->generation),
+ errhidecontext(1),
+ errhidestmt(1));
+
+ cur = cur->merge_with;
+ }
+}
+
+static void
+pgaio_apply_backend_limit(void)
+{
+ uint32 current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+
+ while (current_inflight >= io_max_concurrency)
+ {
+ PgAioInProgress *io;
+
+ /*
+ * XXX: Should we be a bit fairer and check the "oldest" in-flight IO
+ * between issued and issued_abandoned?
+ */
+
+ if (my_aio->issued_count > 0)
+ {
+ dlist_iter iter;
+
+ Assert(!dlist_is_empty(&my_aio->issued));
+
+ dlist_foreach(iter, &my_aio->issued)
+ {
+ io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+ if (io->flags & PGAIOIP_INFLIGHT)
+ {
+ PgAioIoRef ref;
+
+ ereport(DEBUG2,
+ errmsg("applying per-backend limit to issued IO %zu/%llu (current %d in %d, target %d)",
+ io - aio_ctl->in_progress_io,
+ (unsigned long long) io->generation,
+ my_aio->issued_count + my_aio->issued_abandoned_count,
+ current_inflight,
+ io_max_concurrency),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ pgaio_io_ref(io, &ref);
+ pgaio_io_wait_ref(&ref, /* call_local = */ false);
+ current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+ break;
+ }
+ }
+ }
+
+ if (current_inflight < io_max_concurrency)
+ break;
+
+ if (my_aio->issued_abandoned_count > 0)
+ {
+ dlist_iter iter;
+ PgAioIoRef ref;
+
+ io = NULL;
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ dlist_foreach(iter, &my_aio->issued_abandoned)
+ {
+ io = dlist_container(PgAioInProgress, owner_node, iter.cur);
+
+ if (io->flags & PGAIOIP_INFLIGHT)
+ {
+ pgaio_io_ref_internal(io, &ref);
+ break;
+ }
+ else
+ io = NULL;
+ }
+ LWLockRelease(SharedAIOCtlLock);
+
+ if (io == NULL)
+ continue;
+
+ ereport(DEBUG2,
+ errmsg("applying per-backend limit to issued_abandoned IO %zu/%llu (current %d in %d, target %d)",
+ io - aio_ctl->in_progress_io,
+ (unsigned long long) io->generation,
+ my_aio->issued_count + my_aio->issued_abandoned_count,
+ current_inflight,
+ io_max_concurrency),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ pgaio_io_wait_ref(&ref, false);
+ }
+
+ current_inflight = pg_atomic_read_u32(&my_aio->inflight_count);
+ }
+}
+
+void
+pgaio_io_wait_ref(PgAioIoRef *ref, bool call_local)
+{
+ uint64 ref_generation;
+ PgAioInProgress *io;
+ uint32 done_flags = PGAIOIP_DONE;
+ PgAioIPFlags flags;
+ bool am_owner;
+
+ Assert(ref->aio_index < max_aio_in_progress);
+
+ io = &aio_ctl->in_progress_io[ref->aio_index];
+ ref_generation = ((uint64) ref->generation_upper) << 32 |
+ ref->generation_lower;
+
+ Assert(ref_generation != 0);
+
+ am_owner = io->owner_id == my_aio_id;
+ flags = io->flags;
+ pg_read_barrier();
+
+ if (io->generation != ref_generation)
+ return;
+
+ if (am_owner && (flags & PGAIOIP_PENDING))
+ pgaio_submit_pending(false);
+
+ Assert(!(flags & (PGAIOIP_UNUSED)));
+
+ while (true)
+ {
+ PgAioContext *context;
+
+ flags = io->flags;
+ context = &aio_ctl->contexts[io->ring];
+ pg_read_barrier();
+
+ if (io->generation != ref_generation)
+ return;
+
+ if (flags & done_flags)
+ goto wait_ref_out;
+
+ Assert(!(flags & (PGAIOIP_UNUSED)));
+
+ if (flags & PGAIOIP_INFLIGHT)
+ {
+ pgaio_drain(context, false, call_local);
+
+ flags = io->flags;
+ context = &aio_ctl->contexts[io->ring];
+ pg_read_barrier();
+
+ if (io->generation != ref_generation)
+ return;
+
+ if (flags & done_flags)
+ goto wait_ref_out;
+ }
+
+ if (my_aio->pending_count > 0 && call_local)
+ {
+ /* FIXME: we should call this in a larger number of cases */
+
+ /*
+ * If we would otherwise have to sleep, submit all pending
+ * requests, to avoid others having to wait for us to submit
+ * them. We don't do so when we don't need to sleep, as
+ * submitting IOs in smaller increments can be less efficient.
+ */
+ pgaio_submit_pending(false);
+ }
+ else if (flags & PGAIOIP_INFLIGHT)
+ {
+ /* note that this is allowed to spuriously return */
+ pgaio_uring_wait_one(context, io, ref_generation, WAIT_EVENT_AIO_IO_COMPLETE_ANY);
+ }
+ else
+ {
+ /* shouldn't be reachable without concurrency */
+ Assert(IsUnderPostmaster);
+
+ /* ensure we're going to get woken up */
+ if (IsUnderPostmaster)
+ ConditionVariablePrepareToSleep(&io->cv);
+
+ flags = io->flags;
+ pg_read_barrier();
+ if (io->generation == ref_generation && !(flags & done_flags))
+ ConditionVariableSleep(&io->cv, WAIT_EVENT_AIO_IO_COMPLETE_ONE);
+
+ if (IsUnderPostmaster)
+ ConditionVariableCancelSleepEx(true);
+ }
+ }
+
+wait_ref_out:
+
+ flags = io->flags;
+ pg_read_barrier();
+ if (io->generation != ref_generation)
+ return;
+
+ Assert(flags & PGAIOIP_DONE);
+
+ if (unlikely(flags & (PGAIOIP_SOFT_FAILURE | PGAIOIP_HARD_FAILURE)))
+ {
+ /* can retry soft failures, but not hard ones */
+ /* FIXME: limit number of soft retries */
+ if (flags & PGAIOIP_SOFT_FAILURE)
+ {
+ pgaio_io_retry(io);
+ pgaio_io_wait_ref(ref, call_local);
+ }
+ else
+ {
+ pgaio_io_print(io, NULL);
+ elog(WARNING, "request %zd failed permanently",
+ io - aio_ctl->in_progress_io);
+ }
+
+ return;
+ }
+
+ if (am_owner && call_local && !(flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+ {
+ if (flags & PGAIOIP_FOREIGN_DONE)
+ {
+ SpinLockAcquire(&my_aio->foreign_completed_lock);
+ dlist_delete_from(&my_aio->foreign_completed, &io->io_node);
+ io->flags &= ~PGAIOIP_FOREIGN_DONE;
+ my_aio->foreign_completed_count--;
+ SpinLockRelease(&my_aio->foreign_completed_lock);
+ }
+ else
+ {
+ Assert(my_aio->local_completed_count > 0);
+ dlist_delete_from(&my_aio->local_completed, &io->io_node);
+ my_aio->local_completed_count--;
+ }
+
+ pgaio_io_call_local_callback(io, false);
+ }
+
+}
+
+bool
+pgaio_io_check_ref(PgAioIoRef *ref)
+{
+ uint64 ref_generation;
+ PgAioInProgress *io;
+ uint32 done_flags = PGAIOIP_DONE;
+ PgAioIPFlags flags;
+ PgAioContext *context;
+
+ Assert(ref->aio_index < max_aio_in_progress);
+
+ io = &aio_ctl->in_progress_io[ref->aio_index];
+ ref_generation = ((uint64) ref->generation_upper) << 32 |
+ ref->generation_lower;
+
+ Assert(ref_generation != 0);
+
+ flags = io->flags;
+ pg_read_barrier();
+
+ if (io->generation != ref_generation)
+ return true;
+
+ if (flags & PGAIOIP_PENDING)
+ return false;
+
+ if (flags & done_flags)
+ return true;
+
+ context = &aio_ctl->contexts[io->ring];
+ Assert(!(flags & (PGAIOIP_UNUSED)));
+
+ if (io->generation != ref_generation)
+ return true;
+
+ if (flags & PGAIOIP_INFLIGHT)
+ pgaio_drain(context, false, false);
+
+ flags = io->flags;
+ pg_read_barrier();
+
+ if (io->generation != ref_generation)
+ return true;
+
+ Assert(!(flags & PGAIOIP_PENDING));
+
+ if (flags & done_flags)
+ return true;
+ return false;
+}
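+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this patch): a
+ * backend that has other useful work available can poll with
+ * pgaio_io_check_ref() and only fall back to a blocking wait once the
+ * result is actually needed:
+ *
+ *	if (!pgaio_io_check_ref(&ref))
+ *	{
+ *		... do other useful work while the IO is still in flight ...
+ *		pgaio_io_wait_ref(&ref, true);
+ *	}
+ */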
+
+
+/*
+ * Wait for an IO the caller still holds a user reference to.
+ */
+void
+pgaio_io_wait(PgAioInProgress *io)
+{
+ PgAioIoRef ref;
+
+ Assert(io->user_referenced && io->owner_id == my_aio_id);
+
+ pgaio_io_ref(io, &ref);
+ pgaio_io_wait_ref(&ref, /* call_local = */ true);
+}
+
+PgAioInProgress *
+pgaio_io_get(void)
+{
+ dlist_node *elem;
+ PgAioInProgress *io;
+
+ Assert(!LWLockHeldByMe(SharedAIOCtlLock));
+
+ /* FIXME: relax? */
+ Assert(my_aio->pending_count < PGAIO_SUBMIT_BATCH_SIZE);
+
+ /* FIXME: wait for an IO to complete if full */
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+ while (unlikely(dlist_is_empty(&aio_ctl->unused_ios)))
+ {
+ LWLockRelease(SharedAIOCtlLock);
+ elog(WARNING, "needed to drain while getting IO (used %d inflight %d)",
+ aio_ctl->used_count, pg_atomic_read_u32(&my_aio->inflight_count));
+
+ /*
+ * FIXME: should we wait for IO instead?
+ *
+ * Also, need to protect against too many ios handed out but not used.
+ */
+ for (int i = 0; i < aio_ctl->num_contexts; i++)
+ pgaio_drain(&aio_ctl->contexts[i], false, true);
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ }
+
+ elem = dlist_pop_head_node(&aio_ctl->unused_ios);
+ aio_ctl->used_count++;
+
+ LWLockRelease(SharedAIOCtlLock);
+
+ io = dlist_container(PgAioInProgress, owner_node, elem);
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3, errmsg("acquired io %zu/%llu",
+ io - aio_ctl->in_progress_io,
+ (long long unsigned) io->generation),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+
+ Assert(io->type == PGAIO_INVALID);
+ Assert(io->flags == PGAIOIP_UNUSED);
+ Assert(io->system_referenced);
+ Assert(io->on_completion_local == NULL);
+
+ io->user_referenced = true;
+ io->system_referenced = false;
+
+ *(volatile PgAioIPFlags*) &io->flags = PGAIOIP_IDLE;
+
+ io->owner_id = my_aio_id;
+
+ dlist_push_tail(&my_aio->outstanding, &io->owner_node);
+ my_aio->outstanding_count++;
+
+ return io;
+}
+
+bool
+pgaio_io_success(PgAioInProgress *io)
+{
+ Assert(io->user_referenced);
+ Assert(io->flags & PGAIOIP_DONE);
+
+ if (io->flags & (PGAIOIP_HARD_FAILURE | PGAIOIP_SOFT_FAILURE))
+ return false;
+
+ /* FIXME: is this possible? */
+ if (!(io->flags & PGAIOIP_SHARED_CALLBACK_CALLED))
+ return false;
+
+ return true;
+}
+
+bool
+pgaio_io_done(PgAioInProgress *io)
+{
+ Assert(io->user_referenced);
+ Assert(!(io->flags & PGAIOIP_UNUSED));
+
+ if (io->flags & PGAIOIP_SOFT_FAILURE)
+ return false;
+
+ if (io->flags & (PGAIOIP_IDLE | PGAIOIP_HARD_FAILURE))
+ return true;
+
+ if (io->flags & PGAIOIP_DONE)
+ {
+ if (io->owner_id == my_aio_id &&
+ !(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+static void
+pgaio_io_ref_internal(PgAioInProgress *io, PgAioIoRef *ref)
+{
+ Assert(io->flags & (PGAIOIP_IDLE | PGAIOIP_IN_PROGRESS | PGAIOIP_DONE));
+
+ ref->aio_index = io - aio_ctl->in_progress_io;
+ ref->generation_upper = (uint32) (io->generation >> 32);
+ ref->generation_lower = (uint32) io->generation;
+ Assert(io->generation != 0);
+}
+
+void
+pgaio_io_ref(PgAioInProgress *io, PgAioIoRef *ref)
+{
+ Assert(io->user_referenced);
+ pgaio_io_ref_internal(io, ref);
+}
+
+/*
+ * Register a completion callback that is executed locally in the backend that
+ * initiated the IO, even if the completion of the IO has been reaped by
+ * another process (which executed the shared callback, unlocking buffers
+ * etc.). This is mainly useful for AIO-using code to react promptly to
+ * individual IOs finishing, without having to check each of the IOs
+ * individually.
+ */
+void
+pgaio_io_on_completion_local(PgAioInProgress *io, PgAioOnCompletionLocalContext *ocb)
+{
+ Assert(io->flags & (PGAIOIP_IDLE | PGAIOIP_PENDING));
+ Assert(io->on_completion_local == NULL);
+
+ io->on_completion_local = ocb;
+}
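+
+/*
+ * Illustrative sketch (hypothetical names, not part of this patch): a caller
+ * typically embeds the context in its own per-IO state and recovers that
+ * state in the callback via pgaio_ocb_container():
+ *
+ *	typedef struct MyIoState
+ *	{
+ *		PgAioOnCompletionLocalContext on_completion;
+ *		Buffer		buf;
+ *	} MyIoState;
+ *
+ *	static void
+ *	my_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
+ *	{
+ *		MyIoState  *state = pgaio_ocb_container(MyIoState, on_completion, ocb);
+ *
+ *		... react to the IO for state->buf finishing ...
+ *	}
+ *
+ *	state->on_completion.callback = my_complete;
+ *	pgaio_io_on_completion_local(io, &state->on_completion);
+ *
+ * The streaming write helper later in this patch follows this pattern.
+ */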
+
+static int
+reopen_buffered(const AioBufferTag *tag)
+{
+ uint32 off;
+ SMgrRelation reln = smgropen(tag->rnode.node, tag->rnode.backend);
+
+ return smgrfd(reln, tag->forkNum, tag->blockNum, &off);
+}
+
+void
+pgaio_io_retry(PgAioInProgress *io)
+{
+ bool retryable = false;
+ bool need_retry;
+
+ switch (io->type)
+ {
+ case PGAIO_READ_BUFFER:
+ retryable = true;
+ break;
+
+ case PGAIO_WRITE_BUFFER:
+ retryable = true;
+ break;
+
+ default:
+ break;
+ }
+
+ if (!retryable)
+ {
+ elog(WARNING, "non-retryable aio being retried");
+ return;
+ }
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+ /* could concurrently have been unset / retried */
+ if (io->flags & PGAIOIP_SHARED_FAILED)
+ {
+ Assert(!(io->flags & PGAIOIP_FOREIGN_DONE));
+
+ dlist_delete(&io->io_node);
+
+ io->flags =
+ (io->flags & ~(PGAIOIP_SHARED_FAILED |
+ PGAIOIP_DONE |
+ PGAIOIP_FOREIGN_DONE |
+ PGAIOIP_SHARED_CALLBACK_CALLED |
+ PGAIOIP_LOCAL_CALLBACK_CALLED |
+ PGAIOIP_HARD_FAILURE |
+ PGAIOIP_SOFT_FAILURE)) |
+ PGAIOIP_IN_PROGRESS |
+ PGAIOIP_PENDING |
+ PGAIOIP_RETRY;
+
+ need_retry = true;
+ }
+ else
+ {
+ need_retry = false;
+ }
+ LWLockRelease(SharedAIOCtlLock);
+
+ if (!need_retry)
+ {
+ ereport(LOG, errmsg("was about to retry %zd, but somebody else did already",
+ io - aio_ctl->in_progress_io),
+ errhidestmt(true),
+ errhidecontext(true));
+ pgaio_io_print(io, NULL);
+ return;
+ }
+
+ switch (io->type)
+ {
+ case PGAIO_READ_BUFFER:
+ io->d.read_buffer.fd = reopen_buffered(&io->d.read_buffer.tag);
+ break;
+
+ case PGAIO_WRITE_BUFFER:
+ io->d.write_buffer.fd = reopen_buffered(&io->d.write_buffer.tag);
+ break;
+
+ default:
+ break;
+ }
+
+ dlist_push_tail(&my_aio->pending, &io->io_node);
+ my_aio->pending_count++;
+ my_aio->retry_total_count++;
+
+ pgaio_submit_pending(true);
+}
+
+void
+pgaio_io_recycle(PgAioInProgress *io)
+{
+ uint32 init_flags = *(volatile PgAioIPFlags*) &io->flags;
+
+ Assert(init_flags & (PGAIOIP_IDLE | PGAIOIP_DONE));
+ Assert(io->user_referenced);
+ Assert(io->owner_id == my_aio_id);
+ Assert(!io->system_referenced);
+ Assert(io->merge_with == NULL);
+
+ if (io->bb)
+ {
+ pgaio_bounce_buffer_release_internal(io->bb, false, false);
+ io->bb = NULL;
+ }
+
+ if (io->flags & PGAIOIP_DONE)
+ {
+ /* request needs to actually be done, including local callbacks */
+ Assert(!(io->flags & PGAIOIP_FOREIGN_DONE));
+ Assert(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED);
+
+ io->generation++;
+ pg_write_barrier();
+
+ io->flags &= ~PGAIOIP_DONE;
+ io->flags |= PGAIOIP_IDLE;
+ }
+
+ io->flags &= ~(PGAIOIP_MERGE |
+ PGAIOIP_SHARED_CALLBACK_CALLED |
+ PGAIOIP_LOCAL_CALLBACK_CALLED |
+ PGAIOIP_RETRY |
+ PGAIOIP_HARD_FAILURE |
+ PGAIOIP_SOFT_FAILURE);
+ Assert(io->flags == PGAIOIP_IDLE);
+ io->result = 0;
+ io->on_completion_local = NULL;
+}
+
+static pg_noinline void
+pgaio_prepare_io(PgAioInProgress *io, PgAioAction action)
+{
+ if (my_aio->pending_count + 1 >= PGAIO_SUBMIT_BATCH_SIZE)
+ pgaio_submit_pending(true);
+
+ /* true for now, but not necessarily in the future */
+ Assert(io->flags == PGAIOIP_IDLE);
+ Assert(io->user_referenced);
+ Assert(io->merge_with == NULL);
+
+ Assert(my_aio->pending_count < PGAIO_SUBMIT_BATCH_SIZE);
+
+ io->flags = (io->flags & ~PGAIOIP_IDLE) | PGAIOIP_IN_PROGRESS | PGAIOIP_PENDING;
+
+ /* for this module */
+ io->system_referenced = true;
+ io->type = action;
+ if (IsUnderPostmaster)
+ io->owner_id = MyProc->pgprocno;
+
+ /* FIXME: should this be done in end_get_io? */
+ dlist_push_tail(&my_aio->pending, &io->io_node);
+ my_aio->pending_count++;
+}
+
+static pg_noinline void
+pgaio_finish_io(PgAioInProgress *io)
+{
+}
+
+
+void
+pgaio_io_release(PgAioInProgress *io)
+{
+ Assert(io->user_referenced);
+ Assert(!IsUnderPostmaster || io->owner_id == MyProc->pgprocno);
+
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+
+ io->user_referenced = false;
+
+ if (io->flags & (PGAIOIP_IDLE |
+ PGAIOIP_PENDING |
+ PGAIOIP_LOCAL_CALLBACK_CALLED))
+ {
+ Assert(!(io->flags & PGAIOIP_INFLIGHT));
+
+ Assert(my_aio->outstanding_count > 0);
+ dlist_delete_from(&my_aio->outstanding, &io->owner_node);
+ my_aio->outstanding_count--;
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3, errmsg("releasing plain user reference to %zu",
+ io - aio_ctl->in_progress_io),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+ }
+ else
+ {
+ dlist_delete_from(&my_aio->issued, &io->owner_node);
+ my_aio->issued_count--;
+
+ if (io->system_referenced)
+ {
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG4, errmsg("putting aio %zu onto issued_abandoned during release",
+ io - aio_ctl->in_progress_io),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+
+ dlist_push_tail(&my_aio->issued_abandoned, &io->owner_node);
+ my_aio->issued_abandoned_count++;
+ }
+ else
+ {
+ Assert(io->flags & (PGAIOIP_DONE | PGAIOIP_SHARED_CALLBACK_CALLED));
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG4, errmsg("not putting aio %zu onto issued_abandoned during release",
+ io - aio_ctl->in_progress_io),
+ errhidecontext(1),
+ errhidestmt(1));
+#endif
+ }
+ }
+
+ if (!io->system_referenced)
+ {
+ Assert(!(io->flags & PGAIOIP_INFLIGHT));
+ Assert(!(io->flags & PGAIOIP_MERGE));
+ Assert(io->flags & PGAIOIP_DONE ||
+ io->flags & PGAIOIP_IDLE);
+
+ if (io->flags & PGAIOIP_DONE)
+ {
+ if (io->flags & PGAIOIP_FOREIGN_DONE)
+ {
+ SpinLockAcquire(&my_aio->foreign_completed_lock);
+ Assert(io->flags & PGAIOIP_FOREIGN_DONE);
+ dlist_delete_from(&my_aio->foreign_completed, &io->io_node);
+ my_aio->foreign_completed_count--;
+ SpinLockRelease(&my_aio->foreign_completed_lock);
+ }
+ else if (!(io->flags & PGAIOIP_LOCAL_CALLBACK_CALLED))
+ {
+ dlist_delete_from(&my_aio->local_completed, &io->io_node);
+ my_aio->local_completed_count--;
+ io->on_completion_local = NULL;
+ }
+
+ }
+
+ io->generation++;
+ pg_write_barrier();
+
+ io->flags = PGAIOIP_UNUSED;
+ io->type = 0;
+ io->owner_id = INVALID_PGPROCNO;
+ io->result = 0;
+ io->system_referenced = true;
+ io->on_completion_local = NULL;
+
+ Assert(io->merge_with == NULL);
+
+ /* could do this earlier or conditionally */
+ if (io->bb)
+ {
+ pgaio_bounce_buffer_release_internal(io->bb,
+ /* holding_lock = */ true,
+ /* release_resowner = */ false);
+ io->bb = NULL;
+ }
+
+ dlist_push_head(&aio_ctl->unused_ios, &io->owner_node);
+ aio_ctl->used_count--;
+ }
+
+ LWLockRelease(SharedAIOCtlLock);
+}
+
+void
+pgaio_print_queues(void)
+{
+ StringInfoData s;
+ uint32 inflight_backend = 0;
+ uint32 *inflight_context;
+
+ initStringInfo(&s);
+
+ for (int procno = 0; procno < aio_ctl->backend_state_count; procno++)
+ {
+ PgAioPerBackend *bs = &aio_ctl->backend_state[procno];
+
+ inflight_backend += pg_atomic_read_u32(&bs->inflight_count);
+ }
+
+ inflight_context = palloc0(sizeof(uint32) * aio_ctl->backend_state_count);
+ for (int i = 0; i < max_aio_in_progress; i++)
+ {
+ PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+
+ if (!(io->flags & PGAIOIP_INFLIGHT))
+ continue;
+ inflight_context[io->ring]++;
+ }
+
+ appendStringInfo(&s, "inflight backend: %d", inflight_backend);
+
+ for (int contextno = 0; contextno < aio_ctl->num_contexts; contextno++)
+ {
+ PgAioContext *context = &aio_ctl->contexts[contextno];
+
+ appendStringInfo(&s, "\n\tqueue[%d]: space: %d, ready: %d, we think inflight: %d",
+ contextno,
+ io_uring_sq_space_left(&context->io_uring_ring),
+ io_uring_cq_ready(&context->io_uring_ring),
+ inflight_context[contextno]);
+ }
+
+ ereport(LOG, errmsg_internal("%s", s.data),
+ errhidestmt(true),
+ errhidecontext(true));
+}
+
+static const char *
+pgaio_io_action_string(PgAioAction a)
+{
+ switch(a)
+ {
+ case PGAIO_INVALID:
+ return "invalid";
+ case PGAIO_NOP:
+ return "nop";
+ case PGAIO_FLUSH_RANGE:
+ return "flush_range";
+ case PGAIO_FSYNC:
+ return "fsync";
+ case PGAIO_FSYNC_WAL:
+ return "fsync_wal";
+ case PGAIO_READ_BUFFER:
+ return "read_buffer";
+ case PGAIO_WRITE_BUFFER:
+ return "write_buffer";
+ case PGAIO_WRITE_WAL:
+ return "write_wal";
+ case PGAIO_WRITE_GENERIC:
+ return "write_generic";
+ }
+
+ pg_unreachable();
+}
+
+
+static void
+pgaio_io_flag_string(PgAioIPFlags flags, StringInfo s)
+{
+ bool first = true;
+
+#define STRINGIFY_FLAG(f) if (flags & f) { appendStringInfoString(s, first ? CppAsString(f) : " | " CppAsString(f)); first = false;}
+
+ STRINGIFY_FLAG(PGAIOIP_UNUSED);
+ STRINGIFY_FLAG(PGAIOIP_IDLE);
+ STRINGIFY_FLAG(PGAIOIP_IN_PROGRESS);
+ STRINGIFY_FLAG(PGAIOIP_PENDING);
+ STRINGIFY_FLAG(PGAIOIP_INFLIGHT);
+ STRINGIFY_FLAG(PGAIOIP_REAPED);
+ STRINGIFY_FLAG(PGAIOIP_SHARED_CALLBACK_CALLED);
+ STRINGIFY_FLAG(PGAIOIP_LOCAL_CALLBACK_CALLED);
+
+ STRINGIFY_FLAG(PGAIOIP_DONE);
+ STRINGIFY_FLAG(PGAIOIP_FOREIGN_DONE);
+
+ STRINGIFY_FLAG(PGAIOIP_MERGE);
+ STRINGIFY_FLAG(PGAIOIP_RETRY);
+ STRINGIFY_FLAG(PGAIOIP_HARD_FAILURE);
+ STRINGIFY_FLAG(PGAIOIP_SOFT_FAILURE);
+ STRINGIFY_FLAG(PGAIOIP_SHARED_FAILED);
+
+#undef STRINGIFY_FLAG
+}
+
+static void
+pgaio_io_action_desc(PgAioInProgress *io, StringInfo s)
+{
+ switch (io->type)
+ {
+ case PGAIO_FSYNC:
+ appendStringInfo(s, "fd: %d, datasync: %d, barrier: %d",
+ io->d.fsync.fd,
+ io->d.fsync.datasync,
+ io->d.fsync.barrier);
+ break;
+ case PGAIO_FSYNC_WAL:
+ appendStringInfo(s, "flush_no: %d, fd: %d, datasync: %d, barrier: %d",
+ io->d.fsync_wal.flush_no,
+ io->d.fsync_wal.fd,
+ io->d.fsync_wal.datasync,
+ io->d.fsync_wal.barrier);
+ break;
+ case PGAIO_FLUSH_RANGE:
+ appendStringInfo(s, "fd: %d, offset: %llu, nbytes: %llu",
+ io->d.flush_range.fd,
+ (unsigned long long) io->d.flush_range.offset,
+ (unsigned long long) io->d.flush_range.nbytes);
+ break;
+ case PGAIO_READ_BUFFER:
+ appendStringInfo(s, "fd: %d, mode: %d, offset: %u, nbytes: %u, already_done: %u, buf/data: %d/%p",
+ io->d.read_buffer.fd,
+ io->d.read_buffer.mode,
+ io->d.read_buffer.offset,
+ io->d.read_buffer.nbytes,
+ io->d.read_buffer.already_done,
+ io->d.read_buffer.buf,
+ io->d.read_buffer.bufdata);
+ break;
+ case PGAIO_WRITE_BUFFER:
+ appendStringInfo(s, "fd: %d, offset: %u, nbytes: %u, already_done: %u, release_lock: %d, buf/data: %u/%p",
+ io->d.write_buffer.fd,
+ io->d.write_buffer.offset,
+ io->d.write_buffer.nbytes,
+ io->d.write_buffer.already_done,
+ io->d.write_buffer.release_lock,
+ io->d.write_buffer.buf,
+ io->d.write_buffer.bufdata);
+ break;
+ case PGAIO_WRITE_WAL:
+ appendStringInfo(s, "write_no: %d, fd: %d, offset: %u, nbytes: %u, already_done: %u, bufdata: %p, no-reorder: %d",
+ io->d.write_wal.write_no,
+ io->d.write_wal.fd,
+ io->d.write_wal.offset,
+ io->d.write_wal.nbytes,
+ io->d.write_wal.already_done,
+ io->d.write_wal.bufdata,
+ io->d.write_wal.no_reorder);
+ break;
+ case PGAIO_WRITE_GENERIC:
+ appendStringInfo(s, "fd: %d, offset: %llu, nbytes: %u, already_done: %u, bufdata: %p, no-reorder: %d",
+ io->d.write_generic.fd,
+ (unsigned long long) io->d.write_generic.offset,
+ io->d.write_generic.nbytes,
+ io->d.write_generic.already_done,
+ io->d.write_generic.bufdata,
+ io->d.write_generic.no_reorder);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+pgaio_io_print_one(PgAioInProgress *io, StringInfo s)
+{
+ appendStringInfo(s, "aio %zu/"UINT64_FORMAT": action: %s, ring: %d, init: %d, flags: ",
+ io - aio_ctl->in_progress_io,
+ io->generation,
+ pgaio_io_action_string(io->type),
+ io->ring,
+ io->owner_id);
+ pgaio_io_flag_string(io->flags, s);
+ appendStringInfo(s, ", result: %d, user/system_referenced: %d/%d (",
+ io->result,
+ io->user_referenced,
+ io->system_referenced);
+ pgaio_io_action_desc(io, s);
+ appendStringInfoString(s, ")");
+}
+
+void
+pgaio_io_ref_print(PgAioIoRef *ref, StringInfo s)
+{
+ PgAioInProgress *io;
+
+ io = &aio_ctl->in_progress_io[ref->aio_index];
+
+ pgaio_io_print(io, s);
+}
+
+void
+pgaio_io_print(PgAioInProgress *io, StringInfo s)
+{
+ bool alloc = false;
+ MemoryContext old_context;
+
+ if (s == NULL)
+ {
+ old_context = MemoryContextSwitchTo(ErrorContext);
+ s = makeStringInfo();
+ alloc = true;
+ }
+
+ pgaio_io_print_one(io, s);
+
+ {
+ PgAioInProgress *cur = io;
+ int nummerge = 0;
+
+ if (cur->merge_with)
+ appendStringInfoString(s, "\n merge with:");
+
+ while (cur->merge_with)
+ {
+ nummerge++;
+ appendStringInfo(s, "\n %d: ", nummerge);
+ pgaio_io_print_one(cur->merge_with, s);
+
+ cur = cur->merge_with;
+ }
+ }
+
+ if (alloc)
+ {
+ ereport(LOG,
+ errmsg("%s", s->data),
+ errhidestmt(true),
+ errhidecontext(true));
+ pfree(s->data);
+ pfree(s);
+ MemoryContextReset(ErrorContext);
+ MemoryContextSwitchTo(old_context);
+ }
+}
+
+void
+pgaio_print_list(dlist_head *head, StringInfo s, size_t offset)
+{
+ bool alloc = false;
+ dlist_iter iter;
+ bool first = true;
+ MemoryContext old_context;
+
+ if (s == NULL)
+ {
+ old_context = MemoryContextSwitchTo(ErrorContext);
+ s = makeStringInfo();
+ alloc = true;
+ }
+
+ dlist_foreach(iter, head)
+ {
+ PgAioInProgress *io = ((PgAioInProgress *) ((char *) (iter.cur) - offset));
+
+ if (!first)
+ appendStringInfo(s, "\n");
+ first = false;
+
+ pgaio_io_print(io, s);
+ }
+
+ if (alloc)
+ {
+ ereport(LOG,
+ errmsg("%s", s->data),
+ errhidestmt(true),
+ errhidecontext(true));
+ pfree(s->data);
+ pfree(s);
+ MemoryContextSwitchTo(old_context);
+ MemoryContextReset(ErrorContext);
+ }
+}
+
+PgAioBounceBuffer *
+pgaio_bounce_buffer_get(void)
+{
+ PgAioBounceBuffer *bb = NULL;
+
+ ResourceOwnerEnlargeAioBB(CurrentResourceOwner);
+
+ while (true)
+ {
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ if (!dlist_is_empty(&aio_ctl->unused_bounce_buffers))
+ {
+ dlist_node *node = dlist_pop_head_node(&aio_ctl->unused_bounce_buffers);
+
+ aio_ctl->unused_bounce_buffers_count--;
+ bb = dlist_container(PgAioBounceBuffer, node, node);
+
+ Assert(pg_atomic_read_u32(&bb->refcount) == 0);
+ }
+ LWLockRelease(SharedAIOCtlLock);
+
+ if (!bb)
+ {
+ elog(WARNING, "needed to drain while getting BB (inflight %d)",
+ pg_atomic_read_u32(&my_aio->inflight_count));
+
+ for (int i = 0; i < aio_ctl->num_contexts; i++)
+ pgaio_drain(&aio_ctl->contexts[i], false, true);
+ }
+ else
+ break;
+ }
+
+ pg_atomic_write_u32(&bb->refcount, 1);
+
+ ResourceOwnerRememberAioBB(CurrentResourceOwner, bb);
+
+ return bb;
+}
+
+static void
+pgaio_bounce_buffer_release_internal(PgAioBounceBuffer *bb, bool holding_lock, bool release_resowner)
+{
+ Assert(holding_lock == LWLockHeldByMe(SharedAIOCtlLock));
+ Assert(bb != NULL);
+
+ if (release_resowner)
+ ResourceOwnerForgetAioBB(CurrentResourceOwner, bb);
+
+ if (pg_atomic_sub_fetch_u32(&bb->refcount, 1) != 0)
+ return;
+
+ if (!holding_lock)
+ LWLockAcquire(SharedAIOCtlLock, LW_EXCLUSIVE);
+ dlist_push_tail(&aio_ctl->unused_bounce_buffers, &bb->node);
+ aio_ctl->unused_bounce_buffers_count++;
+ if (!holding_lock)
+ LWLockRelease(SharedAIOCtlLock);
+}
+
+void
+pgaio_bounce_buffer_release(PgAioBounceBuffer *bb)
+{
+ pgaio_bounce_buffer_release_internal(bb,
+ /* holding_lock = */ false,
+ /* release_resowner */ true);
+}
+
+char *
+pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb)
+{
+ return bb->buffer;
+}
+
+void
+pgaio_assoc_bounce_buffer(PgAioInProgress *io, PgAioBounceBuffer *bb)
+{
+ Assert(bb != NULL);
+ Assert(io->bb == NULL);
+ Assert(io->flags == PGAIOIP_IDLE);
+ Assert(io->user_referenced);
+ Assert(pg_atomic_read_u32(&bb->refcount) > 0);
+
+ io->bb = bb;
+ pg_atomic_fetch_add_u32(&bb->refcount, 1);
+}
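+
+/*
+ * Illustrative sketch of the bounce buffer lifecycle (hypothetical caller,
+ * not part of this patch). The caller's reference and the IO's reference are
+ * counted separately, so the buffer stays usable until both are released:
+ *
+ *	PgAioBounceBuffer *bb = pgaio_bounce_buffer_get();
+ *	PgAioInProgress *io = pgaio_io_get();
+ *
+ *	pgaio_assoc_bounce_buffer(io, bb);
+ *	... start an IO targeting pgaio_bounce_buffer_buffer(bb) ...
+ *	pgaio_bounce_buffer_release(bb);
+ *
+ * The IO's reference is dropped internally when the IO is recycled or
+ * released.
+ */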
+
+/*
+ * FIXME: Should this be in general or io_uring specific code?
+ */
+static PgAioContext *
+pgaio_acquire_context(void)
+{
+ PgAioContext *context;
+ int init_last_context = my_aio->last_context;
+
+ /*
+ * First try to acquire a context without blocking on the lock. We start
+ * with the last context we successfully used, which should lead to
+ * backends spreading to different contexts over time.
+ */
+ for (int i = 0; i < aio_ctl->num_contexts; i++)
+ {
+ context = &aio_ctl->contexts[my_aio->last_context];
+
+ if (LWLockConditionalAcquire(&context->submission_lock, LW_EXCLUSIVE))
+ {
+ return context;
+ }
+
+ if (++my_aio->last_context == aio_ctl->num_contexts)
+ my_aio->last_context = 0;
+ }
+
+ /*
+ * Couldn't acquire any context without blocking. Block on the next one.
+ */
+ if (++my_aio->last_context == aio_ctl->num_contexts)
+ my_aio->last_context = 0;
+ context = &aio_ctl->contexts[my_aio->last_context];
+
+ elog(DEBUG2, "blocking acquiring io context %d, started on %d",
+ my_aio->last_context, init_last_context);
+
+ LWLockAcquire(&context->submission_lock, LW_EXCLUSIVE);
+
+ return context;
+}
+
+/* --------------------------------------------------------------------------------
+ * io_uring related code
+ * --------------------------------------------------------------------------------
+ */
+
+static int
+pgaio_uring_submit(int max_submit, bool drain)
+{
+ PgAioInProgress *ios[PGAIO_SUBMIT_BATCH_SIZE];
+ struct io_uring_sqe *sqe[PGAIO_SUBMIT_BATCH_SIZE];
+ PgAioContext *context;
+ int nios = 0;
+
+ context = pgaio_acquire_context();
+
+ Assert(max_submit != 0 && max_submit <= my_aio->pending_count);
+
+ while (!dlist_is_empty(&my_aio->pending))
+ {
+ dlist_node *node;
+ PgAioInProgress *io;
+
+ if (nios == max_submit)
+ break;
+
+ /*
+ * XXX: Should there be a per-ring limit? If so, we'd probably best
+ * apply it here.
+ */
+
+ sqe[nios] = io_uring_get_sqe(&context->io_uring_ring);
+
+ if (!sqe[nios])
+ {
+ Assert(nios != 0);
+ elog(WARNING, "io_uring_get_sqe() returned NULL?");
+ break;
+ }
+
+ node = dlist_head_node(&my_aio->pending);
+ io = dlist_container(PgAioInProgress, io_node, node);
+ ios[nios] = io;
+
+ pgaio_io_prepare_submit(io, context - aio_ctl->contexts);
+
+ pgaio_uring_sq_from_io(context, ios[nios], sqe[nios]);
+
+ nios++;
+ }
+
+ Assert(nios > 0);
+
+ {
+ int ret;
+
+ pg_atomic_add_fetch_u32(&my_aio->inflight_count, nios);
+ my_aio->submissions_total_count++;
+
+again:
+ pgstat_report_wait_start(WAIT_EVENT_AIO_SUBMIT);
+ ret = io_uring_submit(&context->io_uring_ring);
+ pgstat_report_wait_end();
+
+ if (ret == -EINTR)
+ goto again;
+
+ if (ret < 0)
+ elog(PANIC, "failed: %d/%s",
+ ret, strerror(-ret));
+ }
+
+ LWLockRelease(&context->submission_lock);
+
+ /*
+ * Others might have been waiting for this IO. Because it wasn't
+ * marked as in-flight until now, they might be waiting for the
+ * CV. Wake'em up.
+ */
+ for (int i = 0; i < nios; i++)
+ {
+ PgAioInProgress *cur = ios[i];
+
+ while (cur)
+ {
+ ConditionVariableBroadcast(&cur->cv);
+ cur = cur->merge_with;
+ }
+ }
+
+ /* callbacks will be called later by pgaio_submit() */
+ if (drain)
+ pgaio_drain(context, false, false);
+
+ return nios;
+}
+
+static int
+pgaio_uring_drain_locked(PgAioContext *context)
+{
+ uint32 processed = 0;
+ int ready;
+
+ Assert(LWLockHeldByMe(&context->completion_lock));
+
+ /*
+ * Don't drain more events than available right now. Otherwise it's
+ * plausible that one backend could get stuck, for a while, receiving CQEs
+ * without actually processing them.
+ */
+ ready = io_uring_cq_ready(&context->io_uring_ring);
+
+ while (ready > 0)
+ {
+ struct io_uring_cqe *reaped_cqes[PGAIO_MAX_LOCAL_REAPED];
+ uint32 processed_one;
+
+ START_CRIT_SECTION();
+
+ ereport(DEBUG3,
+ errmsg("context [%zu]: drain %d ready",
+ context - aio_ctl->contexts,
+ ready),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ processed_one =
+ io_uring_peek_batch_cqe(&context->io_uring_ring,
+ reaped_cqes,
+ Min(PGAIO_MAX_LOCAL_REAPED, ready));
+ Assert(processed_one <= ready);
+
+ ready -= processed_one;
+ processed += processed_one;
+
+ for (int i = 0; i < processed_one; i++)
+ {
+ struct io_uring_cqe *cqe = reaped_cqes[i];
+
+ pgaio_uring_io_from_cqe(context, cqe);
+
+ io_uring_cqe_seen(&context->io_uring_ring, cqe);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ if (context->reaped_iovecs_count > context->unused_iovecs_count &&
+ LWLockConditionalAcquire(&context->submission_lock, LW_EXCLUSIVE))
+ {
+ ereport(DEBUG4,
+ errmsg("plenty reaped iovecs (%d), transferring",
+ context->reaped_iovecs_count),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ pgaio_uring_iovec_transfer(context);
+ LWLockRelease(&context->submission_lock);
+ }
+
+ return processed;
+}
+
+static int
+pgaio_uring_drain(PgAioContext *context)
+{
+ uint32 processed = 0;
+
+ Assert(!LWLockHeldByMe(&context->completion_lock));
+
+ while (io_uring_cq_ready(&context->io_uring_ring))
+ {
+ if (LWLockAcquireOrWait(&context->completion_lock, LW_EXCLUSIVE))
+ {
+ processed = pgaio_uring_drain_locked(context);
+
+ /*
+ * Call shared callbacks under lock - that avoids others having to
+ * first wait on the completion_lock for pgaio_uring_wait_one, then
+ * again in pgaio_drain(), and then again on the condition
+ * variable for the AIO.
+ */
+ if (processed > 0)
+ pgaio_uncombine();
+
+ pgaio_complete_ios(false);
+
+ LWLockRelease(&context->completion_lock);
+ break;
+ }
+ }
+
+ return processed;
+}
+
+static void
+pgaio_uring_wait_one(PgAioContext *context, PgAioInProgress *io, uint64 ref_generation, uint32 wait_event_info)
+{
+ PgAioIPFlags flags;
+
+ if (LWLockAcquireOrWait(&context->completion_lock, LW_EXCLUSIVE))
+ {
+ ereport(DEBUG3,
+ errmsg("[%d] got the lock, before waiting for %zu/%llu, %d ready",
+ (int)(context - aio_ctl->contexts),
+ io - aio_ctl->in_progress_io,
+ (long long unsigned) io->generation,
+ io_uring_cq_ready(&context->io_uring_ring)),
+ errhidestmt(true),
+ errhidecontext(true));
+
+ flags = *(volatile PgAioIPFlags*) &io->flags;
+
+ pg_read_barrier();
+
+ if (!io_uring_cq_ready(&context->io_uring_ring) &&
+ io->generation == ref_generation &&
+ (flags & PGAIOIP_INFLIGHT))
+ {
+ int ret;
+
+ /*
+ * XXX: Temporary, non-assert, sanity checks, some of these are
+ * hard to hit in assert builds and lead to rare and hard to debug
+ * hangs.
+ */
+ if (io->generation != ref_generation ||
+ &aio_ctl->contexts[io->ring] != context)
+ {
+ ereport(PANIC,
+ errmsg("generation changed while locked, after wait: %llu, real: %llu",
+ (long long unsigned) ref_generation,
+ (long long unsigned) io->generation),
+ errhidestmt(true),
+ errhidecontext(true));
+ }
+
+ pgstat_report_wait_start(wait_event_info);
+ ret = __sys_io_uring_enter(context->io_uring_ring.ring_fd,
+ 0, 1,
+ IORING_ENTER_GETEVENTS, NULL);
+ pgstat_report_wait_end();
+
+ if (ret < 0 && errno == EINTR)
+ {
+ elog(DEBUG3, "got interrupted");
+ }
+ else if (ret != 0)
+ elog(WARNING, "unexpected: %d/%s: %m", ret, strerror(-ret));
+
+ /* see XXX above */
+ if (io->generation != ref_generation ||
+ &aio_ctl->contexts[io->ring] != context)
+ {
+ ereport(PANIC,
+ errmsg("generation changed while locked, after wait: %llu, real: %llu",
+ (long long unsigned) ref_generation,
+ (long long unsigned) io->generation),
+ errhidestmt(true),
+ errhidecontext(true));
+ }
+
+ ereport(DEBUG3,
+ errmsg("[%d] after waiting for %zu, %d ready",
+ (int)(context - aio_ctl->contexts),
+ io - aio_ctl->in_progress_io,
+ io_uring_cq_ready(&context->io_uring_ring)),
+ errhidestmt(true),
+ errhidecontext(true));
+ }
+ else
+ {
+ ereport(DEBUG3,
+ errmsg("[%d] got ready under lock, without wait %zu, %d ready",
+ (int)(context - aio_ctl->contexts),
+ io - aio_ctl->in_progress_io,
+ io_uring_cq_ready(&context->io_uring_ring)),
+ errhidestmt(true),
+ errhidecontext(true));
+ }
+
+ /*
+ * Drain and call shared callbacks while we already hold the lock -
+ * that avoids others having to first wait on the completion_lock for
+ * pgaio_uring_wait_one, then again in pgaio_drain(), and then again
+ * on the condition variable for the AIO.
+ */
+ if (io_uring_cq_ready(&context->io_uring_ring))
+ {
+ pgaio_uring_drain_locked(context);
+ pgaio_uncombine();
+ pgaio_complete_ios(false);
+ }
+
+ LWLockRelease(&context->completion_lock);
+ }
+ else
+ {
+ ereport(DEBUG3,
+ errmsg("[%d] somebody else got the lock while waiting for %zu/%lu, %d ready",
+ (int)(context - aio_ctl->contexts),
+ io - aio_ctl->in_progress_io, io->generation,
+ io_uring_cq_ready(&context->io_uring_ring)),
+ errhidestmt(true),
+ errhidecontext(true));
+ }
+}
+
+static void
+pgaio_uring_io_from_cqe(PgAioContext *context, struct io_uring_cqe *cqe)
+{
+ PgAioInProgress *io;
+
+ io = io_uring_cqe_get_data(cqe);
+ Assert(io != NULL);
+ Assert(io->flags & PGAIOIP_INFLIGHT);
+ Assert(io->system_referenced);
+
+ *(volatile PgAioIPFlags*) &io->flags = (io->flags & ~PGAIOIP_INFLIGHT) | PGAIOIP_REAPED;
+ io->result = cqe->res;
+
+ dlist_push_tail(&my_aio->reaped, &io->io_node);
+
+ if (io->used_iovec != -1)
+ {
+ PgAioIovec *iovec = &context->iovecs[io->used_iovec];
+
+ slist_push_head(&context->reaped_iovecs, &iovec->node);
+ context->reaped_iovecs_count++;
+ }
+
+ /*
+ * FIXME: needs to be removed at some point, this is effectively a
+ * critical section.
+ */
+ if (cqe->res < 0)
+ {
+ elog(WARNING, "cqe: u: %p s: %d/%s f: %u",
+ io_uring_cqe_get_data(cqe),
+ cqe->res,
+ cqe->res < 0 ? strerror(-cqe->res) : "",
+ cqe->flags);
+ }
+}
+
+/*
+ * FIXME: These need to be deduplicated.
+ */
+static void
+prep_read_buffer_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+ PgAioInProgress *cur;
+ uint32 offset = io->d.read_buffer.offset;
+ int niov = 0;
+
+ cur = io;
+ while (cur)
+ {
+ offset += cur->d.read_buffer.already_done;
+ iovs[niov].iov_base = cur->d.read_buffer.bufdata + cur->d.read_buffer.already_done;
+ iovs[niov].iov_len = cur->d.read_buffer.nbytes - cur->d.read_buffer.already_done;
+
+ niov++;
+ cur = cur->merge_with;
+ }
+
+ io_uring_prep_readv(sqe,
+ io->d.read_buffer.fd,
+ iovs,
+ niov,
+ offset);
+}
+
+static void
+prep_write_buffer_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+ PgAioInProgress *cur;
+ uint32 offset = io->d.write_buffer.offset;
+ int niov = 0;
+
+ cur = io;
+ while (cur)
+ {
+ offset += cur->d.write_buffer.already_done;
+ iovs[niov].iov_base = cur->d.write_buffer.bufdata + cur->d.write_buffer.already_done;
+ iovs[niov].iov_len = cur->d.write_buffer.nbytes - cur->d.write_buffer.already_done;
+
+ niov++;
+ cur = cur->merge_with;
+ }
+
+ io_uring_prep_writev(sqe,
+ io->d.write_buffer.fd,
+ iovs,
+ niov,
+ offset);
+}
+
+static void
+prep_write_wal_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+ PgAioInProgress *cur;
+ uint32 offset = io->d.write_wal.offset;
+ int niov = 0;
+
+ cur = io;
+ while (cur)
+ {
+ offset += cur->d.write_wal.already_done;
+ iovs[niov].iov_base = cur->d.write_wal.bufdata + cur->d.write_wal.already_done;
+ iovs[niov].iov_len = cur->d.write_wal.nbytes - cur->d.write_wal.already_done;
+
+ niov++;
+ cur = cur->merge_with;
+ }
+
+ io_uring_prep_writev(sqe,
+ io->d.write_wal.fd,
+ iovs,
+ niov,
+ offset);
+}
+
+static void
+prep_write_generic_iov(PgAioInProgress *io, struct io_uring_sqe *sqe, struct iovec *iovs)
+{
+ PgAioInProgress *cur;
+ off_t offset = io->d.write_generic.offset;
+ int niov = 0;
+
+ cur = io;
+ while (cur)
+ {
+ offset += cur->d.write_generic.already_done;
+ iovs[niov].iov_base = cur->d.write_generic.bufdata + cur->d.write_generic.already_done;
+ iovs[niov].iov_len = cur->d.write_generic.nbytes - cur->d.write_generic.already_done;
+
+ niov++;
+ cur = cur->merge_with;
+ }
+
+ io_uring_prep_writev(sqe,
+ io->d.write_generic.fd,
+ iovs,
+ niov,
+ offset);
+}
+
+static void
+pgaio_uring_iovec_transfer(PgAioContext *context)
+{
+ Assert(LWLockHeldByMe(&context->submission_lock));
+ Assert(LWLockHeldByMe(&context->completion_lock));
+
+ while (!slist_is_empty(&context->reaped_iovecs))
+ {
+ slist_push_head(&context->unused_iovecs, slist_pop_head_node(&context->reaped_iovecs));
+ }
+
+ context->unused_iovecs_count += context->reaped_iovecs_count;
+ context->reaped_iovecs_count = 0;
+}
+
+static PgAioIovec *
+pgaio_uring_iovec_get(PgAioContext *context, PgAioInProgress *io)
+{
+ slist_node *node;
+ PgAioIovec *iov;
+
+ if (context->unused_iovecs_count == 0)
+ {
+ ereport(DEBUG2,
+ errmsg("out of unused iovecs, transferring %d reaped ones",
+ context->reaped_iovecs_count),
+ errhidestmt(true),
+ errhidecontext(true));
+ LWLockAcquire(&context->completion_lock, LW_EXCLUSIVE);
+ Assert(context->reaped_iovecs_count > 0);
+ pgaio_uring_iovec_transfer(context);
+ LWLockRelease(&context->completion_lock);
+ Assert(context->unused_iovecs_count > 0);
+ }
+
+ context->unused_iovecs_count--;
+ node = slist_pop_head_node(&context->unused_iovecs);
+ iov = slist_container(PgAioIovec, node, node);
+
+ io->used_iovec = iov - context->iovecs;
+
+ return iov;
+}
+
+static void
+pgaio_uring_sq_from_io(PgAioContext *context, PgAioInProgress *io, struct io_uring_sqe *sqe)
+{
+ PgAioIovec *iovec;
+
+ io->used_iovec = -1;
+
+ switch (io->type)
+ {
+ case PGAIO_FSYNC:
+ io_uring_prep_fsync(sqe,
+ io->d.fsync.fd,
+ io->d.fsync.datasync ? IORING_FSYNC_DATASYNC : 0);
+ if (io->d.fsync.barrier)
+ sqe->flags |= IOSQE_IO_DRAIN;
+ break;
+
+ case PGAIO_FSYNC_WAL:
+ io_uring_prep_fsync(sqe,
+ io->d.fsync_wal.fd,
+ io->d.fsync_wal.datasync ? IORING_FSYNC_DATASYNC : 0);
+ if (io->d.fsync_wal.barrier)
+ sqe->flags |= IOSQE_IO_DRAIN;
+ break;
+
+ case PGAIO_READ_BUFFER:
+ iovec = pgaio_uring_iovec_get(context, io);
+ prep_read_buffer_iov(io, sqe, iovec->iovec);
+ //sqe->flags |= IOSQE_ASYNC;
+ break;
+
+ case PGAIO_WRITE_BUFFER:
+ iovec = pgaio_uring_iovec_get(context, io);
+ prep_write_buffer_iov(io, sqe, iovec->iovec);
+ break;
+
+ case PGAIO_FLUSH_RANGE:
+ io_uring_prep_rw(IORING_OP_SYNC_FILE_RANGE,
+ sqe,
+ io->d.flush_range.fd,
+ NULL,
+ io->d.flush_range.nbytes,
+ io->d.flush_range.offset);
+ sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
+ break;
+
+ case PGAIO_WRITE_WAL:
+ iovec = pgaio_uring_iovec_get(context, io);
+ prep_write_wal_iov(io, sqe, iovec->iovec);
+ if (io->d.write_wal.no_reorder)
+ sqe->flags = IOSQE_IO_DRAIN;
+ break;
+
+ case PGAIO_WRITE_GENERIC:
+ iovec = pgaio_uring_iovec_get(context, io);
+ prep_write_generic_iov(io, sqe, iovec->iovec);
+ if (io->d.write_generic.no_reorder)
+ sqe->flags = IOSQE_IO_DRAIN;
+ break;
+
+ case PGAIO_NOP:
+ elog(ERROR, "not yet");
+ break;
+
+ case PGAIO_INVALID:
+ elog(ERROR, "invalid");
+ }
+
+ io_uring_sqe_set_data(sqe, io);
+}
+
+static int
+__sys_io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
+ unsigned flags, sigset_t *sig)
+{
+
+# ifndef __NR_io_uring_enter
+# define __NR_io_uring_enter 426
+# endif
+
+ return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
+ flags, sig, _NSIG / 8);
+}
+
+
+
+/* --------------------------------------------------------------------------------
+ * Code dealing with specific IO types
+ * --------------------------------------------------------------------------------
+ */
+
+void
+pgaio_io_start_flush_range(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes)
+{
+ pgaio_prepare_io(io, PGAIO_FLUSH_RANGE);
+
+ io->d.flush_range.fd = fd;
+ io->d.flush_range.offset = offset;
+ io->d.flush_range.nbytes = nbytes;
+
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "start_flush_range %zu: %d, %llu, %llu",
+ io - aio_ctl->in_progress_io,
+ fd, (unsigned long long) offset, (unsigned long long) nbytes);
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_read_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes, char *bufdata, int buffno, int mode)
+{
+ Assert(ShmemAddrIsValid(bufdata));
+
+ pgaio_prepare_io(io, PGAIO_READ_BUFFER);
+
+ io->d.read_buffer.buf = buffno;
+ io->d.read_buffer.mode = mode;
+ io->d.read_buffer.fd = fd;
+ io->d.read_buffer.offset = offset;
+ io->d.read_buffer.nbytes = nbytes;
+ io->d.read_buffer.bufdata = bufdata;
+ io->d.read_buffer.already_done = 0;
+ memcpy(&io->d.read_buffer.tag, tag, sizeof(io->d.read_buffer.tag));
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("start_read_buffer %zu: "
+ "fd %d, off: %llu, bytes: %llu, buff: %d, data %p",
+ io - aio_ctl->in_progress_io,
+ fd,
+ (unsigned long long) offset,
+ (unsigned long long) nbytes,
+ buffno,
+ bufdata),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes, char *bufdata, int buffno, bool release_lock)
+{
+ Assert(ShmemAddrIsValid(bufdata));
+
+ pgaio_prepare_io(io, PGAIO_WRITE_BUFFER);
+
+ io->d.write_buffer.buf = buffno;
+ io->d.write_buffer.fd = fd;
+ io->d.write_buffer.offset = offset;
+ io->d.write_buffer.nbytes = nbytes;
+ io->d.write_buffer.bufdata = bufdata;
+ io->d.write_buffer.already_done = 0;
+ io->d.write_buffer.release_lock = release_lock;
+ memcpy(&io->d.write_buffer.tag, tag, sizeof(io->d.write_buffer.tag));
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("start_write_buffer %zu: "
+ "fd %d, off: %llu, nbytes: %llu, rel: %d, buff: %d, data %p",
+ io - aio_ctl->in_progress_io,
+ fd,
+ (unsigned long long) offset,
+ (unsigned long long) nbytes,
+ release_lock,
+ buffno,
+ bufdata),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_wal(PgAioInProgress *io, int fd, uint32 offset, uint32 nbytes, char *bufdata, bool no_reorder, uint32 write_no)
+{
+ Assert(ShmemAddrIsValid(bufdata));
+
+ pgaio_prepare_io(io, PGAIO_WRITE_WAL);
+
+ io->d.write_wal.fd = fd;
+ io->d.write_wal.no_reorder = no_reorder;
+ io->d.write_wal.offset = offset;
+ io->d.write_wal.nbytes = nbytes;
+ io->d.write_wal.bufdata = bufdata;
+ io->d.write_wal.already_done = 0;
+ io->d.write_wal.write_no = write_no;
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("start_write_wal %zu:"
+ "fd %d, off: %llu, bytes: %llu, write_no: %d, no_reorder: %d, data %p",
+ io - aio_ctl->in_progress_io,
+ fd,
+ (unsigned long long) offset,
+ (unsigned long long) nbytes,
+ write_no,
+ no_reorder,
+ bufdata),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_write_generic(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes, char *bufdata, bool no_reorder)
+{
+ Assert(ShmemAddrIsValid(bufdata));
+
+ pgaio_prepare_io(io, PGAIO_WRITE_GENERIC);
+
+ io->d.write_generic.fd = fd;
+ io->d.write_generic.no_reorder = no_reorder;
+ io->d.write_generic.offset = offset;
+ io->d.write_generic.nbytes = nbytes;
+ io->d.write_generic.bufdata = bufdata;
+ io->d.write_generic.already_done = 0;
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("start_write_generic %zu:"
+ "fd %d, off: %llu, bytes: %llu, no_reorder: %d, data %p",
+ io - aio_ctl->in_progress_io,
+ fd,
+ (unsigned long long) offset,
+ (unsigned long long) nbytes,
+ no_reorder,
+ bufdata),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_nop(PgAioInProgress *io)
+{
+ pgaio_prepare_io(io, PGAIO_NOP);
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fsync(PgAioInProgress *io, int fd, bool barrier)
+{
+ pgaio_prepare_io(io, PGAIO_FSYNC);
+ io->d.fsync.fd = fd;
+ io->d.fsync.barrier = barrier;
+ io->d.fsync.datasync = false;
+
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "start_fsync %zu:"
+ "fd %d, is_barrier: %d, is_datasync: %d",
+ io - aio_ctl->in_progress_io,
+ fd,
+ barrier,
+ false);
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fdatasync(PgAioInProgress *io, int fd, bool barrier)
+{
+ pgaio_prepare_io(io, PGAIO_FSYNC);
+ io->d.fsync.fd = fd;
+ io->d.fsync.barrier = barrier;
+ io->d.fsync.datasync = true;
+
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "start_fsync %zu:"
+ "fd %d, is_barrier: %d, is_datasync: %d",
+ io - aio_ctl->in_progress_io,
+ fd,
+ barrier,
+ true);
+#endif
+
+ pgaio_finish_io(io);
+}
+
+void
+pgaio_io_start_fsync_wal(PgAioInProgress *io, int fd, bool barrier, bool datasync_only, uint32 flush_no)
+{
+ pgaio_prepare_io(io, PGAIO_FSYNC_WAL);
+ io->d.fsync_wal.fd = fd;
+ io->d.fsync_wal.barrier = barrier;
+ io->d.fsync_wal.datasync = datasync_only;
+ io->d.fsync_wal.flush_no = flush_no;
+
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "start_fsync_wal %zu:"
+ "fd %d, is_barrier: %d, is_datasync: %d, flush_no: %d",
+ io - aio_ctl->in_progress_io,
+ fd,
+ barrier,
+ datasync_only,
+ flush_no);
+#endif
+
+ pgaio_finish_io(io);
+}
+
+static bool
+pgaio_complete_nop(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "completed nop");
+#endif
+
+ return true;
+}
+
+static bool
+pgaio_complete_fsync(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "completed fsync: %zu",
+ io - aio_ctl->in_progress_io);
+#endif
+ if (io->result != 0)
+ elog(PANIC, "fsync needs better error handling");
+
+ return true;
+}
+
+static bool
+pgaio_complete_fsync_wal(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "completed fsync_wal: %zu",
+ io - aio_ctl->in_progress_io);
+#endif
+ if (io->result != 0)
+ elog(PANIC, "fsync_wal needs better error handling");
+
+ XLogFlushComplete(io, io->d.fsync_wal.flush_no);
+ return true;
+}
+
+static bool
+pgaio_complete_flush_range(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ elog(DEBUG3, "completed flush_range: %zu, %s",
+ io - aio_ctl->in_progress_io,
+ io->result < 0 ? strerror(-io->result) : "ok");
+#endif
+
+ return true;
+}
+
+static bool
+pgaio_complete_read_buffer(PgAioInProgress *io)
+{
+ Buffer buffer = io->d.read_buffer.buf;
+
+ bool call_completion;
+ bool failed;
+ bool done;
+
+ /* great for torturing error handling */
+#if 0
+ if (io->result > 4096)
+ io->result = 4096;
+#endif
+
+#ifdef PGAIO_VERBOSE
+ ereport(io->flags & PGAIOIP_RETRY ? DEBUG1 : DEBUG3,
+ errmsg("completed read_buffer: %zu, %d/%s, buf %d",
+ io - aio_ctl->in_progress_io,
+ io->result,
+ io->result < 0 ? strerror(-io->result) : "ok",
+ io->d.read_buffer.buf),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ if (io->result != (io->d.read_buffer.nbytes - io->d.read_buffer.already_done))
+ {
+ MemoryContext old_context = MemoryContextSwitchTo(aio_retry_context);
+
+ failed = true;
+
+ //pgaio_io_print(io, NULL);
+
+ if (io->result < 0)
+ {
+ if (io->result == -EAGAIN || io->result == -EINTR)
+ {
+ elog(PANIC, "need to implement retries for failed requests");
+ }
+ else
+ {
+ ereport(WARNING,
+ errcode_for_file_access(),
+ errmsg("could not read block %u in file \"%s\": %s",
+ io->d.read_buffer.tag.blockNum,
+ relpath(io->d.read_buffer.tag.rnode,
+ io->d.read_buffer.tag.forkNum),
+ strerror(-io->result)));
+ }
+
+ call_completion = true;
+ done = true;
+ }
+ else
+ {
+ io->flags |= PGAIOIP_SOFT_FAILURE;
+ call_completion = false;
+ done = false;
+ io->d.read_buffer.already_done += io->result;
+
+ /*
+ * This is actually pretty common and harmless; it happens when part
+ * of the block is in the kernel page cache but the rest isn't. So
+ * don't issue a WARNING/ERROR, just retry.
+ *
+ * While this can happen even for single BLCKSZ reads (since BLCKSZ
+ * is larger than the typical OS page size), combining reads makes it
+ * much more likely.
+ *
+ * XXX: Should we handle repeated failures for the same blocks
+ * differently?
+ */
+ ereport(DEBUG1,
+ errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("aio %zd: could not read block %u in file \"%s\": read only %d of %d bytes (init: %d, cur: %d)",
+ io - aio_ctl->in_progress_io,
+ io->d.read_buffer.tag.blockNum,
+ relpath(io->d.read_buffer.tag.rnode,
+ io->d.read_buffer.tag.forkNum),
+ io->result, BLCKSZ,
+ io->owner_id, MyProc ? MyProc->pgprocno : INVALID_PGPROCNO));
+ }
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextReset(aio_retry_context);
+ }
+ else
+ {
+ io->d.read_buffer.already_done += io->result;
+ Assert(io->d.read_buffer.already_done == BLCKSZ);
+
+ call_completion = true;
+ failed = false;
+ done = true;
+ }
+
+ if (call_completion)
+ ReadBufferCompleteRead(buffer,
+ &io->d.read_buffer.tag,
+ io->d.read_buffer.bufdata,
+ io->d.read_buffer.mode,
+ failed);
+
+ return done;
+}
+
+static bool
+pgaio_complete_write_buffer(PgAioInProgress *io)
+{
+ Buffer buffer = io->d.write_buffer.buf;
+
+ bool call_completion;
+ bool failed;
+ bool done;
+
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("completed write_buffer: %zu, %d/%s, buf %d",
+ io - aio_ctl->in_progress_io,
+ io->result,
+ io->result < 0 ? strerror(-io->result) : "ok",
+ io->d.write_buffer.buf),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ if (io->result != (io->d.write_buffer.nbytes - io->d.write_buffer.already_done))
+ {
+ MemoryContext old_context = MemoryContextSwitchTo(aio_retry_context);
+
+ failed = true;
+
+ if (io->result < 0)
+ {
+ int elevel;
+
+ failed = true;
+
+ if (io->result == -EAGAIN || io->result == -EINTR)
+ {
+ io->flags |= PGAIOIP_SOFT_FAILURE;
+
+ call_completion = false;
+ done = false;
+
+ elevel = DEBUG1;
+ }
+ else
+ {
+ io->flags |= PGAIOIP_HARD_FAILURE;
+ elevel = WARNING;
+
+ call_completion = true;
+ done = true;
+
+ pgaio_io_print(io, NULL);
+ }
+
+ ereport(elevel,
+ errcode_for_file_access(),
+ errmsg("aio %zd: could not write block %u in file \"%s\": %s",
+ io - aio_ctl->in_progress_io,
+ io->d.write_buffer.tag.blockNum,
+ relpath(io->d.write_buffer.tag.rnode,
+ io->d.write_buffer.tag.forkNum),
+ strerror(-io->result)),
+ errhint("Check free disk space."));
+ }
+ else
+ {
+ io->flags |= PGAIOIP_SOFT_FAILURE;
+ io->d.write_buffer.already_done += io->result;
+
+ call_completion = false;
+ done = false;
+
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("aio %zd: could not write block %u in file \"%s\": wrote only %d of %d bytes (init: %d, cur: %d)",
+ io - aio_ctl->in_progress_io,
+ io->d.write_buffer.tag.blockNum,
+ relpath(io->d.write_buffer.tag.rnode,
+ io->d.write_buffer.tag.forkNum),
+ io->result, (io->d.write_buffer.nbytes - io->d.write_buffer.already_done),
+ io->owner_id, MyProc ? MyProc->pgprocno : INVALID_PGPROCNO)));
+ }
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextReset(aio_retry_context);
+ }
+ else
+ {
+ io->d.write_buffer.already_done += io->result;
+ Assert(io->d.write_buffer.already_done == BLCKSZ);
+
+ call_completion = true;
+ failed = false;
+ done = true;
+ }
+
+ if (call_completion && BufferIsValid(buffer))
+ ReadBufferCompleteWrite(buffer, failed, io->d.write_buffer.release_lock);
+
+ return done;
+}
+
+static bool
+pgaio_complete_write_wal(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("completed write_wal: %zu, %d/%s",
+ io - aio_ctl->in_progress_io,
+ io->result,
+ io->result < 0 ? strerror(-io->result) : "ok"),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ if (io->result < 0)
+ {
+ if (io->result == -EAGAIN || io->result == -EINTR)
+ {
+ elog(WARNING, "need to implement retries");
+ }
+
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file: %s",
+ strerror(-io->result))));
+ }
+ else if (io->result != (io->d.write_wal.nbytes - io->d.write_wal.already_done))
+ {
+ /* FIXME: implement retries for short writes */
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file: wrote only %d of %d bytes",
+ io->result, (io->d.write_wal.nbytes - io->d.write_wal.already_done))));
+ }
+
+ XLogWriteComplete(io, io->d.write_wal.write_no);
+
+ return true;
+}
+
+
+static bool
+pgaio_complete_write_generic(PgAioInProgress *io)
+{
+#ifdef PGAIO_VERBOSE
+ ereport(DEBUG3,
+ errmsg("completed write_generic: %zu, %d/%s",
+ io - aio_ctl->in_progress_io,
+ io->result,
+ io->result < 0 ? strerror(-io->result) : "ok"),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ if (io->result < 0)
+ {
+ if (io->result == -EAGAIN || io->result == -EINTR)
+ {
+ elog(WARNING, "need to implement retries");
+ }
+
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file: %s",
+ strerror(-io->result))));
+ }
+ else if (io->result != (io->d.write_generic.nbytes - io->d.write_generic.already_done))
+ {
+ /* FIXME: implement retries for short writes */
+ /* FIXME: not WAL */
+ ereport(PANIC,
+ (errcode_for_file_access(),
+ errmsg("could not write to log file: wrote only %d of %d bytes",
+ io->result, (io->d.write_generic.nbytes - io->d.write_generic.already_done))));
+ }
+
+ return true;
+}
+
+
+/* --------------------------------------------------------------------------------
+ * SQL interface functions
+ * --------------------------------------------------------------------------------
+ */
+
+Datum
+pg_stat_get_aio_backends(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_AIO_BACKEND_COLS 13
+
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ for (int i = 0; i < aio_ctl->backend_state_count; i++)
+ {
+ PgAioPerBackend *bs = &aio_ctl->backend_state[i];
+ Datum values[PG_STAT_GET_AIO_BACKEND_COLS];
+ bool nulls[PG_STAT_GET_AIO_BACKEND_COLS];
+ int pid = ProcGlobal->allProcs[i].pid;
+
+ if (pid == 0)
+ continue;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[ 0] = Int32GetDatum(pid);
+ values[ 1] = Int64GetDatum(bs->executed_total_count);
+ values[ 2] = Int64GetDatum(bs->issued_total_count);
+ values[ 3] = Int64GetDatum(bs->submissions_total_count);
+ values[ 4] = Int64GetDatum(bs->foreign_completed_total_count);
+ values[ 5] = Int64GetDatum(bs->retry_total_count);
+ values[ 6] = Int64GetDatum(pg_atomic_read_u32(&bs->inflight_count));
+ values[ 7] = Int32GetDatum(bs->unused_count);
+ values[ 8] = Int32GetDatum(bs->outstanding_count);
+ values[ 9] = Int32GetDatum(bs->pending_count);
+ values[10] = Int32GetDatum(bs->local_completed_count);
+ values[11] = Int32GetDatum(bs->foreign_completed_count);
+ values[12] = Int32GetDatum(bs->last_context);
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
+
+Datum
+pg_stat_get_aios(PG_FUNCTION_ARGS)
+{
+#define PG_STAT_GET_AIOS_COLS 8
+
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ StringInfoData tmps;
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ initStringInfo(&tmps);
+
+ for (int i = 0; i < max_aio_in_progress; i++)
+ {
+ PgAioInProgress *io = &aio_ctl->in_progress_io[i];
+ Datum values[PG_STAT_GET_AIOS_COLS];
+ bool nulls[PG_STAT_GET_AIOS_COLS];
+ int owner_pid;
+ uint32 owner_id;
+ PgAioIPFlags flags = io->flags;
+
+ if (flags & PGAIOIP_UNUSED)
+ continue;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[ 0] = Int32GetDatum(i);
+ values[ 1] = PointerGetDatum(cstring_to_text(pgaio_io_action_string(io->type)));
+
+ pgaio_io_flag_string(flags, &tmps);
+ values[ 2] = PointerGetDatum(cstring_to_text(tmps.data));
+ resetStringInfo(&tmps);
+
+ values[ 3] = Int32GetDatum(io->ring);
+
+ owner_id = io->owner_id;	/* XXX: READ_ONCE needed? */
+ if (owner_id != INVALID_PGPROCNO)
+ {
+ owner_pid = ProcGlobal->allProcs[owner_id].pid;
+ values[ 4] = Int32GetDatum(owner_pid);
+ }
+ else
+ {
+ nulls[ 4] = true;
+ }
+
+ values[ 5] = Int64GetDatum(io->generation);
+
+ values[ 6] = Int32GetDatum(io->result);
+
+ pgaio_io_action_desc(io, &tmps);
+ values[ 7] = PointerGetDatum(cstring_to_text(tmps.data));
+ resetStringInfo(&tmps);
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
--- /dev/null
+#include "postgres.h"
+
+#include "storage/aio.h"
+#include "miscadmin.h"
+
+/* typedef is in header */
+typedef struct PgStreamingWriteItem
+{
+ /* membership in PgStreamingWrite->issued, PgStreamingWrite->available */
+ dlist_node node;
+
+ PgAioInProgress *aio;
+ bool in_progress;
+ void *private_data;
+
+ struct pg_streaming_write *pgsw;
+
+ PgAioOnCompletionLocalContext on_completion;
+} PgStreamingWriteItem;
+
+/* typedef is in header */
+struct pg_streaming_write
+{
+ uint32 iodepth;
+
+ uint32 inflight_count;
+
+ void *private_data;
+
+ pg_streaming_write_completed on_completion;
+
+ /* submitted writes */
+ dlist_head issued;
+
+ /* available writes (unused or completed) */
+ dlist_head available;
+
+ PgStreamingWriteItem all_items[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static void pg_streaming_write_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+
+pg_streaming_write*
+pg_streaming_write_alloc(uint32 iodepth, void *private_data, pg_streaming_write_completed on_completion)
+{
+ pg_streaming_write *pgsw;
+
+ iodepth = Max(Min(iodepth, NBuffers / 128), 1);
+
+ pgsw = palloc0(offsetof(pg_streaming_write, all_items)
+ + sizeof(PgStreamingWriteItem) * iodepth);
+
+ pgsw->iodepth = iodepth;
+ pgsw->private_data = private_data;
+ pgsw->on_completion = on_completion;
+
+ dlist_init(&pgsw->available);
+ dlist_init(&pgsw->issued);
+
+ for (int i = 0; i < pgsw->iodepth; i++)
+ {
+ PgStreamingWriteItem *this_write = &pgsw->all_items[i];
+
+ this_write->on_completion.callback = pg_streaming_write_complete;
+ this_write->pgsw = pgsw;
+ dlist_push_tail(&pgsw->available, &this_write->node);
+ }
+
+ return pgsw;
+}
+
+PgAioInProgress *
+pg_streaming_write_get_io(pg_streaming_write *pgsw)
+{
+ PgStreamingWriteItem *this_write;
+
+ /* loop in case the callback issues further writes */
+ while (dlist_is_empty(&pgsw->available))
+ {
+ Assert(!dlist_is_empty(&pgsw->issued));
+
+ this_write = dlist_head_element(PgStreamingWriteItem, node, &pgsw->issued);
+ Assert(this_write->in_progress);
+
+ pgaio_io_wait(this_write->aio);
+ Assert(!this_write->in_progress);
+ }
+
+ this_write = dlist_head_element(PgStreamingWriteItem, node, &pgsw->available);
+
+ Assert(!this_write->in_progress);
+
+ if (!this_write->aio)
+ {
+ this_write->aio = pgaio_io_get();
+ }
+
+ return this_write->aio;
+}
+
+uint32
+pg_streaming_write_inflight(pg_streaming_write *pgsw)
+{
+ return pgsw->inflight_count;
+}
+
+void
+pg_streaming_write_write(pg_streaming_write *pgsw, PgAioInProgress *io, void *private_data)
+{
+ PgStreamingWriteItem *this_write;
+
+ Assert(!dlist_is_empty(&pgsw->available));
+
+ this_write = dlist_container(PgStreamingWriteItem, node, dlist_pop_head_node(&pgsw->available));
+
+ Assert(!this_write->in_progress);
+ Assert(this_write->aio && this_write->aio == io);
+
+ pgaio_io_on_completion_local(this_write->aio, &this_write->on_completion);
+
+ this_write->private_data = private_data;
+ this_write->in_progress = true;
+
+ dlist_push_tail(&pgsw->issued, &this_write->node);
+ pgsw->inflight_count++;
+
+ /*
+ * XXX: It'd make sense to trigger io submission more often.
+ */
+
+ ereport(DEBUG3, errmsg("submit %zu, %d deep", this_write - pgsw->all_items, pgsw->inflight_count),
+ errhidestmt(true),
+ errhidecontext(true));
+}
+
+static void
+pg_streaming_write_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
+{
+ PgStreamingWriteItem *this_write = pgaio_ocb_container(PgStreamingWriteItem, on_completion, ocb);
+ pg_streaming_write *pgsw = this_write->pgsw;
+ void *private_data;
+
+ Assert(this_write->in_progress);
+ Assert(pgaio_io_done(io));
+ Assert(pgaio_io_success(io));
+
+ dlist_delete_from(&pgsw->issued, &this_write->node);
+ pgsw->inflight_count--;
+
+ private_data = this_write->private_data;
+ this_write->private_data = NULL;
+ this_write->in_progress = false;
+ pgaio_io_recycle(this_write->aio);
+
+ dlist_push_tail(&pgsw->available, &this_write->node);
+
+ /* call callback after all other handling so it can issue IO */
+ pgsw->on_completion(pgsw->private_data, private_data);
+
+ ereport(DEBUG3, errmsg("complete %zu", this_write - pgsw->all_items),
+ errhidestmt(true),
+ errhidecontext(true));
+}
+
+void
+pg_streaming_write_wait_all(pg_streaming_write *pgsw)
+{
+ while (!dlist_is_empty(&pgsw->issued))
+ {
+ PgStreamingWriteItem *this_write =
+ dlist_head_element(PgStreamingWriteItem, node, &pgsw->issued);
+
+ if (!this_write->in_progress)
+ continue;
+ pgaio_io_wait(this_write->aio);
+ Assert(!this_write->in_progress);
+ }
+
+ Assert(pgsw->inflight_count == 0);
+}
+
+void
+pg_streaming_write_free(pg_streaming_write *pgsw)
+{
+ for (uint32 off = 0; off < pgsw->iodepth; off++)
+ {
+ PgStreamingWriteItem *this_write = &pgsw->all_items[off];
+
+ Assert(!this_write->in_progress);
+ if (this_write->aio)
+ pgaio_io_release(this_write->aio);
+ this_write->aio = NULL;
+ }
+
+ pfree(pgsw);
+}
+
+typedef struct PgStreamingReadItem
+{
+ /* membership in PgStreamingRead->issued, PgStreamingRead->available */
+ dlist_node node;
+ /* membership in PgStreamingRead->in_order */
+ dlist_node sequence_node;
+
+ PgAioOnCompletionLocalContext on_completion;
+ PgStreamingRead *pgsr;
+ PgAioInProgress *aio;
+
+ /* is this IO still considered to be in progress */
+ bool in_progress;
+ /* is this item currently valid / used */
+ bool valid;
+ uintptr_t read_private;
+} PgStreamingReadItem;
+
+struct PgStreamingRead
+{
+ uint32 iodepth_max;
+ uint32 distance_max;
+ uint32 all_items_count;
+
+ uintptr_t pgsr_private;
+ PgStreamingReadDetermineNextCB determine_next_cb;
+ PgStreamingReadRelease release_cb;
+
+ uint32 current_window;
+
+ /* number of requests issued */
+ uint64 prefetched_total_count;
+ /* number of requests submitted to kernel */
+ uint64 submitted_total_count;
+ /* number of current requests completed */
+ uint32 completed_count;
+ /* number of current requests in flight */
+ int32 inflight_count;
+ /* number of requests not yet submitted */
+ int32 pending_count;
+ /* number of requests that didn't require IO (debugging only) */
+ int32 no_io_count;
+
+ bool hit_end;
+
+ /* submitted reads */
+ dlist_head issued;
+
+ /* available reads (unused or completed) */
+ dlist_head available;
+
+ /*
+ * IOs, be they completed or in progress, in the order that the callback
+ * returned them.
+ */
+ dlist_head in_order;
+
+ PgStreamingReadItem all_items[FLEXIBLE_ARRAY_MEMBER];
+};
+
+static void pg_streaming_read_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+static void pg_streaming_read_prefetch(PgStreamingRead *pgsr);
+
+PgStreamingRead *
+pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private,
+ PgStreamingReadDetermineNextCB determine_next_cb,
+ PgStreamingReadRelease release_cb)
+{
+ PgStreamingRead *pgsr;
+
+ iodepth = Max(Min(iodepth, NBuffers / 128), 1);
+
+ pgsr = palloc0(offsetof(PgStreamingRead, all_items) +
+ sizeof(PgStreamingReadItem) * iodepth * 2);
+
+ pgsr->iodepth_max = iodepth;
+ pgsr->distance_max = iodepth;
+ pgsr->all_items_count = pgsr->iodepth_max + pgsr->distance_max;
+ pgsr->pgsr_private = pgsr_private;
+ pgsr->determine_next_cb = determine_next_cb;
+ pgsr->release_cb = release_cb;
+
+ pgsr->current_window = 0;
+
+ dlist_init(&pgsr->available);
+ dlist_init(&pgsr->in_order);
+ dlist_init(&pgsr->issued);
+
+ for (int i = 0; i < pgsr->all_items_count; i++)
+ {
+ PgStreamingReadItem *this_read = &pgsr->all_items[i];
+
+ this_read->on_completion.callback = pg_streaming_read_complete;
+ this_read->pgsr = pgsr;
+ dlist_push_tail(&pgsr->available, &this_read->node);
+ }
+
+ return pgsr;
+}
+
+void
+pg_streaming_read_free(PgStreamingRead *pgsr)
+{
+ /*
+ * Prevent further read-ahead from being queued by the completion
+ * callback. We would not necessarily wait for such IOs in this loop,
+ * which could leave their completion callbacks pointing to already
+ * freed memory.
+ */
+ pgsr->hit_end = true;
+
+ for (int i = 0; i < pgsr->all_items_count; i++)
+ {
+ PgStreamingReadItem *this_read = &pgsr->all_items[i];
+
+ if (this_read->in_progress)
+ {
+ Assert(this_read->valid);
+ pgaio_io_wait(this_read->aio);
+ Assert(!this_read->in_progress);
+ }
+
+ if (this_read->valid)
+ pgsr->release_cb(pgsr->pgsr_private, this_read->read_private);
+
+ if (this_read->aio)
+ {
+ pgaio_io_release(this_read->aio);
+ this_read->aio = NULL;
+ }
+ }
+
+ pfree(pgsr);
+}
+
+static void
+pg_streaming_read_complete(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io)
+{
+ PgStreamingReadItem *this_read = pgaio_ocb_container(PgStreamingReadItem, on_completion, ocb);
+ PgStreamingRead *pgsr = this_read->pgsr;
+
+#if 0
+ if ((pgsr->prefetched_total_count % 10000) == 0)
+ ereport(LOG, errmsg("pgsr read completed: qd %d completed: %d",
+ pgsr->inflight_count, pgsr->completed_count),
+ errhidestmt(true),
+ errhidecontext(true));
+#endif
+
+ Assert(this_read->in_progress);
+ Assert(this_read->valid);
+ Assert(this_read->aio == io);
+ Assert(pgsr->inflight_count > 0);
+ Assert(pgaio_io_done(io));
+ Assert(pgaio_io_success(io));
+
+ dlist_delete_from(&pgsr->issued, &this_read->node);
+ pgsr->inflight_count--;
+ pgsr->completed_count++;
+ this_read->in_progress = false;
+ pgaio_io_recycle(this_read->aio);
+
+ pg_streaming_read_prefetch(pgsr);
+}
+
+static void
+pg_streaming_read_prefetch_one(PgStreamingRead *pgsr)
+{
+ PgStreamingReadItem *this_read;
+ PgStreamingReadNextStatus status;
+
+ Assert(!dlist_is_empty(&pgsr->available));
+
+ this_read = dlist_container(PgStreamingReadItem, node, dlist_pop_head_node(&pgsr->available));
+ Assert(!this_read->valid);
+ Assert(!this_read->in_progress);
+ Assert(this_read->read_private == 0);
+
+ if (this_read->aio == NULL)
+ {
+ this_read->aio = pgaio_io_get();
+ }
+
+ pgaio_io_on_completion_local(this_read->aio, &this_read->on_completion);
+ this_read->in_progress = true;
+ this_read->valid = true;
+ dlist_push_tail(&pgsr->issued, &this_read->node);
+ dlist_push_tail(&pgsr->in_order, &this_read->sequence_node);
+ pgsr->pending_count++;
+ pgsr->inflight_count++;
+ pgsr->prefetched_total_count++;
+
+ status = pgsr->determine_next_cb(pgsr->pgsr_private, this_read->aio, &this_read->read_private);
+
+ if (status == PGSR_NEXT_END)
+ {
+ pgsr->pending_count--;
+ pgsr->inflight_count--;
+ pgsr->prefetched_total_count--;
+ pgsr->hit_end = true;
+ this_read->read_private = 0;
+ this_read->valid = false;
+ this_read->in_progress = false;
+ pgaio_io_recycle(this_read->aio);
+ dlist_delete_from(&pgsr->issued, &this_read->node);
+ dlist_delete_from(&pgsr->in_order, &this_read->sequence_node);
+ dlist_push_tail(&pgsr->available, &this_read->node);
+ }
+ else if (status == PGSR_NEXT_NO_IO)
+ {
+ Assert(this_read->read_private != 0);
+ pgsr->pending_count--;
+ pgsr->inflight_count--;
+ pgsr->no_io_count++;
+ pgsr->completed_count++;
+ this_read->in_progress = false;
+ pgaio_io_recycle(this_read->aio);
+ dlist_delete_from(&pgsr->issued, &this_read->node);
+ }
+ else
+ {
+ Assert(this_read->read_private != 0);
+ }
+}
+
+static void
+pg_streaming_read_prefetch(PgStreamingRead *pgsr)
+{
+ uint32 min_issue;
+
+ if (pgsr->hit_end)
+ return;
+
+ Assert(pgsr->inflight_count <= pgsr->current_window);
+ Assert(pgsr->completed_count <= (pgsr->iodepth_max + pgsr->distance_max));
+
+ /*
+ * XXX: Some issues:
+ *
+ * - We should probably do the window calculation based on the number of
+ * buffers the user actually requested, i.e. only recompute this
+ * whenever pg_streaming_read_get_next() is called. Otherwise we will
+ * always read the whole prefetch window.
+ *
+ * - The algorithm here is pretty simplistic. It should take distance and
+ * iodepth into account in a distinct way, and grow more slowly.
+ *
+ * - It'd be good to have usage-dependent iodepth management. After an
+ * initial increase, we should only grow the window further if it proves
+ * too small (i.e. we don't manage to keep 'completed' close to full).
+ *
+ * - If most requests don't trigger IO, we should probably reduce the
+ * prefetch window.
+ */
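+
+ /*
+ * Illustration of the growth behaviour implemented below (not a spec,
+ * just the arithmetic, assuming an iodepth_max of 32): the window grows
+ * 4 -> 8 -> 16 -> 32 over successive rounds with min_issue = 1; once the
+ * window is maxed out, min_issue becomes
+ * Min(iodepth_max, current_window / 4), i.e. 8.
+ */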
+ if (pgsr->current_window < pgsr->iodepth_max)
+ {
+ if (pgsr->current_window == 0)
+ pgsr->current_window = 4;
+ else
+ pgsr->current_window *= 2;
+
+ if (pgsr->current_window > pgsr->iodepth_max)
+ pgsr->current_window = pgsr->iodepth_max;
+
+ min_issue = 1;
+ }
+ else
+ {
+ min_issue = Min(pgsr->iodepth_max, pgsr->current_window / 4);
+ }
+
+ Assert(pgsr->inflight_count <= pgsr->current_window);
+ Assert(pgsr->completed_count <= (pgsr->iodepth_max + pgsr->distance_max));
+
+ if (pgsr->completed_count >= pgsr->current_window)
+ return;
+
+ if (pgsr->inflight_count >= pgsr->current_window)
+ return;
+
+ while (!pgsr->hit_end &&
+ (pgsr->inflight_count < pgsr->current_window) &&
+ (pgsr->completed_count < pgsr->current_window))
+ {
+ pg_streaming_read_prefetch_one(pgsr);
+
+ if (pgsr->pending_count >= min_issue)
+ {
+ pgsr->submitted_total_count += pgsr->pending_count;
+ pgsr->pending_count = 0;
+ pgaio_submit_pending(true);
+ }
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (pgsr->pending_count >= min_issue)
+ {
+ pgsr->submitted_total_count += pgsr->pending_count;
+ pgsr->pending_count = 0;
+ pgaio_submit_pending(true);
+ }
+}
+
+uintptr_t
+pg_streaming_read_get_next(PgStreamingRead *pgsr)
+{
+ if (pgsr->prefetched_total_count == 0)
+ {
+ pg_streaming_read_prefetch(pgsr);
+ Assert(pgsr->hit_end || pgsr->prefetched_total_count > 0);
+ }
+
+ if (dlist_is_empty(&pgsr->in_order))
+ {
+ Assert(pgsr->hit_end);
+ return 0;
+ }
+ else
+ {
+ PgStreamingReadItem *this_read;
+ uintptr_t ret;
+
+ Assert(pgsr->prefetched_total_count > 0);
+
+ this_read = dlist_container(PgStreamingReadItem, sequence_node,
+ dlist_pop_head_node(&pgsr->in_order));
+ Assert(this_read->valid);
+
+ if (this_read->in_progress)
+ {
+ pgaio_io_wait(this_read->aio);
+ /* callback should have updated */
+ Assert(!this_read->in_progress);
+ Assert(this_read->valid);
+ }
+
+ Assert(this_read->read_private != 0);
+ ret = this_read->read_private;
+ this_read->read_private = 0;
+ this_read->valid = false;
+
+ pgsr->completed_count--;
+ dlist_push_tail(&pgsr->available, &this_read->node);
+ pg_streaming_read_prefetch(pgsr);
+
+ return ret;
+ }
+}
#include "replication/slot.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/dsm.h"
#include "storage/ipc.h"
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
+ size = add_size(size, AioShmemSize());
#ifdef EXEC_BACKEND
size = add_size(size, ShmemBackendArraySize());
#endif
SyncScanShmemInit();
AsyncShmemInit();
+ AioShmemInit();
+
#ifdef EXEC_BACKEND
/*
/* LWTRANCHE_PARALLEL_APPEND: */
"ParallelAppend",
/* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
- "PerXactPredicateList"
+ "PerXactPredicateList",
+ /* LWTRANCHE_AIO_CONTEXT_SUBMISSION: */
+ "AioContextSubmission",
+ /* LWTRANCHE_AIO_CONTEXT_COMPLETION: */
+ "AioContextCompletion"
};
StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
# 45 was XactTruncationLock until removal of BackendRandomLock
WrapLimitsVacuumLock 46
NotifyQueueTailLock 47
+SharedAIOCtlLock 48
+WALSyncLock 49
#include "replication/slot.h"
#include "replication/syncrep.h"
#include "replication/walsender.h"
+#include "storage/aio.h"
#include "storage/condition_variable.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
*/
InitLWLockAccess();
InitDeadLockChecking();
+
+ pgaio_postmaster_child_init();
}
/*
Assert(MyProc->lockGroupLeader == NULL);
Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+ pgaio_postmaster_child_init();
+
/*
* We might be reusing a semaphore that belonged to a failed process. So
* be careful and reinitialize its value here. (This is not strictly
#include "access/itup.h"
#include "access/xlog.h"
#include "pgstat.h"
+#include "storage/aio.h"
#include "storage/checksum.h"
#include "utils/memdebug.h"
#include "utils/memutils.h"
* returned page and not refer to it again.
*/
char *
-PageSetChecksumCopy(Page page, BlockNumber blkno)
+PageSetChecksumCopy(Page page, BlockNumber blkno, PgAioBounceBuffer **bb)
{
- static char *pageCopy = NULL;
+ static char *pageCopySync = NULL;
+ char *pageCopy;
+
+ if (bb)
+ *bb = NULL;
/* If we don't need a checksum, just return the passed-in data */
if (PageIsNew(page) || !DataChecksumsEnabled())
return (char *) page;
- /*
- * We allocate the copy space once and use it over on each subsequent
- * call. The point of palloc'ing here, rather than having a static char
- * array, is first to ensure adequate alignment for the checksumming code
- * and second to avoid wasting space in processes that never call this.
- */
- if (pageCopy == NULL)
- pageCopy = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0);
+ if (bb)
+ {
+ *bb = pgaio_bounce_buffer_get();
+ pageCopy = pgaio_bounce_buffer_buffer(*bb);
+ }
+ else if (!pageCopySync)
+ {
+ pageCopySync = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0);
+ pageCopy = pageCopySync;
+ }
+ else
+ pageCopy = pageCopySync;
memcpy(pageCopy, (char *) page, BLCKSZ);
((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
+static inline int
+_mdfd_open_flags(ForkNumber forkNum)
+{
+ int flags = O_RDWR | PG_BINARY;
+
+ /*
+ * XXX: not clear if direct IO ever is interesting for other forks? The
+ * FSM fork currently often ends up very fragmented when using direct IO,
+ * for example.
+ */
+ if (io_data_direct /* && forkNum == MAIN_FORKNUM */)
+ flags |= PG_O_DIRECT;
+
+ return flags;
+}
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
path = relpath(reln->smgr_rnode, forkNum);
- fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags(forkNum) | O_CREAT | O_EXCL);
if (fd < 0)
{
int save_errno = errno;
if (isRedo)
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags(forkNum));
if (fd < 0)
{
/* be sure to report the error reported by create, not open */
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
+static void
+zeroextend_complete(void *pgsw_private, void *write_private)
+{
+ /* nothing to do on completion, for now */
+}
+
+BlockNumber
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+ MdfdVec *v;
+ PgAioBounceBuffer *bb;
+ char *zerobuf;
+ pg_streaming_write *pgsw;
+ BlockNumber latest;
+ BlockNumber curblocknum = blocknum;
+ int remblocks = nblocks;
+
+ Assert(nblocks > 0);
+
+ pgsw = pg_streaming_write_alloc(Min(256, nblocks), &latest, zeroextend_complete);
+
+ bb = pgaio_bounce_buffer_get();
+ zerobuf = pgaio_bounce_buffer_buffer(bb);
+ memset(zerobuf, 0, BLCKSZ);
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber.
+ */
+ /* FIXME */
+#if 0
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u blocks",
+ relpath(reln->smgr_rnode, forknum),
+ InvalidBlockNumber)));
+#endif
+
+ while (remblocks > 0)
+ {
+ int fd;
+ int ret;
+ int segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+ int segendblock = (curblocknum % ((BlockNumber) RELSEG_SIZE)) + remblocks;
+ off_t seekpos;
+
+ if (segendblock > RELSEG_SIZE)
+ segendblock = RELSEG_SIZE;
+
+ v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+ Assert(segstartblock < RELSEG_SIZE);
+ Assert(segendblock <= RELSEG_SIZE);
+
+ seekpos = (off_t) BLCKSZ * segstartblock;
+
+ fd = FileGetRawDesc(v->mdfd_vfd);
+ ret = posix_fallocate(fd,
+ seekpos,
+ (off_t) BLCKSZ * (segendblock - segstartblock));
+
+ if (ret != 0)
+ {
+ /* posix_fallocate() returns the error, it does not set errno */
+ errno = ret;
+ elog(ERROR, "fallocate failed: %m");
+ }
+
+ for (BlockNumber i = segstartblock; i < segendblock; i++)
+ {
+ PgAioInProgress *aio = pg_streaming_write_get_io(pgsw);
+ AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = i};
+
+ pgaio_assoc_bounce_buffer(aio, bb);
+ FileStartWrite(aio, v->mdfd_vfd, zerobuf, BLCKSZ, i * BLCKSZ, &tag, InvalidBuffer, false);
+
+ pg_streaming_write_write(pgsw, aio, (void*) &i);
+
+ /* FIXME: ensure errors are handled equivalently */
+#if 0
+ int nbytes;
+
+ if ((nbytes = FileWrite(v->mdfd_vfd, zerobuf, BLCKSZ,
+ i * BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
+ }
+#endif
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+ remblocks -= segendblock - segstartblock;
+ curblocknum += segendblock - segstartblock;
+ }
+
+ pgaio_bounce_buffer_release(bb);
+
+ pg_streaming_write_wait_all(pgsw);
+ pg_streaming_write_free(pgsw);
+
+ return blocknum + (nblocks - 1);
+}
+
+
/*
* mdopenfork() -- Open one fork of the specified relation.
*
path = relpath(reln->smgr_rnode, forknum);
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
if (fd < 0)
{
mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
+ if (io_data_direct)
+ return;
/*
* Issue flush requests in as few requests as possible; have to split at
* segment boundaries though, since those are actually separate files.
}
}
+/*
+ * mdstartread() -- Asynchronously start reading the specified block from a
+ * relation.
+ */
+void
+mdstartread(PgAioInProgress *io, SMgrRelation reln,
+ ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, int bufno, int mode)
+{
+ off_t seekpos;
+ MdfdVec *v;
+ AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = blocknum};
+
+ AssertPointerAlignment(buffer, 4096);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ if (!FileStartRead(io, v->mdfd_vfd, buffer, BLCKSZ, seekpos, &tag, bufno, mode))
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+ }
+
+}
+
/*
* mdwrite() -- Write the supplied block at the appropriate location.
*
register_dirty_segment(reln, forknum, v);
}
+/*
+ * mdstartwrite() -- Asynchronously start writing the supplied block at the
+ * appropriate location.
+ */
+void
+mdstartwrite(PgAioInProgress *io, SMgrRelation reln,
+ ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, int bufno, bool skipFsync, bool release_lock)
+{
+ off_t seekpos;
+ MdfdVec *v;
+ AioBufferTag tag = {.rnode = reln->smgr_rnode, .forkNum = forknum, .blockNum = blocknum};
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum < mdnblocks(reln, forknum));
+#endif
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+ reln->smgr_rnode.node.spcNode,
+ reln->smgr_rnode.node.dbNode,
+ reln->smgr_rnode.node.relNode,
+ reln->smgr_rnode.backend);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ /*
+ * XXX: In the synchronous case this is after the write - should be fine,
+ * I think?
+ */
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ if (!FileStartWrite(io, v->mdfd_vfd, buffer, BLCKSZ, seekpos, &tag, bufno, release_lock))
+ {
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+ }
+}
+
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
}
}
+int
+mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
+{
+ MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ EXTENSION_FAIL);
+
+ *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ return FileGetRawDesc(v->mdfd_vfd);
+}
+
/*
* register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
*/
fullpath = _mdfd_segpath(reln, forknum, segno);
/* open the file */
- fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+ fd = PathNameOpenFile(fullpath, _mdfd_open_flags(forknum) | oflags);
pfree(fullpath);
strlcpy(path, p, MAXPGPATH);
pfree(p);
- file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ file = PathNameOpenFile(path, _mdfd_open_flags(ftag->forknum));
if (file < 0)
return -1;
need_to_close = true;
bool isRedo);
void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
+ BlockNumber (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer);
+ void (*smgr_startread) (struct PgAioInProgress *io ,
+ SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer,
+ int bufno, int mode);
+ void (*smgr_startwrite) (struct PgAioInProgress *io,
+ SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer,
+ int bufno, bool skipFsync, bool release_lock);
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+ int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
} f_smgr;
static const f_smgr smgrsw[] = {
.smgr_exists = mdexists,
.smgr_unlink = mdunlink,
.smgr_extend = mdextend,
+ .smgr_zeroextend = mdzeroextend,
.smgr_prefetch = mdprefetch,
.smgr_read = mdread,
+ .smgr_startread = mdstartread,
.smgr_write = mdwrite,
+ .smgr_startwrite = mdstartwrite,
.smgr_writeback = mdwriteback,
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
+ .smgr_fd = mdfd,
}
};
reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
}
+BlockNumber
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync)
+{
+ return smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+ nblocks, skipFsync);
+}
+
/*
* smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
*
smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
}
+void
+smgrstartread(struct PgAioInProgress *io, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, int bufno, int mode)
+{
+ return smgrsw[reln->smgr_which].smgr_startread(io, reln, forknum, blocknum, buffer, bufno, mode);
+}
+
/*
* smgrwrite() -- Write the supplied buffer out.
*
buffer, skipFsync);
}
+void
+smgrstartwrite(struct PgAioInProgress *io,
+ SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer,
+ int bufno, bool skipFsync, bool release_lock)
+{
+ smgrsw[reln->smgr_which].smgr_startwrite(io, reln, forknum, blocknum,
+ buffer, bufno, skipFsync, release_lock);
+}
+
/*
* smgrwriteback() -- Trigger kernel writeback for the supplied range of
smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
}
+int
+smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
+{
+ return smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
+}
+
/*
* AtEOXact_SMgr
*
#include "replication/slot.h"
#include "replication/walsender.h"
#include "rewrite/rewriteHandler.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/pmsignal.h"
/* Initialize MaxBackends (if under postmaster, was done already) */
InitializeMaxBackends();
+
+ /* AIO is needed during InitPostgres() */
+ pgaio_postmaster_init();
}
/* Early initialization */
#include "postmaster/autovacuum.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
+#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
InitLatch(MyLatch);
InitializeLatchWaitSet();
+ pgaio_postmaster_child_init_local();
+
/*
* If possible, make this process a group leader, so that the postmaster
* can signal any child processes too. Not all processes will have
NULL, NULL, NULL
},
+ {
+ {"io_data_direct", PGC_SUSET, RESOURCES_DISK,
+ gettext_noop("data file IO uses direct IO."),
+ },
+ &io_data_direct,
+ false,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"io_data_force_async", PGC_SUSET, RESOURCES_DISK,
+ gettext_noop("force synchronous data file to use async IO."),
+ },
+ &io_data_force_async,
+ true, /* helpful for development */
+ NULL, NULL, NULL
+ },
+
+ {
+ {"io_wal_direct", PGC_SIGHUP, RESOURCES_DISK,
+ gettext_noop("wal file IO uses direct IO."),
+ },
+ &io_wal_direct,
+ false,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"io_wal_init_direct", PGC_SIGHUP, RESOURCES_DISK,
+ gettext_noop("wal file initialization IO uses direct IO."),
+ },
+ &io_wal_init_direct,
+ false,
+ NULL, NULL, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
#include "common/cryptohash.h"
#include "common/hashfn.h"
#include "jit/jit.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/predicate.h"
ResourceArray dsmarr; /* dynamic shmem segments */
ResourceArray jitarr; /* JIT contexts */
ResourceArray cryptohasharr; /* cryptohash contexts */
+ ResourceArray aiobbarr; /* AIO BB references */
/* We can remember up to MAX_RESOWNER_LOCKS references to local locks. */
int nlocks; /* number of owned locks */
static void PrintFileLeakWarning(File file);
static void PrintDSMLeakWarning(dsm_segment *seg);
static void PrintCryptoHashLeakWarning(Datum handle);
+static void PrintAioBBLeakWarning(Datum handle);
/*****************************************************************************
PrintCryptoHashLeakWarning(foundres);
pg_cryptohash_free(context);
}
+
+ /* Ditto for AIO BBs */
+ /* XXX: reconsider phase? */
+ while (ResourceArrayGetAny(&(owner->aiobbarr), &foundres))
+ {
+ PgAioBounceBuffer *bb = (PgAioBounceBuffer *) DatumGetPointer(foundres);
+
+ if (isCommit)
+ PrintAioBBLeakWarning(foundres);
+
+ pgaio_bounce_buffer_release(bb);
+ }
}
else if (phase == RESOURCE_RELEASE_LOCKS)
{
elog(WARNING, "cryptohash context reference leak: context %p still referenced",
DatumGetPointer(handle));
}
+
+void
+ResourceOwnerEnlargeAioBB(ResourceOwner owner)
+{
+ ResourceArrayEnlarge(&(owner->aiobbarr));
+}
+
+void
+ResourceOwnerRememberAioBB(ResourceOwner owner, PgAioBounceBuffer *bb)
+{
+ ResourceArrayAdd(&(owner->aiobbarr), PointerGetDatum(bb));
+}
+
+void
+ResourceOwnerForgetAioBB(ResourceOwner owner, PgAioBounceBuffer *bb)
+{
+ if (!ResourceArrayRemove(&(owner->aiobbarr), PointerGetDatum(bb)))
+ elog(ERROR, "aio BB %p is not owned by resource owner %s",
+ bb, owner->name);
+}
+
+static void
+PrintAioBBLeakWarning(Datum handle)
+{
+ elog(WARNING, "aio BB reference leak: bb %zu still referenced",
+ handle);
+}
extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli);
+struct PgAioInProgress;
+extern void XLogWriteComplete(struct PgAioInProgress *aio, uint32 write_no);
+extern void XLogFlushComplete(struct PgAioInProgress *aio, uint32 flush_no);
+
/*
* Exported for the functions in timeline.c and xlogarchive.c. Only valid
* in the startup process.
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
prosrc => 'unicode_is_normalized' },
+{ oid => '811', descr => 'AIO backend stats',
+ proname => 'pg_stat_get_aio_backends', procost => '1000', prorows => '20', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{int4,int8,int8,int8,int8,int8,int4,int4,int4,int4,int4,int4,int4}', proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o}',
+ proargnames => '{pid, executed_total_count, issued_total_count, submissions_total_count, foreign_completed_total_count, retry_total_count, inflight_count, unused_count, outstanding_count, pending_count, local_completed_count, foreign_completed_count,last_context}', prosrc => 'pg_stat_get_aio_backends' },
+
+{ oid => '812', descr => 'AIO stats',
+ proname => 'pg_stat_get_aios', procost => '1000', prorows => '20', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{int4,text,text,int4,int4,int8,int8,text}', proargmodes => '{o,o,o,o,o,o,o,o}',
+ proargnames => '{id,operation,flags,ring,owner_pid,generation,result,action_desc}', prosrc => 'pg_stat_get_aios' },
+
]
WAIT_EVENT_WAL_READ,
WAIT_EVENT_WAL_SYNC,
WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN,
+ WAIT_EVENT_WAL_WAIT_FLUSH,
WAIT_EVENT_WAL_WAIT_INSERT,
+ WAIT_EVENT_WAL_WAIT_WRITE,
WAIT_EVENT_WAL_WRITE,
WAIT_EVENT_LOGICAL_CHANGES_READ,
WAIT_EVENT_LOGICAL_CHANGES_WRITE,
WAIT_EVENT_LOGICAL_SUBXACT_READ,
- WAIT_EVENT_LOGICAL_SUBXACT_WRITE
+ WAIT_EVENT_LOGICAL_SUBXACT_WRITE,
+ WAIT_EVENT_AIO_SUBMIT,
+ WAIT_EVENT_AIO_IO_COMPLETE_ANY,
+ WAIT_EVENT_AIO_IO_COMPLETE_ONE,
+ WAIT_EVENT_AIO_REFS,
+ WAIT_EVENT_AIO_BACKPRESSURE
} WaitEventIO;
/* ----------
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * aio.h
+ * Asynchronous IO (AIO) interface.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/aio.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef AIO_H
+#define AIO_H
+
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/relfilenode.h"
+#include "lib/ilist.h"
+
+#include "access/xlogdefs.h"
+
+typedef struct PgAioInProgress PgAioInProgress;
+typedef struct PgAioBounceBuffer PgAioBounceBuffer;
+
+typedef struct PgAioIoRef
+{
+ uint32 aio_index;
+ uint32 generation_upper;
+ uint32 generation_lower;
+} PgAioIoRef;
+
+/* initialization */
+extern void pgaio_postmaster_init(void);
+extern Size AioShmemSize(void);
+extern void AioShmemInit(void);
+extern void pgaio_postmaster_child_init_local(void);
+extern void pgaio_postmaster_child_init(void);
+
+extern void pgaio_at_abort(void);
+extern void pgaio_at_commit(void);
+
+/*
+ * XXX: Add flags to the initiation functions that govern:
+ * - whether PgAioInProgress will be released once the IO has successfully
+ * finished, even if done by another backend (e.g. useful when prefetching,
+ * without wanting to look at the concrete buffer, or when doing buffer
+ * writeback). In particular this'd also cause the buffer pin to be released
+ * upon completion.
+ * - whether a failed request needs to be seen by the issuing backend. That's
+ * not needed e.g. for prefetches, but is for buffer writes by checkpointer.
+ * - whether pending requests might be issued if sensible, or whether that's
+ * not allowed.
+ *
+ *
+ * FIXME: Add indicator about which IO channel needs to be used (important
+ * e.g. for buffer writes interleaved with WAL writes, for queue depth
+ * management of checkpointer, for readahead)
+ */
+
+extern PgAioInProgress *pgaio_io_get(void);
+extern void pgaio_io_release(PgAioInProgress *io);
+
+extern dlist_node* pgaio_io_node(PgAioInProgress *io);
+
+typedef struct PgAioOnCompletionLocalContext PgAioOnCompletionLocalContext;
+typedef void (*PgAioOnCompletionLocalCB)(PgAioOnCompletionLocalContext *ocb, PgAioInProgress *io);
+struct PgAioOnCompletionLocalContext
+{
+ PgAioOnCompletionLocalCB callback;
+};
+#define pgaio_ocb_container(type, membername, ptr) \
+ (AssertVariableIsOfTypeMacro(ptr, PgAioOnCompletionLocalContext *), \
+ AssertVariableIsOfTypeMacro(((type *) NULL)->membername, PgAioOnCompletionLocalContext), \
+ ((type *) ((char *) (ptr) - offsetof(type, membername))))
+extern void pgaio_io_on_completion_local(PgAioInProgress *io, PgAioOnCompletionLocalContext *ocb);
+
+extern void pgaio_io_wait(PgAioInProgress *io);
+extern void pgaio_io_wait_ref(PgAioIoRef *ref, bool call_local);
+extern bool pgaio_io_check_ref(PgAioIoRef *ref);
+
+extern bool pgaio_io_success(PgAioInProgress *io);
+extern bool pgaio_io_done(PgAioInProgress *io);
+extern void pgaio_io_recycle(PgAioInProgress *io);
+extern void pgaio_io_ref(PgAioInProgress *io, PgAioIoRef *ref);
+
+static inline bool
+pgaio_io_ref_valid(const PgAioIoRef *ref)
+{
+ return ref->aio_index != PG_UINT32_MAX;
+}
+
+static inline void
+pgaio_io_ref_clear(PgAioIoRef *ref)
+{
+ ref->aio_index = PG_UINT32_MAX;
+}
+
+
+typedef struct AioBufferTag
+{
+ RelFileNodeBackend rnode; /* physical relation identifier */
+ ForkNumber forkNum;
+ BlockNumber blockNum; /* blknum relative to begin of reln */
+} AioBufferTag;
+struct buftag;
+
+extern void pgaio_io_start_flush_range(PgAioInProgress *io, int fd, uint64 offset, uint32 nbytes);
+extern void pgaio_io_start_nop(PgAioInProgress *io);
+extern void pgaio_io_start_fsync(PgAioInProgress *io, int fd, bool barrier);
+extern void pgaio_io_start_fdatasync(PgAioInProgress *io, int fd, bool barrier);
+
+extern void pgaio_io_start_read_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes,
+ char *bufdata, int buffno, int mode);
+extern void pgaio_io_start_write_buffer(PgAioInProgress *io, const AioBufferTag *tag, int fd, uint32 offset, uint32 nbytes,
+ char *bufdata, int buffno, bool release_lock);
+extern void pgaio_io_start_write_wal(PgAioInProgress *io, int fd,
+ uint32 offset, uint32 nbytes,
+ char *bufdata, bool no_reorder,
+ uint32 write_no);
+extern void pgaio_io_start_write_generic(PgAioInProgress *io, int fd,
+ uint64 offset, uint32 nbytes,
+ char *bufdata, bool no_reorder);
+extern void pgaio_io_start_fsync_wal(PgAioInProgress *io, int fd, bool barrier,
+ bool datasync_only, uint32 sync_no);
+
+extern void pgaio_io_retry(PgAioInProgress *io);
+extern void pgaio_submit_pending(bool drain);
+
+extern void pgaio_print_queues(void);
+struct StringInfoData;
+extern void pgaio_io_print(PgAioInProgress *io, struct StringInfoData *s);
+extern void pgaio_io_ref_print(PgAioIoRef *ref, struct StringInfoData *s);
+struct dlist_head;
+extern void pgaio_print_list(struct dlist_head *head, struct StringInfoData *s, size_t offset);
+
+
+extern void pgaio_assoc_bounce_buffer(PgAioInProgress *io, PgAioBounceBuffer *bb);
+extern PgAioBounceBuffer *pgaio_bounce_buffer_get(void);
+extern void pgaio_bounce_buffer_release(PgAioBounceBuffer *bb);
+extern char *pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb);
+
+
+/*
+ * Helpers. In aio_util.c.
+ */
+
+/*
+ * Helper to efficiently perform bulk writes.
+ */
+typedef struct pg_streaming_write pg_streaming_write;
+typedef void (*pg_streaming_write_completed)(void *pgsw_private, void *write_private);
+
+extern pg_streaming_write *pg_streaming_write_alloc(uint32 iodepth, void *private,
+ pg_streaming_write_completed on_completion);
+extern PgAioInProgress *pg_streaming_write_get_io(pg_streaming_write *pgsw);
+extern uint32 pg_streaming_write_inflight(pg_streaming_write *pgsw);
+extern void pg_streaming_write_write(pg_streaming_write *pgsw, PgAioInProgress *io, void *private);
+extern void pg_streaming_write_wait_all(pg_streaming_write *pgsw);
+extern void pg_streaming_write_free(pg_streaming_write *pgsw);
+
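+/*
+ * Rough usage sketch of the streaming write helper (illustrative only; the
+ * caller-side pieces, e.g. my_write_done() and per_write_state, are
+ * hypothetical):
+ *
+ *	pgsw = pg_streaming_write_alloc(iodepth, caller_state, my_write_done);
+ *	while (more to write)
+ *	{
+ *		PgAioInProgress *io = pg_streaming_write_get_io(pgsw);
+ *
+ *		start the IO on "io", e.g. with smgrstartwrite();
+ *		pg_streaming_write_write(pgsw, io, per_write_state);
+ *	}
+ *	pg_streaming_write_wait_all(pgsw);
+ *	pg_streaming_write_free(pgsw);
+ *
+ * on_completion is invoked once per submitted write when its completion is
+ * processed, e.g. while waiting inside pg_streaming_write_get_io() or
+ * pg_streaming_write_wait_all().
+ */
+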
+/*
+ * Helper for efficient reads (using readahead).
+ */
+
+typedef struct PgStreamingRead PgStreamingRead;
+typedef enum PgStreamingReadNextStatus
+{
+ PGSR_NEXT_END,
+ PGSR_NEXT_NO_IO,
+ PGSR_NEXT_IO
+} PgStreamingReadNextStatus;
+
+typedef PgStreamingReadNextStatus (*PgStreamingReadDetermineNextCB)(uintptr_t pgsr_private, PgAioInProgress *aio, uintptr_t *read_private);
+typedef void (*PgStreamingReadRelease)(uintptr_t pgsr_private, uintptr_t read_private);
+extern PgStreamingRead *pg_streaming_read_alloc(uint32 iodepth, uintptr_t pgsr_private,
+ PgStreamingReadDetermineNextCB determine_next_cb,
+ PgStreamingReadRelease release_cb);
+extern void pg_streaming_read_free(PgStreamingRead *pgsr);
+extern uintptr_t pg_streaming_read_get_next(PgStreamingRead *pgsr);
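+
+/*
+ * Contract and rough usage sketch (illustrative; my_next(), my_release() and
+ * caller_state are hypothetical):
+ *
+ * determine_next_cb either starts an asynchronous read using the passed-in
+ * PgAioInProgress and returns PGSR_NEXT_IO, sets *read_private and returns
+ * PGSR_NEXT_NO_IO if no IO turned out to be necessary, or returns
+ * PGSR_NEXT_END once there is nothing further to read.  In the first two
+ * cases *read_private must be nonzero; it is what
+ * pg_streaming_read_get_next() later returns, and what release_cb is called
+ * with for reads that were never consumed.
+ *
+ *	pgsr = pg_streaming_read_alloc(iodepth, (uintptr_t) caller_state,
+ *								   my_next, my_release);
+ *	while ((item = pg_streaming_read_get_next(pgsr)) != 0)
+ *		consume item;
+ *	pg_streaming_read_free(pgsr);
+ */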
+
+#endif /* AIO_H */
#include "port/atomics.h"
#include "storage/buf.h"
+#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
int wait_backend_pid; /* backend PID of pin-count waiter */
int freeNext; /* link in freelist chain */
+ PgAioIoRef io_in_progress;
LWLock content_lock; /* to lock access to buffer contents */
} BufferDesc;
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
+extern void ReadBufferCompleteRead(Buffer buffer, const AioBufferTag *tag, char *bufdata, int mode, bool failed);
+extern void ReadBufferCompleteWrite(Buffer buffer, bool failed, bool release_lock);
+
+
/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
uint32 *buf_state);
extern int backend_flush_after;
extern int bgwriter_flush_after;
+extern bool io_data_direct;
+extern bool io_data_force_async;
+extern bool io_wal_direct;
+extern bool io_wal_init_direct;
+
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;
/*
* prototypes for functions in bufmgr.c
*/
-extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln,
+extern PrefetchBufferResult PrefetchSharedBuffer(Relation reln,
+ struct SMgrRelationData *smgr_reln,
ForkNumber forkNum,
BlockNumber blockNum);
extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum,
extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
ForkNumber forkNum, BlockNumber blockNum,
ReadBufferMode mode, BufferAccessStrategy strategy);
+struct PgAioInProgress;
+extern Buffer ReadBufferAsync(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy,
+ bool *already_valid, struct PgAioInProgress** aio);
+extern Buffer BulkExtendBuffered(Relation relation, ForkNumber forkNum,
+ int extendby,
+ BufferAccessStrategy strategy);
extern void ReleaseBuffer(Buffer buffer);
extern void UnlockReleaseBuffer(Buffer buffer);
extern void MarkBufferDirty(Buffer buffer);
extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offset);
extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
Item newtup, Size newsize);
-extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
+struct PgAioBounceBuffer;
+extern char *PageSetChecksumCopy(Page page, BlockNumber blkno, struct PgAioBounceBuffer **bb);
extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
#endif /* BUFPAGE_H */
extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file);
extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
-extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern int FileRead(File file, char *buffer, int amount, off_t offset,
+ uint32 wait_event_info);
+struct PgAioInProgress;
+struct AioBufferTag;
+extern bool FileStartRead(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const struct AioBufferTag *tag, int bufid, int mode);
extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern bool FileStartWrite(struct PgAioInProgress *io, File file, char *buffer, int amount, off_t offset, const struct AioBufferTag *tag, int bufid, bool release_lock);
extern int FileSync(File file, uint32 wait_event_info);
extern off_t FileSize(File file);
extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
LWTRANCHE_SHARED_TIDBITMAP,
LWTRANCHE_PARALLEL_APPEND,
LWTRANCHE_PER_XACT_PREDICATE_LIST,
+ LWTRANCHE_AIO_CONTEXT_SUBMISSION,
+ LWTRANCHE_AIO_CONTEXT_COMPLETION,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;
extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void mdextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
+extern void mdstartread(struct PgAioInProgress *,
+ SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, int bufno, int mode);
+extern void mdstartwrite(struct PgAioInProgress *,
+ SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer, int bufno, bool skipFsync, bool release_lock);
extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
extern void ForgetDatabaseSyncRequests(Oid dbid);
extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
+extern BlockNumber smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void smgrread(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer);
+struct PgAioInProgress;
+extern void smgrstartread(struct PgAioInProgress* io,
+ SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer,
+ int bufno, int mode);
+extern void smgrstartwrite(struct PgAioInProgress* io,
+ SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer,
+ int bufno, bool skipFsync, bool release_lock);
extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
int nforks, BlockNumber *nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern int smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
extern void AtEOXact_SMgr(void);
#endif /* SMGR_H */
extern void ResourceOwnerForgetCryptoHash(ResourceOwner owner,
Datum handle);
+/* support for AIO bounce buffer management */
+struct PgAioBounceBuffer;
+extern void ResourceOwnerEnlargeAioBB(ResourceOwner owner);
+extern void ResourceOwnerRememberAioBB(ResourceOwner owner,
+ struct PgAioBounceBuffer *bb);
+extern void ResourceOwnerForgetAioBB(ResourceOwner owner,
+ struct PgAioBounceBuffer *bb);
+
#endif /* RESOWNER_PRIVATE_H */
FROM ((pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, sslcompression, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid)
LEFT JOIN pg_database d ON ((s.datid = d.oid)))
LEFT JOIN pg_authid u ON ((s.usesysid = u.oid)));
+pg_stat_aio_backends| SELECT s.pid,
+ s.executed_total_count,
+ s.issued_total_count,
+ s.submissions_total_count,
+ s.foreign_completed_total_count,
+ s.retry_total_count,
+ s.inflight_count,
+ s.unused_count,
+ s.outstanding_count,
+ s.pending_count,
+ s.local_completed_count,
+ s.foreign_completed_count,
+ s.last_context
+ FROM pg_stat_get_aio_backends() s(pid, executed_total_count, issued_total_count, submissions_total_count, foreign_completed_total_count, retry_total_count, inflight_count, unused_count, outstanding_count, pending_count, local_completed_count, foreign_completed_count, last_context);
+pg_stat_aios| SELECT s.id,
+ s.operation,
+ s.flags,
+ s.ring,
+ s.owner_pid,
+ s.generation,
+ s.result,
+ s.action_desc
+ FROM pg_stat_get_aios() s(id, operation, flags, ring, owner_pid, generation, result, action_desc);
pg_stat_all_indexes| SELECT c.oid AS relid,
i.oid AS indexrelid,
n.nspname AS schemaname,