With real AIO it doesn't make sense to cross segment boundaries with one
IO. Add smgrmaxcombine() to allow upper layers to query which buffers can be
merged.
We could continue to cross segment boundaries when not using AIO, but it
doesn't really make sense, because md.c will never be able to perform the read
across the segment boundary in one system call. Which means we'll mark more
buffers as undergoing IO than really makes sense - if another backend desires
to read the same blocks, it'll be blocked longer than necessary. So it seems
better to just never cross the boundary.
Reviewed-by: Heikki Linnakangas <[email protected]>
Reviewed-by: Noah Misch <[email protected]>
Discussion: https://postgr.es/m/1f6b50a7-38ef-4d87-8246-786d39f46ab9@iki.fi
{
int actual_nblocks = *nblocks;
int io_buffers_len = 0;
+ int maxcombine = 0;
Assert(*nblocks > 0);
Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
{
/* Extend the readable range to cover this block. */
io_buffers_len++;
+
+ /*
+ * Check how many blocks we can cover with the same IO. The smgr
+ * implementation might e.g. be limited due to a segment boundary.
+ */
+ if (i == 0 && actual_nblocks > 1)
+ {
+ maxcombine = smgrmaxcombine(operation->smgr,
+ operation->forknum,
+ blockNum);
+ if (unlikely(maxcombine < actual_nblocks))
+ {
+ elog(DEBUG2, "limiting nblocks at %u from %u to %u",
+ blockNum, actual_nblocks, maxcombine);
+ actual_nblocks = maxcombine;
+ }
+ }
}
}
*nblocks = actual_nblocks;
return iovcnt;
}
+/*
+ * mdmaxcombine() -- Report how many blocks, counting blocknum itself, a
+ *		single IO beginning at blocknum may span.
+ *
+ * The result is the distance from blocknum to the end of the
+ * RELSEG_SIZE-block segment containing it, so a combined IO never extends
+ * past that segment.  reln and forknum are part of the smgr callback
+ * signature but are not consulted here.
+ */
+uint32
+mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum)
+{
+	const BlockNumber blocks_per_segment = (BlockNumber) RELSEG_SIZE;
+	const BlockNumber offset_in_segment = blocknum % blocks_per_segment;
+
+	return blocks_per_segment - offset_in_segment;
+}
+
/*
* mdreadv() -- Read the specified blocks from a relation.
*/
RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "read crosses segment boundary");
+
iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
size_this_segment = nblocks_this_segment * BLCKSZ;
transferred_this_segment = 0;
RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "write crosses segment boundary");
+
iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
size_this_segment = nblocks_this_segment * BLCKSZ;
transferred_this_segment = 0;
BlockNumber blocknum, int nblocks, bool skipFsync);
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks);
+ uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
void **buffers, BlockNumber nblocks);
.smgr_extend = mdextend,
.smgr_zeroextend = mdzeroextend,
.smgr_prefetch = mdprefetch,
+ .smgr_maxcombine = mdmaxcombine,
.smgr_readv = mdreadv,
.smgr_writev = mdwritev,
.smgr_writeback = mdwriteback,
return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
}
+/*
+ * smgrmaxcombine() - Ask the relation's storage manager how many blocks a
+ *		single IO beginning at blocknum may cover.
+ *
+ * The count includes blocknum itself.  The answer comes from the
+ * smgr_maxcombine callback of the smgrsw entry selected by
+ * reln->smgr_which.
+ */
+uint32
+smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
+			   BlockNumber blocknum)
+{
+	uint32		max_blocks;
+
+	max_blocks = smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum,
+														  blocknum);
+
+	return max_blocks;
+}
+
/*
* smgrreadv() -- read a particular block range from a relation into the
* supplied buffers.
* This routine is called from the buffer manager in order to
* instantiate pages in the shared buffer cache. All storage managers
* return pages in the format that POSTGRES expects.
+ *
+ * If more than one block is intended to be read, callers need to use
+ * smgrmaxcombine() to check how many blocks can be combined into one IO.
*/
void
smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
* skipFsync indicates that the caller will make other provisions to
* fsync the relation, so we needn't bother. Temporary relations also
* do not require fsync.
+ *
+ * If more than one block is intended to be written, callers need to use
+ * smgrmaxcombine() to check how many blocks can be combined into one IO.
*/
void
smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks);
+extern uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks);
extern void mdwritev(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks, bool skipFsync);
extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks);
+extern uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
extern void smgrreadv(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum,
void **buffers, BlockNumber nblocks);