diff --git a/contrib/Makefile b/contrib/Makefile index 952855d9b61b..1c9f22b1c86a 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ dict_int \ dict_xsyn \ earthdistance \ + fsync_checker \ file_fdw \ fuzzystrmatch \ hstore \ diff --git a/contrib/fsync_checker/fsync_checker.control b/contrib/fsync_checker/fsync_checker.control new file mode 100644 index 000000000000..7d0e36434bfa --- /dev/null +++ b/contrib/fsync_checker/fsync_checker.control @@ -0,0 +1,5 @@ +# fsync_checker extension +comment = 'SMGR extension for checking volatile writes' +default_version = '1.0' +module_pathname = '$libdir/fsync_checker' +relocatable = true diff --git a/contrib/fsync_checker/fsync_checker_smgr.c b/contrib/fsync_checker/fsync_checker_smgr.c new file mode 100644 index 000000000000..626ff0587645 --- /dev/null +++ b/contrib/fsync_checker/fsync_checker_smgr.c @@ -0,0 +1,248 @@ +#include "postgres.h" + +#include "access/xlog.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/md.h" +#include "utils/hsearch.h" + +PG_MODULE_MAGIC; + +typedef struct +{ + RelFileLocator locator; + ForkNumber forknum; +} VolatileRelnKey; + +typedef struct +{ + VolatileRelnKey key; + XLogRecPtr lsn; +} VolatileRelnEntry; + +void _PG_init(void); + +static void fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync, SmgrChainIndex chain_index); +static void fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +static void fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void **buffers, + BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index); +static void fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks, SmgrChainIndex chain_index); +static void fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync, SmgrChainIndex chain_index); + +static void fsync_checker_checkpoint_create(const CheckPoint *checkPoint); +static void fsync_checker_shmem_request(void); +static void fsync_checker_shmem_startup(void); + +static void add_reln(SMgrRelation reln, ForkNumber forknum); +static void remove_reln(SMgrRelation reln, ForkNumber forknum); + +static SMgrId fsync_checker_smgr_id; +static const struct f_smgr fsync_checker_smgr = { + .name = "fsync_checker", + .chain_position = SMGR_CHAIN_MODIFIER, + .smgr_init = NULL, + .smgr_shutdown = NULL, + .smgr_open = NULL, + .smgr_close = NULL, + .smgr_create = NULL, + .smgr_exists = NULL, + .smgr_unlink = NULL, + .smgr_extend = fsync_checker_extend, + .smgr_zeroextend = fsync_checker_zeroextend, + .smgr_prefetch = NULL, + .smgr_maxcombine = NULL, + .smgr_readv = NULL, + .smgr_writev = fsync_checker_writev, + .smgr_writeback = fsync_checker_writeback, + .smgr_nblocks = NULL, + .smgr_truncate = NULL, + .smgr_immedsync = fsync_checker_immedsync, + .smgr_registersync = NULL, +}; + +static HTAB *volatile_relns; +static LWLock *volatile_relns_lock; +static shmem_request_hook_type prev_shmem_request_hook; +static shmem_startup_hook_type prev_shmem_startup_hook; +static checkpoint_create_hook_type prev_checkpoint_create_hook; + +void +_PG_init(void) +{ + prev_checkpoint_create_hook = checkpoint_create_hook; + checkpoint_create_hook = fsync_checker_checkpoint_create; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = fsync_checker_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = fsync_checker_shmem_startup; + + /* + * Relation size of 0 means we can just defer to md, but it would be nice + * to just expose this functionality, so if I needed my own relation, I + * could use MdSmgrRelation as the parent. + */ + fsync_checker_smgr_id = smgr_register(&fsync_checker_smgr, 0); +} + +static void +fsync_checker_checkpoint_create(const CheckPoint *checkPoint) +{ + long num_entries; + HASH_SEQ_STATUS status; + VolatileRelnEntry *entry; + + if (prev_checkpoint_create_hook) + prev_checkpoint_create_hook(checkPoint); + + LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); + + hash_seq_init(&status, volatile_relns); + + num_entries = hash_get_num_entries(volatile_relns); + elog(INFO, "Analyzing %ld volatile relations", num_entries); + while ((entry = hash_seq_search(&status))) + { + if (entry->lsn < checkPoint->redo) + { + RelPathStr path; + + path = relpathperm(entry->key.locator, entry->key.forknum); + + elog(WARNING, "Relation not previously synced: %s", path.str); + } + } + + LWLockRelease(volatile_relns_lock); +} + +static void +fsync_checker_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(1024, sizeof(VolatileRelnEntry))); + RequestNamedLWLockTranche("fsync_checker volatile relns lock", 1); +} + +static void +fsync_checker_shmem_startup(void) +{ + HASHCTL ctl; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + ctl.keysize = sizeof(VolatileRelnKey); + ctl.entrysize = sizeof(VolatileRelnEntry); + volatile_relns = NULL; + volatile_relns_lock = NULL; + + /* + * Create or attach to the shared memory state, including hash table + */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + volatile_relns = ShmemInitHash("fsync_checker volatile relns", + 1024, 1024, &ctl, HASH_BLOBS | HASH_ELEM); + volatile_relns_lock = &GetNamedLWLockTranche("fsync_checker volatile relns lock")->lock; + + LWLockRelease(AddinShmemInitLock); +} + +static void +add_reln(SMgrRelation reln, ForkNumber forknum) +{ + bool found; + XLogRecPtr lsn; + VolatileRelnKey key; + VolatileRelnEntry *entry; + + key.locator = reln->smgr_rlocator.locator; + key.forknum = forknum; + + lsn = GetXLogWriteRecPtr(); + + LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); + + entry = hash_search(volatile_relns, &key, HASH_ENTER, &found); + if (!found) + entry->lsn = lsn; + + LWLockRelease(volatile_relns_lock); +} + +static void +remove_reln(SMgrRelation reln, ForkNumber forknum) +{ + VolatileRelnKey key; + + key.locator = reln->smgr_rlocator.locator; + key.forknum = forknum; + + LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); + + hash_search(volatile_relns, &key, HASH_REMOVE, NULL); + + LWLockRelease(volatile_relns_lock); +} + +static void +fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync, SmgrChainIndex chain_index) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + smgr_extend_next(reln, forknum, blocknum, buffer, skipFsync, chain_index + 1); +} + +static void +fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + if (!SmgrIsTemp(reln)) + remove_reln(reln, forknum); + + smgr_immedsync_next(reln, forknum, chain_index + 1); +} + +static void +fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void **buffers, + BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + smgr_writev_next(reln, forknum, blocknum, buffers, nblocks, skipFsync, chain_index + 1); +} + +static void +fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks, SmgrChainIndex chain_index) +{ + if (!SmgrIsTemp(reln)) + remove_reln(reln, forknum); + + smgr_writeback_next(reln, forknum, blocknum, nblocks, chain_index + 1); +} + +static void +fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync, SmgrChainIndex chain_index) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + smgr_zeroextend_next(reln, forknum, blocknum, nblocks, skipFsync, chain_index + 1); +} diff --git a/contrib/fsync_checker/meson.build b/contrib/fsync_checker/meson.build new file mode 100644 index 000000000000..ce6ed7fe90bb --- /dev/null +++ b/contrib/fsync_checker/meson.build @@ -0,0 +1,22 @@ +# Copyright (c) 2023, PostgreSQL Global Development Group + +fsync_checker_sources = files( + 'fsync_checker_smgr.c', +) + +if host_system == 'windows' + fsync_checker_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'fsync_checker', + '--FILEDESC', 'fsync_checker - SMGR extension for checking volatile relations',]) +endif + +fsync_checker = shared_module('fsync_checker', + fsync_checker_sources, + kwargs: contrib_mod_args, +) +contrib_targets += fsync_checker + +install_data( + 'fsync_checker.control', + kwargs: contrib_data_args, +) diff --git a/contrib/meson.build b/contrib/meson.build index 1ba73ebd67a3..c48fb1387518 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -28,6 +28,7 @@ subdir('dict_int') subdir('dict_xsyn') subdir('earthdistance') subdir('file_fdw') +subdir('fsync_checker') subdir('fuzzystrmatch') subdir('hstore') subdir('hstore_plperl') diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 4da4dc845803..7563d4d6d597 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -588,6 +588,8 @@ heapam_relation_set_new_filelocator(Relation rel, { SMgrRelation srel; + RelFileLocator oldlocator = rel->rd_locator; + /* * Initialize to the minimum XID that could put tuples in the table. We * know that no xacts older than RecentXmin are still running, so that @@ -605,7 +607,7 @@ heapam_relation_set_new_filelocator(Relation rel, */ *minmulti = GetOldestMultiXactId(); - srel = RelationCreateStorage(*newrlocator, persistence, true); + srel = RelationCreateStorage(oldlocator, *newrlocator, persistence, true); /* * If required, set up an init fork for an unlogged table so that it can @@ -615,7 +617,7 @@ heapam_relation_set_new_filelocator(Relation rel, { Assert(rel->rd_rel->relkind == RELKIND_RELATION || rel->rd_rel->relkind == RELKIND_TOASTVALUE); - smgrcreate(srel, INIT_FORKNUM, false); + smgrcreate(oldlocator, srel, INIT_FORKNUM, false); log_smgrcreate(newrlocator, INIT_FORKNUM); } @@ -648,7 +650,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) * NOTE: any conflict in relfilenumber value will be caught in * RelationCreateStorage(). */ - dstrel = RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true); + dstrel = RelationCreateStorage(rel->rd_locator, *newrlocator, rel->rd_rel->relpersistence, true); /* copy main fork */ RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, @@ -660,7 +662,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { if (smgrexists(RelationGetSmgr(rel), forkNum)) { - smgrcreate(dstrel, forkNum, false); + smgrcreate(rel->rd_locator, dstrel, forkNum, false); /* * WAL log creation if the relation is persistent, or this is the diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 4b6c694a3f71..bc1044bf72d3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -208,6 +208,8 @@ const struct config_enum_entry archive_mode_options[] = { */ CheckpointStatsData CheckpointStats; +checkpoint_create_hook_type checkpoint_create_hook = NULL; + /* * During recovery, lastFullPageWrites keeps track of full_page_writes that * the replayed WAL records indicate. It's initialized with full_page_writes @@ -7173,6 +7175,9 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); + if (checkpoint_create_hook != NULL) + checkpoint_create_hook(&checkPoint); + /* * In some cases there are groups of actions that must all occur on one * side or the other of a checkpoint record. Before flushing the diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d4..2179d2f73fa9 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -487,7 +487,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, * filesystem loses an inode during a crash. Better to write the data * until we are actually told to delete the file.) */ - smgrcreate(smgr, forknum, true); + smgrcreate(rlocator, smgr, forknum, true); lastblock = smgrnblocks(smgr, forknum); diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index bd3554c0bfdc..251d22f50b2f 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -386,7 +386,7 @@ heap_create(const char *relname, relpersistence, relfrozenxid, relminmxid); else if (RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) - RelationCreateStorage(rel->rd_locator, relpersistence, true); + RelationCreateStorage(rel->rd_locator, rel->rd_locator, relpersistence, true); else Assert(false); } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 739a92bdcc1c..8d93bc224829 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3089,7 +3089,7 @@ index_build(Relation heapRelation, if (indexRelation->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && !smgrexists(RelationGetSmgr(indexRelation), INIT_FORKNUM)) { - smgrcreate(RelationGetSmgr(indexRelation), INIT_FORKNUM, false); + smgrcreate(indexRelation->rd_locator, RelationGetSmgr(indexRelation), INIT_FORKNUM, false); log_smgrcreate(&indexRelation->rd_locator, INIT_FORKNUM); indexRelation->rd_indam->ambuildempty(indexRelation); } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 624ed41bbf3a..59fa01decc59 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -118,7 +118,7 @@ AddPendingSync(const RelFileLocator *rlocator) * pass register_delete = false. */ SMgrRelation -RelationCreateStorage(RelFileLocator rlocator, char relpersistence, +RelationCreateStorage(RelFileLocator oldlocator, RelFileLocator rlocator, char relpersistence, bool register_delete) { SMgrRelation srel; @@ -147,7 +147,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, } srel = smgropen(rlocator, procNumber); - smgrcreate(srel, MAIN_FORKNUM, false); + smgrcreate(oldlocator, srel, MAIN_FORKNUM, false); if (needs_wal) log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM); @@ -976,7 +976,7 @@ smgr_redo(XLogReaderState *record) SMgrRelation reln; reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); - smgrcreate(reln, xlrec->forkNum, true); + smgrcreate(xlrec->rlocator, reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) { @@ -997,7 +997,7 @@ smgr_redo(XLogReaderState *record) * XLogReadBufferForRedo, we prefer to recreate the rel and replay the * log as best we can until the drop is seen. */ - smgrcreate(reln, MAIN_FORKNUM, true); + smgrcreate(xlrec->rlocator, reln, MAIN_FORKNUM, true); /* * Before we perform the truncation, update minimum recovery point to diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 4b7c5113aabe..d8c560a11b28 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -344,7 +344,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) SMgrRelation srel; srel = smgropen(rel->rd_locator, INVALID_PROC_NUMBER); - smgrcreate(srel, INIT_FORKNUM, false); + smgrcreate(rel->rd_locator, srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_locator, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); FlushRelationBuffers(rel); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 129c97fdf28f..152b0d38969b 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16432,7 +16432,7 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) * NOTE: any conflict in relfilenumber value will be caught in * RelationCreateStorage(). */ - dstrel = RelationCreateStorage(newrlocator, rel->rd_rel->relpersistence, true); + dstrel = RelationCreateStorage(rel->rd_locator, newrlocator, rel->rd_rel->relpersistence, true); /* copy main fork */ RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, @@ -16444,7 +16444,7 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) { if (smgrexists(RelationGetSmgr(rel), forkNum)) { - smgrcreate(dstrel, forkNum, false); + smgrcreate(rel->rd_locator, dstrel, forkNum, false); /* * WAL log creation if the relation is persistent, or this is the diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d13846298bd5..0497dce756d4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -915,11 +915,18 @@ PostmasterMain(int argc, char *argv[]) */ ApplyLauncherRegister(); + /* + * Register built-in managers that are not part of static arrays + */ + register_builtin_dynamic_managers(); + /* * process any libraries that should be preloaded at postmaster start */ process_shared_preload_libraries(); + process_smgr_chain(); + /* * Initialize SSL library, if specified. */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 79ca9d18d07b..8abe47833c3a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -945,7 +945,7 @@ ExtendBufferedRelTo(BufferManagerRelation bmr, /* recheck, fork might have been created concurrently */ if (!smgrexists(bmr.smgr, fork)) - smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY); + smgrcreate(bmr.rel->rd_locator, bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY); UnlockRelationForExtension(bmr.rel, ExclusiveLock); } @@ -4743,7 +4743,7 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, * directory. Therefore, each individual relation doesn't need to be * registered for cleanup. */ - RelationCreateStorage(dst_rlocator, relpersistence, false); + RelationCreateStorage(src_rlocator, dst_rlocator, relpersistence, false); /* copy main fork. */ RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM, @@ -4755,7 +4755,8 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, { if (smgrexists(src_rel, forkNum)) { - smgrcreate(dst_rel, forkNum, false); + /* TODO: for sure? */ + smgrcreate(src_rel->smgr_rlocator.locator, dst_rel, forkNum, false); /* * WAL log creation if the relation is persistent, or this is the diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index f3220f98dc42..963bd0e9cde7 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -84,6 +84,21 @@ typedef struct _MdfdVec } MdfdVec; static MemoryContext MdCxt; /* context for all MdfdVec objects */ +SMgrId MdSMgrId; + +typedef struct +{ + SMgrRelationData reln; /* parent data */ + + /* + * for md.c; per-fork arrays of the number of open segments + * (md_num_open_segs) and the segments themselves (md_seg_fds). + */ + int md_num_open_segs[MAX_FORKNUM + 1]; + MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; +} MdSMgrRelationData; + +typedef MdSMgrRelationData *MdSMgrRelation; /* Populate a file tag describing an md.c segment file. */ @@ -109,6 +124,33 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ /* don't try to open a segment, if not already open */ #define EXTENSION_DONT_OPEN (1 << 5) +/* md storage manager functionality */ +static void mdinit(void); +static void mdopen(SMgrRelation reln, SmgrChainIndex chain_index); +static void mdclose(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +static void mdcreate(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index); +static bool mdexists(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +static void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index); +static void mdextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync, SmgrChainIndex chain_index); +static void mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync, SmgrChainIndex chain_index); +static bool mdprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, SmgrChainIndex chain_index); +static uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, SmgrChainIndex chain_index); +static void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks, SmgrChainIndex chain_index); +static void mdwritev(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index); +static void mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks, SmgrChainIndex chain_index); +static BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +static void mdtruncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber old_blocks, BlockNumber nblocks, SmgrChainIndex chain_index); +static void mdimmedsync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +static void mdregistersync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); /* * Fixed-length string to represent paths to files that need to be built by @@ -130,26 +172,56 @@ typedef struct MdPathStr } MdPathStr; +void +mdsmgr_register(void) +{ + /* magnetic disk */ + f_smgr md_smgr = (f_smgr) { + .name = "md", + .chain_position = SMGR_CHAIN_TAIL, + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_maxcombine = mdmaxcombine, + .smgr_readv = mdreadv, + .smgr_writev = mdwritev, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, + .smgr_registersync = mdregistersync, + }; + + MdSMgrId = smgr_register(&md_smgr, sizeof(MdSMgrRelationData)); +} + /* local routines */ static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); -static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); -static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior); +static void register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno); static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno); -static void _fdvec_resize(SMgrRelation reln, +static void _fdvec_resize(MdSMgrRelation reln, ForkNumber forknum, int nseg); -static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, +static MdPathStr _mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno); -static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags); -static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior); -static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, +static BlockNumber _mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static inline int @@ -166,7 +238,7 @@ _mdfd_open_flags(void) /* * mdinit() -- Initialize private state for magnetic disk storage manager. */ -void +static void mdinit(void) { MdCxt = AllocSetContextCreate(TopMemoryContext, @@ -179,18 +251,20 @@ mdinit(void) * * Note: this will return true for lingering files, with pending deletions */ -bool -mdexists(SMgrRelation reln, ForkNumber forknum) +static bool +mdexists(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* * Close it first, to ensure that we notice if the fork has been unlinked * since we opened it. As an optimization, we can skip that in recovery, * which already closes relations when dropping them. */ if (!InRecovery) - mdclose(reln, forknum); + mdclose(reln, forknum, 0); - return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL); + return (mdopenfork(mdreln, forknum, EXTENSION_RETURN_NULL) != NULL); } /* @@ -198,17 +272,18 @@ mdexists(SMgrRelation reln, ForkNumber forknum) * * If isRedo is true, it's okay for the relation to exist already. */ -void -mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +static void +mdcreate(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; MdfdVec *mdfd; RelPathStr path; File fd; - if (isRedo && reln->md_num_open_segs[forknum] > 0) + if (isRedo && mdreln->md_num_open_segs[forknum] > 0) return; /* created and opened already... */ - Assert(reln->md_num_open_segs[forknum] == 0); + Assert(mdreln->md_num_open_segs[forknum] == 0); /* * We may be using the target table space for the first time in this @@ -243,13 +318,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) } } - _fdvec_resize(reln, forknum, 1); - mdfd = &reln->md_seg_fds[forknum][0]; + _fdvec_resize(mdreln, forknum, 1); + mdfd = &mdreln->md_seg_fds[forknum][0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, mdfd); + register_dirty_segment(mdreln, forknum, mdfd); } /* @@ -313,8 +388,8 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) * Note: any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. */ -void -mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) +static void +mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index) { /* Now do the per-fork work */ if (forknum == InvalidForkNumber) @@ -463,10 +538,11 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) * EOF). Note that we assume writing a block beyond current EOF * causes intervening file space to become filled with zeroes. */ -void +static void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync) + const void *buffer, bool skipFsync, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; off_t seekpos; int nbytes; MdfdVec *v; @@ -493,7 +569,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forknum).str, InvalidBlockNumber))); - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -517,9 +593,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); } /* @@ -528,10 +604,11 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Similar to mdextend(), except the relation can be extended by multiple * blocks at once and the added blocks will be filled with zeroes. */ -void +static void mdzeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync) + BlockNumber blocknum, int nblocks, bool skipFsync, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; MdfdVec *v; BlockNumber curblocknum = blocknum; int remblocks = nblocks; @@ -566,7 +643,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, else numblocks = remblocks; - v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + v = _mdfd_getseg(mdreln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); Assert(segstartblock < RELSEG_SIZE); Assert(segstartblock + numblocks <= RELSEG_SIZE); @@ -621,9 +698,9 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); remblocks -= numblocks; curblocknum += numblocks; @@ -641,7 +718,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * invent one out of whole cloth. */ static MdfdVec * -mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) +mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior) { MdfdVec *mdfd; RelPathStr path; @@ -651,7 +728,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) if (reln->md_num_open_segs[forknum] > 0) return &reln->md_seg_fds[forknum][0]; - path = relpath(reln->smgr_rlocator, forknum); + path = relpath(reln->reln.smgr_rlocator, forknum); fd = PathNameOpenFile(path.str, _mdfd_open_flags()); @@ -678,21 +755,24 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) /* * mdopen() -- Initialize newly-opened relation. */ -void -mdopen(SMgrRelation reln) +static void +mdopen(SMgrRelation reln, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* mark it not open */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) - reln->md_num_open_segs[forknum] = 0; + mdreln->md_num_open_segs[forknum] = 0; } /* * mdclose() -- Close the specified relation, if it isn't closed already. */ -void -mdclose(SMgrRelation reln, ForkNumber forknum) +static void +mdclose(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) { - int nopensegs = reln->md_num_open_segs[forknum]; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + int nopensegs = mdreln->md_num_open_segs[forknum]; /* No work if already closed */ if (nopensegs == 0) @@ -701,10 +781,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* close segments starting from the end */ while (nopensegs > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][nopensegs - 1]; FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, nopensegs - 1); + _fdvec_resize(mdreln, forknum, nopensegs - 1); nopensegs--; } } @@ -712,11 +792,12 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation */ -bool +static bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int nblocks) + int nblocks, SmgrChainIndex chain_index) { #ifdef USE_PREFETCH + MdSMgrRelation mdreln = (MdSMgrRelation) reln; Assert((io_direct_flags & IO_DIRECT_DATA) == 0); @@ -729,7 +810,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MdfdVec *v; int nblocks_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(mdreln, forknum, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) return false; @@ -809,9 +890,9 @@ buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks) * mdmaxcombine() -- Return the maximum number of total blocks that can be * combined with an IO starting at blocknum. */ -uint32 +static uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum) + BlockNumber blocknum, SmgrChainIndex index) { BlockNumber segoff; @@ -823,10 +904,12 @@ mdmaxcombine(SMgrRelation reln, ForkNumber forknum, /* * mdreadv() -- Read the specified blocks from a relation. */ -void +static void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks) + void **buffers, BlockNumber nblocks, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; @@ -838,7 +921,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, size_t transferred_this_segment; size_t size_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(mdreln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -944,10 +1027,12 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ -void +static void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void **buffers, BlockNumber nblocks, bool skipFsync) + const void **buffers, BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum)); @@ -964,7 +1049,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, size_t transferred_this_segment; size_t size_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -1034,7 +1119,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); nblocks -= nblocks_this_segment; buffers += nblocks_this_segment; @@ -1049,10 +1134,12 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ -void +static void mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) + BlockNumber blocknum, BlockNumber nblocks, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); /* @@ -1067,7 +1154,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, int segnum_start, segnum_end; - v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + v = _mdfd_getseg(mdreln, forknum, blocknum, true /* not used */ , EXTENSION_DONT_OPEN); /* @@ -1108,17 +1195,18 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, * called, then only segments up to the last one actually touched * are present in the array. */ -BlockNumber -mdnblocks(SMgrRelation reln, ForkNumber forknum) +static BlockNumber +mdnblocks(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; MdfdVec *v; BlockNumber nblocks; BlockNumber segno; - mdopenfork(reln, forknum, EXTENSION_FAIL); + mdopenfork(mdreln, forknum, EXTENSION_FAIL); /* mdopen has opened the first segment */ - Assert(reln->md_num_open_segs[forknum] > 0); + Assert(mdreln->md_num_open_segs[forknum] > 0); /* * Start from the last open segments, to avoid redundant seeks. We have @@ -1133,12 +1221,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * that's OK because the checkpointer never needs to compute relation * size.) */ - segno = reln->md_num_open_segs[forknum] - 1; - v = &reln->md_seg_fds[forknum][segno]; + segno = mdreln->md_num_open_segs[forknum] - 1; + v = &mdreln->md_seg_fds[forknum][segno]; for (;;) { - nblocks = _mdnblocks(reln, forknum, v); + nblocks = _mdnblocks(mdreln, forknum, v); if (nblocks > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big"); if (nblocks < ((BlockNumber) RELSEG_SIZE)) @@ -1156,7 +1244,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * undermines _mdfd_getseg's attempts to notice and report an error * upon access to a missing segment. */ - v = _mdfd_openseg(reln, forknum, segno, 0); + v = _mdfd_openseg(mdreln, forknum, segno, 0); if (v == NULL) return segno * ((BlockNumber) RELSEG_SIZE); } @@ -1172,10 +1260,11 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * sure we have opened all active segments, so that truncate loop will get * them all! */ -void +static void mdtruncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber curnblk, BlockNumber nblocks) + BlockNumber curnblk, BlockNumber nblocks, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; BlockNumber priorblocks; int curopensegs; @@ -1196,14 +1285,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, * Truncate segments, starting at the last one. Starting at the end makes * managing the memory for the fd array easier, should there be errors. */ - curopensegs = reln->md_num_open_segs[forknum]; + curopensegs = mdreln->md_num_open_segs[forknum]; while (curopensegs > 0) { MdfdVec *v; priorblocks = (curopensegs - 1) * RELSEG_SIZE; - v = &reln->md_seg_fds[forknum][curopensegs - 1]; + v = &mdreln->md_seg_fds[forknum][curopensegs - 1]; if (priorblocks > nblocks) { @@ -1218,13 +1307,13 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, FilePathName(v->mdfd_vfd)))); if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); /* we never drop the 1st segment */ - Assert(v != &reln->md_seg_fds[forknum][0]); + Assert(v != &mdreln->md_seg_fds[forknum][0]); FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, curopensegs - 1); + _fdvec_resize(mdreln, forknum, curopensegs - 1); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { @@ -1244,7 +1333,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, FilePathName(v->mdfd_vfd), nblocks))); if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); } else { @@ -1261,9 +1350,10 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, /* * mdregistersync() -- Mark whole relation as needing fsync */ -void -mdregistersync(SMgrRelation reln, ForkNumber forknum) +static void +mdregistersync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; int segno; int min_inactive_seg; @@ -1271,9 +1361,9 @@ mdregistersync(SMgrRelation reln, ForkNumber forknum) * NOTE: mdnblocks makes sure we have opened all active segments, so that * the loop below will get them all! */ - mdnblocks(reln, forknum); + mdnblocks(reln, forknum, 0); - min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = mdreln->md_num_open_segs[forknum]; /* * Temporarily open inactive segments, then close them after sync. There @@ -1281,20 +1371,20 @@ mdregistersync(SMgrRelation reln, ForkNumber forknum) * harmless. We don't bother to clean them up and take a risk of further * trouble. The next mdclose() will soon close them. */ - while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL) segno++; while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1]; - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); /* Close inactive segments immediately */ if (segno > min_inactive_seg) { FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, segno - 1); + _fdvec_resize(mdreln, forknum, segno - 1); } segno--; @@ -1312,9 +1402,10 @@ mdregistersync(SMgrRelation reln, ForkNumber forknum) * crash before the next checkpoint syncs the newly-inactive segment, that * segment may survive recovery, reintroducing unwanted data into the table. */ -void -mdimmedsync(SMgrRelation reln, ForkNumber forknum) +static void +mdimmedsync(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; int segno; int min_inactive_seg; @@ -1322,9 +1413,9 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * NOTE: mdnblocks makes sure we have opened all active segments, so that * the loop below will get them all! */ - mdnblocks(reln, forknum); + mdnblocks(reln, forknum, 0); - min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = mdreln->md_num_open_segs[forknum]; /* * Temporarily open inactive segments, then close them after sync. There @@ -1332,12 +1423,12 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * is harmless. We don't bother to clean them up and take a risk of * further trouble. The next mdclose() will soon close them. */ - while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL) segno++; while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1]; /* * fsyncs done through mdimmedsync() should be tracked in a separate @@ -1358,7 +1449,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) if (segno > min_inactive_seg) { FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, segno - 1); + _fdvec_resize(mdreln, forknum, segno - 1); } segno--; @@ -1375,14 +1466,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * enough to be a performance problem). */ static void -register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { FileTag tag; - INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); + INIT_MD_FILETAG(tag, reln->reln.smgr_rlocator.locator, forknum, seg->mdfd_segno); /* Temp relations should never be fsync'd */ - Assert(!SmgrIsTemp(reln)); + Assert(!SmgrIsTemp(&reln->reln)); if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { @@ -1500,7 +1591,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) * _fdvec_resize() -- Resize the fork's open segments array */ static void -_fdvec_resize(SMgrRelation reln, +_fdvec_resize(MdSMgrRelation reln, ForkNumber forknum, int nseg) { @@ -1548,12 +1639,12 @@ _fdvec_resize(SMgrRelation reln, * returned string is palloc'd. */ static MdPathStr -_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) +_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno) { RelPathStr path; MdPathStr fullpath; - path = relpath(reln->smgr_rlocator, forknum); + path = relpath(reln->reln.smgr_rlocator, forknum); if (segno > 0) sprintf(fullpath.str, "%s.%u", path.str, segno); @@ -1568,7 +1659,7 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) * and make a MdfdVec object for it. Returns NULL on failure. */ static MdfdVec * -_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, +_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags) { MdfdVec *v; @@ -1611,7 +1702,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, * EXTENSION_CREATE case. */ static MdfdVec * -_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior) { MdfdVec *v; @@ -1685,9 +1776,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); - mdextend(reln, forknum, + mdextend((SMgrRelation) reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, - zerobuf, skipFsync); + zerobuf, skipFsync, 0); pfree(zerobuf); } flags = O_CREAT; @@ -1740,7 +1831,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * Get number of blocks present in a single disk file */ static BlockNumber -_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +_mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { off_t len; @@ -1763,7 +1854,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER); + MdSMgrRelation reln = (MdSMgrRelation) smgropen(ftag->rlocator, INVALID_PROC_NUMBER); File file; instr_time io_start; bool need_to_close; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index ebe35c04de59..088925637682 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -53,84 +53,23 @@ #include "access/xlogutils.h" #include "lib/ilist.h" +#include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" +#include "port/atomics.h" #include "utils/hsearch.h" #include "utils/inval.h" +#include "utils/memutils.h" +f_smgr *smgrsw; -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. - */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks); - uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - void **buffers, BlockNumber nblocks); - void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - const void **buffers, BlockNumber nblocks, - bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber old_blocks, BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { - /* magnetic disk */ - { - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = mdextend, - .smgr_zeroextend = mdzeroextend, - .smgr_prefetch = mdprefetch, - .smgr_maxcombine = mdmaxcombine, - .smgr_readv = mdreadv, - .smgr_writev = mdwritev, - .smgr_writeback = mdwriteback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = mdimmedsync, - .smgr_registersync = mdregistersync, - } -}; +static int NSmgr = 0; -static const int NSmgr = lengthof(smgrsw); +static Size LargestSMgrRelationSize = 0; + +SMgrChain storage_manager_chain; /* * Each backend has a hashtable that stores all extant SMgrRelation objects. @@ -144,6 +83,74 @@ static dlist_head unpinned_relns; static void smgrshutdown(int code, Datum arg); static void smgrdestroy(SMgrRelation reln); +#define MaxSMgrId UINT8_MAX + +SMgrId +smgr_register(const f_smgr *smgr, Size smgrrelation_size) +{ + SMgrId my_id; + MemoryContext old; + + if (process_shared_preload_libraries_done) + elog(FATAL, "SMgrs must be registered in the shared_preload_libraries phase"); + if (NSmgr == MaxSMgrId) + elog(FATAL, "Too many smgrs registered"); + if (smgr->name == NULL || *smgr->name == 0) + elog(FATAL, "smgr registered with invalid name"); + + if (smgr->chain_position == SMGR_CHAIN_TAIL) + { + Assert(smgr->smgr_open != NULL); + Assert(smgr->smgr_close != NULL); + Assert(smgr->smgr_create != NULL); + Assert(smgr->smgr_exists != NULL); + Assert(smgr->smgr_unlink != NULL); + Assert(smgr->smgr_extend != NULL); + Assert(smgr->smgr_zeroextend != NULL); + Assert(smgr->smgr_prefetch != NULL); + Assert(smgr->smgr_readv != NULL); + Assert(smgr->smgr_writev != NULL); + Assert(smgr->smgr_writeback != NULL); + Assert(smgr->smgr_nblocks != NULL); + Assert(smgr->smgr_truncate != NULL); + Assert(smgr->smgr_immedsync != NULL); + } + + old = MemoryContextSwitchTo(TopMemoryContext); + + my_id = NSmgr++; + if (my_id == 0) + smgrsw = palloc_array(f_smgr, 1); + else + smgrsw = repalloc_array(smgrsw, f_smgr, NSmgr); + + MemoryContextSwitchTo(old); + + pg_compiler_barrier(); + + if (!smgrsw) + { + NSmgr--; + elog(FATAL, "Failed to extend smgr array"); + } + + smgrsw[my_id] = *smgr; + + LargestSMgrRelationSize = Max(LargestSMgrRelationSize, smgrrelation_size); + + return my_id; +} + +SMgrId +smgr_lookup(const char *name) +{ + for (int i = 0; i < NSmgr; i++) + { + if (strcmp(smgrsw[i].name, name) == 0) + return i; + } + elog(FATAL, "Storage manager not found with name: %s", name); +} /* * smgrinit(), smgrshutdown() -- Initialize or shut down storage @@ -183,6 +190,22 @@ smgrshutdown(int code, Datum arg) } } +#define SMGR_CHAIN_LOOKUP(SMGR_METHOD) \ + do \ + { \ + while (chain_index < reln->smgr_chain.size && smgrsw[reln->smgr_chain.chain[chain_index]].SMGR_METHOD == NULL) \ + chain_index++; \ + Assert(chain_index < reln->smgr_chain.size); \ + } while (0) + +void +smgr_open_next(SMgrRelation reln, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_open); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_open(reln, chain_index); +} + /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * @@ -211,8 +234,11 @@ smgropen(RelFileLocator rlocator, ProcNumber backend) /* First time through: initialize the hash table */ HASHCTL ctl; + LargestSMgrRelationSize = MAXALIGN(LargestSMgrRelationSize); + Assert(NSmgr > 0); + ctl.keysize = sizeof(RelFileLocatorBackend); - ctl.entrysize = sizeof(SMgrRelationData); + ctl.entrysize = LargestSMgrRelationSize; SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_BLOBS); dlist_init(&unpinned_relns); @@ -232,10 +258,11 @@ smgropen(RelFileLocator rlocator, ProcNumber backend) reln->smgr_targblock = InvalidBlockNumber; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + memcpy(&reln->smgr_chain, &storage_manager_chain, sizeof(SMgrChain)); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + smgr_open_next(reln, 0); /* it is not pinned yet */ reln->pincount = 0; @@ -273,6 +300,14 @@ smgrunpin(SMgrRelation reln) dlist_push_tail(&unpinned_relns, &reln->node); } +void +smgr_close_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_close); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_close(reln, forknum, chain_index); +} + /* * smgrdestroy() -- Delete an SMgrRelation object. */ @@ -284,7 +319,7 @@ smgrdestroy(SMgrRelation reln) Assert(reln->pincount == 0); for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + smgr_close_next(reln, forknum, 0); dlist_delete(&reln->node); @@ -304,7 +339,7 @@ smgrrelease(SMgrRelation reln) { for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + smgr_close_next(reln, forknum, 0); reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } reln->smgr_targblock = InvalidBlockNumber; @@ -394,13 +429,29 @@ smgrreleaserellocator(RelFileLocatorBackend rlocator) smgrrelease(reln); } +bool +smgr_exists_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_exists); + + return smgrsw[reln->smgr_chain.chain[chain_index]].smgr_exists(reln, forknum, chain_index); +} + /* * smgrexists() -- Does the underlying file for a fork exist? */ bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return smgr_exists_next(reln, forknum, 0); +} + +void +smgr_create_next(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_create); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_create(relold, reln, forknum, isRedo, chain_index); } /* @@ -411,9 +462,17 @@ smgrexists(SMgrRelation reln, ForkNumber forknum) * to be created. */ void -smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +smgrcreate(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + smgr_create_next(relold, reln, forknum, isRedo, 0); +} + +void +smgr_immedsync_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_immedsync); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_immedsync(reln, forknum, chain_index); } /* @@ -441,16 +500,22 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if (smgr_exists_next(rels[i], forknum, 0)) + smgr_immedsync_next(rels[i], forknum, 0); } } } +void +smgr_unlink_next(SMgrRelation reln, RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_unlink); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_unlink(rlocator, forknum, isRedo, chain_index); +} + /* * smgrdounlinkall() -- Immediately unlink all forks of all given relations * @@ -485,13 +550,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; - int which = rels[i]->smgr_which; rlocators[i] = rlocator; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + smgr_close_next(rels[i], forknum, 0); } /* @@ -515,15 +579,22 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo); + smgr_unlink_next(rels[i], rlocators[i], forknum, isRedo, 0); } pfree(rlocators); } +void +smgr_extend_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_extend); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_extend(reln, forknum, blocknum, + buffer, skipFsync, chain_index); +} /* * smgrextend() -- Add a new block to a file. @@ -538,8 +609,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, - buffer, skipFsync); + smgr_extend_next(reln, forknum, blocknum, buffer, skipFsync, 0); /* * Normally we expect this to increase nblocks by one, but if the cached @@ -552,6 +622,16 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } +void +smgr_zeroextend_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_zeroextend); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_zeroextend(reln, forknum, blocknum, + nblocks, skipFsync, chain_index); +} + /* * smgrzeroextend() -- Add new zeroed out blocks to a file. * @@ -563,8 +643,7 @@ void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, - nblocks, skipFsync); + smgr_zeroextend_next(reln, forknum, blocknum, nblocks, skipFsync, 0); /* * Normally we expect this to increase the fork size by nblocks, but if @@ -577,6 +656,16 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } +bool +smgr_prefetch_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_prefetch); + + return smgrsw[reln->smgr_chain.chain[chain_index]].smgr_prefetch(reln, forknum, blocknum, + nblocks, chain_index); +} + /* * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. * @@ -588,7 +677,16 @@ bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks); + return smgr_prefetch_next(reln, forknum, blocknum, nblocks, 0); +} + +uint32 +smgr_maxcombine_next(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_maxcombine); + + return smgrsw[reln->smgr_chain.chain[chain_index]].smgr_maxcombine(reln, forknum, blocknum, chain_index); } /* @@ -601,7 +699,17 @@ uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum); + return smgr_maxcombine_next(reln, forknum, blocknum, 0); +} + +void +smgr_readv_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_readv); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_readv(reln, forknum, blocknum, + buffers, nblocks, chain_index); } /* @@ -619,8 +727,17 @@ void smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers, - nblocks); + smgr_readv_next(reln, forknum, blocknum, buffers, nblocks, 0); +} + +void +smgr_writev_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_writev); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_writev(reln, forknum, blocknum, + buffers, nblocks, skipFsync, chain_index); } /* @@ -653,8 +770,17 @@ void smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum, - buffers, nblocks, skipFsync); + smgr_writev_next(reln, forknum, blocknum, + buffers, nblocks, skipFsync, 0); +} + +void +smgr_writeback_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_writeback); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_writeback(reln, forknum, blocknum, nblocks, chain_index); } /* @@ -665,8 +791,15 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, - nblocks); + smgr_writeback_next(reln, forknum, blocknum, nblocks, 0); +} + +extern BlockNumber +smgr_nblocks_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_nblocks); + + return smgrsw[reln->smgr_chain.chain[chain_index]].smgr_nblocks(reln, forknum, chain_index); } /* @@ -683,7 +816,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = smgr_nblocks_next(reln, forknum, 0); reln->smgr_cached_nblocks[forknum] = result; @@ -711,6 +844,14 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) return InvalidBlockNumber; } +void +smgr_truncate_next(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_truncate); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_truncate(reln, forknum, curnblk, nblocks, chain_index); +} + /* * smgrtruncate() -- Truncate the given forks of supplied relation to * each specified numbers of blocks @@ -755,8 +896,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], - old_nblocks[i], nblocks[i]); + smgr_truncate_next(reln, forknum[i], old_nblocks[i], nblocks[i], 0); /* * We might as well update the local smgr_cached_nblocks values. The @@ -769,6 +909,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, } } +void +smgr_registersync_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index) +{ + SMGR_CHAIN_LOOKUP(smgr_registersync); + + smgrsw[reln->smgr_chain.chain[chain_index]].smgr_registersync(reln, forknum, chain_index); +} + /* * smgrregistersync() -- Request a relation to be sync'd at next checkpoint * @@ -784,7 +932,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, void smgrregistersync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_registersync(reln, forknum); + smgr_registersync_next(reln, forknum, 0); } /* @@ -816,7 +964,7 @@ smgrregistersync(SMgrRelation reln, ForkNumber forknum) void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + smgr_immedsync_next(reln, forknum, 0); } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 55ab2da299b9..e806a4528aee 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4072,6 +4072,8 @@ PostgresSingleUserMain(int argc, char *argv[], */ process_shared_preload_libraries(); + process_smgr_chain(); + /* Initialize MaxBackends */ InitializeMaxBackends(); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 9f54a9e72b73..b565d38b7b67 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3832,7 +3832,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) /* handle these directly, at least for now */ SMgrRelation srel; - srel = RelationCreateStorage(newrlocator, persistence, true); + srel = RelationCreateStorage(relation->rd_locator, newrlocator, persistence, true); smgrclose(srel); } else diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index dc3521457c76..32d99e1244a9 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -43,6 +43,7 @@ #include "replication/slotsync.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/md.h" #include "storage/latch.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -55,6 +56,7 @@ #include "utils/pidfile.h" #include "utils/syscache.h" #include "utils/varlena.h" +#include "storage/smgr.h" #define DIRECTORY_LOCK_FILE "postmaster.pid" @@ -192,6 +194,9 @@ InitStandaloneProcess(const char *argv0) InitProcessLocalLatch(); InitializeLatchWaitSet(); + /* Initialize smgrs */ + register_builtin_dynamic_managers(); + /* * For consistency with InitPostmasterChild, initialize signal mask here. * But we don't unblock SIGQUIT or provide a default handler for it. @@ -1830,6 +1835,8 @@ char *session_preload_libraries_string = NULL; char *shared_preload_libraries_string = NULL; char *local_preload_libraries_string = NULL; +char *smgr_chain_string = NULL; + /* Flag telling that we are loading shared_preload_libraries */ bool process_shared_preload_libraries_in_progress = false; bool process_shared_preload_libraries_done = false; @@ -1906,6 +1913,62 @@ process_shared_preload_libraries(void) process_shared_preload_libraries_done = true; } +void +process_smgr_chain(void) +{ + char *rawstring; + List *elemlist; + ListCell *l; + uint8 idx = 0; + + if (smgr_chain_string == NULL || smgr_chain_string[0] == '\0') + return; /* nothing to do */ + + /* Need a modifiable copy of string */ + rawstring = pstrdup(smgr_chain_string); + + /* Parse string into list of filename paths */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + pfree(rawstring); + ereport(LOG, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid list syntax in parameter \"%s\"", + "smgr_chain"))); + return; + } + + foreach(l, elemlist) + { + char *smgrname = (char *) lfirst(l); + SMgrId id = smgr_lookup(smgrname); + + storage_manager_chain.chain[idx++] = id; + + ereport(DEBUG1, + (errmsg_internal("using storage manager in chain \"%s\"", smgrname))); + } + + for (int i = 0; i < idx; ++i) + { + int chain_position = smgrsw[storage_manager_chain.chain[i]].chain_position; + + if (i == idx - 1 && chain_position != SMGR_CHAIN_TAIL) + ereport(FATAL, + (errmsg_internal("smgr_chain: the last element should be a `tail` implementation, not a modifier."))); + + if (i != idx - 1 && chain_position != SMGR_CHAIN_MODIFIER) + ereport(FATAL, + (errmsg_internal("smgr_chain: element %i/%i %s is not a modifier.", i, idx, smgrsw[storage_manager_chain.chain[i]].name))); + } + + storage_manager_chain.size = idx; + + list_free(elemlist); + pfree(rawstring); +} + /* * process any libraries that should be preloaded at backend start */ @@ -1920,6 +1983,19 @@ process_session_preload_libraries(void) true); } +/* + * Register any internal managers. + */ +void +register_builtin_dynamic_managers(void) +{ + mdsmgr_register(); + + /* setup a dummy chain with md, for tools */ + storage_manager_chain.chain[0] = MdSMgrId; + storage_manager_chain.size = 1; +} + /* * process any shared memory requests from preloaded libraries */ diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 9c0b10ad4dc2..ad578de73a44 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -4408,6 +4408,17 @@ struct config_string ConfigureNamesString[] = NULL, NULL, NULL }, + { + {"smgr_chain", PGC_POSTMASTER, CLIENT_CONN_PRELOAD, + gettext_noop("Lists storage managers used by the server, in order."), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY + }, + &smgr_chain_string, + "md", + NULL, NULL, NULL + }, + { {"search_path", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Sets the schema search order for names that are not schema-qualified."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 8de86e0c9454..e8d56f5426e0 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -790,6 +790,7 @@ autovacuum_worker_slots = 16 # autovacuum worker slots to allocate #session_preload_libraries = '' #shared_preload_libraries = '' # (change requires restart) #jit_provider = 'llvmjit' # JIT library to use +#smgr_chain = 'md' # SMGR implementations to use # - Other Defaults - diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d313099c027f..8aab37ef52d7 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -13,6 +13,7 @@ #include "access/xlogbackup.h" #include "access/xlogdefs.h" +#include "catalog/pg_control.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" @@ -59,6 +60,9 @@ extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +typedef void (*checkpoint_create_hook_type) (const CheckPoint *); +extern PGDLLIMPORT checkpoint_create_hook_type checkpoint_create_hook; + /* Archive modes */ typedef enum ArchiveMode { diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index ba99225b0a37..ecc3b792f4f5 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -22,7 +22,8 @@ /* GUC variables */ extern PGDLLIMPORT int wal_skip_threshold; -extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator, +extern SMgrRelation RelationCreateStorage(RelFileLocator oldlocator, + RelFileLocator rlocator, char relpersistence, bool register_delete); extern void RelationDropStorage(Relation rel); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 6f16794eb63c..7bdb760f5c1c 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -513,6 +513,7 @@ extern PGDLLIMPORT bool process_shmem_requests_in_progress; extern PGDLLIMPORT char *session_preload_libraries_string; extern PGDLLIMPORT char *shared_preload_libraries_string; extern PGDLLIMPORT char *local_preload_libraries_string; +extern PGDLLIMPORT char *smgr_chain_string; extern void CreateDataDirLockFile(bool amPostmaster); extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster, @@ -521,7 +522,9 @@ extern void TouchSocketLockFiles(void); extern void AddToDataDirLockFile(int target_line, const char *str); extern bool RecheckDataDirLockFile(void); extern void ValidatePgVersion(const char *path); +extern void register_builtin_dynamic_managers(void); extern void process_shared_preload_libraries(void); +extern void process_smgr_chain(void); extern void process_session_preload_libraries(void); extern void process_shmem_requests(void); extern void pg_bindtextdomain(const char *domain); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 05bf537066eb..5b4992c0855b 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -19,33 +19,9 @@ #include "storage/smgr.h" #include "storage/sync.h" -/* md storage manager functionality */ -extern void mdinit(void); -extern void mdopen(SMgrRelation reln); -extern void mdclose(SMgrRelation reln, ForkNumber forknum); -extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool mdexists(SMgrRelation reln, ForkNumber forknum); -extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); -extern void mdextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); -extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); -extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks); -extern uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void **buffers, BlockNumber nblocks); -extern void mdwritev(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - const void **buffers, BlockNumber nblocks, bool skipFsync); -extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); -extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber old_blocks, BlockNumber nblocks); -extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); -extern void mdregistersync(SMgrRelation reln, ForkNumber forknum); +/* registration function for md storage manager */ +extern void mdsmgr_register(void); +extern SMgrId MdSMgrId; extern void ForgetDatabaseSyncRequests(Oid dbid); extern void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 4016b206ad66..8f789cb7f801 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,20 @@ #include "storage/block.h" #include "storage/relfilelocator.h" +typedef uint8 SMgrId; + +typedef uint8 SmgrChainIndex; + +#define MAX_SMGR_CHAIN 15 + +typedef struct +{ + SMgrId chain[MAX_SMGR_CHAIN]; /* storage manager selector */ + uint8 size; +} SMgrChain; + +extern PGDLLIMPORT SMgrChain storage_manager_chain; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -51,14 +65,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ - - /* - * for md.c; per-fork arrays of the number of open segments - * (md_num_open_segs) and the segments themselves (md_seg_fds). - */ - int md_num_open_segs[MAX_FORKNUM + 1]; - struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + SMgrChain smgr_chain; /* selected storage manager chain */ /* * Pinning support. If unpinned (ie. pincount == 0), 'node' is a list @@ -73,6 +80,61 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) +#define SMGR_CHAIN_TAIL 1 +#define SMGR_CHAIN_MODIFIER 2 + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. + */ +typedef struct f_smgr +{ + const char *name; + int chain_position; + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln, SmgrChainIndex chain_index); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); + void (*smgr_create) (RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, + bool isRedo, SmgrChainIndex chain_index); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo, SmgrChainIndex chain_index); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync, SmgrChainIndex chain_index); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync, SmgrChainIndex chain_index); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, SmgrChainIndex chain_index); + uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, SmgrChainIndex chain_index); + void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + void **buffers, BlockNumber nblocks, SmgrChainIndex chain_index); + void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync, SmgrChainIndex chain_index); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks, SmgrChainIndex chain_index); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber old_blocks, BlockNumber nblocks, SmgrChainIndex chain_index); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); + void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +} f_smgr; + +extern SMgrId smgr_register(const f_smgr *smgr, Size smgrrelation_size); +extern SMgrId smgr_lookup(const char *name); + +extern f_smgr *smgrsw; + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); @@ -83,7 +145,7 @@ extern void smgrdestroyall(void); extern void smgrrelease(SMgrRelation reln); extern void smgrreleaseall(void); extern void smgrreleaserellocator(RelFileLocatorBackend rlocator); -extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern void smgrcreate(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, @@ -113,6 +175,46 @@ extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +extern void + smgr_open_next(SMgrRelation reln, SmgrChainIndex chain_index); +extern void + smgr_close_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +extern bool + smgr_exists_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +extern void + smgr_create_next(RelFileLocator relold, SMgrRelation reln, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index); +extern void + smgr_immedsync_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +extern void + smgr_unlink_next(SMgrRelation reln, RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo, SmgrChainIndex chain_index); +extern void + smgr_extend_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync, SmgrChainIndex chain_index); +extern void + smgr_zeroextend_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync, SmgrChainIndex chain_index); +extern bool + smgr_prefetch_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, SmgrChainIndex chain_index); +extern uint32 + smgr_maxcombine_next(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, SmgrChainIndex chain_index); +extern void + smgr_readv_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks, SmgrChainIndex chain_index); +extern void + smgr_writev_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync, SmgrChainIndex chain_index); +extern void + smgr_writeback_next(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks, SmgrChainIndex chain_index); +extern BlockNumber + smgr_nblocks_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); +extern void + smgr_truncate_next(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks, SmgrChainIndex chain_index); +extern void + smgr_registersync_next(SMgrRelation reln, ForkNumber forknum, SmgrChainIndex chain_index); + static inline void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 93339ef3c58f..3b04dd39a801 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1632,6 +1632,7 @@ ManyTestResourceKind Material MaterialPath MaterialState +MdSMgrRelationData MdfdVec MdPathStr Memoize @@ -2556,6 +2557,7 @@ SID_IDENTIFIER_AUTHORITY SID_NAME_USE SISeg SIZE_T +SMgrChain SMgrRelation SMgrRelationData SMgrSortArray @@ -3149,6 +3151,8 @@ ViewStmt VirtualTransactionId VirtualTupleTableSlot VolatileFunctionStatus +VolatileRelnEntry +VolatileRelnKey Vsrt WAIT_ORDER WALAvailability