diff --git a/doc/src/sgml/acronyms.sgml b/doc/src/sgml/acronyms.sgml index 58d0d90fece0..2f906e9f018d 100644 --- a/doc/src/sgml/acronyms.sgml +++ b/doc/src/sgml/acronyms.sgml @@ -9,6 +9,15 @@ + + AIO + + + Asynchronous I/O + + + + ACL @@ -354,6 +363,15 @@ + + I/O + + + Input/Output + + + + ISO diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index c0f812e3f5e8..b88cac598e90 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -81,6 +81,31 @@ + + Asynchronous I/O + AIO + + Asynchronous I/O + + + + Asynchronous I/O (AIO) describes + performing I/O in a non-blocking way (asynchronously), + in contrast to synchronous I/O, which blocks for the + entire duration of the I/O. + + + With AIO, starting an I/O operation + is separated from waiting for the result of the operation, allowing + multiple I/O operations to be initiated concurrently, + as well as performing CPU-heavy operations + concurrently with I/O. The price for that increased + concurrency is increased complexity. + + + + + Atomic @@ -938,6 +963,20 @@ + + Input/Output + I/O + + + Input/Output (I/O) describes the communication between + a program and peripheral devices. In the context of database systems, + I/O commonly, but not exclusively, refers to + interaction with storage devices or the network. + + + + + Insert diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 3f5a306247e6..e9a59af8c34b 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -51,6 +51,11 @@ + + pg_aios + In-use asynchronous I/O handles + + pg_available_extensions available extensions @@ -231,6 +236,295 @@ + + <structname>pg_aios</structname> + + + pg_aios + + + + The pg_aios view lists all I/O handles that are currently in use. An I/O handle + is used to reference an I/O operation that is being prepared, executed, or + in the process of completing. pg_aios contains + one row for each I/O handle. + + + + This view is mainly useful for developers of + PostgreSQL, but may also be useful when tuning + PostgreSQL. + + + + <structname>pg_aios</structname> Columns + + + + + Column Type + + + Description + + + + + + + + pid int4 + + + Process ID of the server process that is issuing this I/O. + + + + + + io_id int4 + + + Identifier of the I/O handle. Handles are reused once the I/O + has completed (or if the handle is released before the I/O is started). On reuse, + + pg_aios.io_generation + + is incremented. + + + + + + io_generation int8 + + + Generation of the I/O handle. + + + + + + state text + + + State of the I/O handle: + + + + HANDED_OUT, referenced by code but not yet used + + + + + DEFINED, information necessary for execution is known + + + + + STAGED, ready for execution + + + + + SUBMITTED, submitted for execution + + + + + COMPLETED_IO, finished, but result has not yet been processed + + + + + COMPLETED_SHARED, shared completion processing completed + + + + + COMPLETED_LOCAL, backend local completion processing completed + + + + + + + + + operation text + + + Operation performed using the I/O handle: + + + + invalid, not yet known + + + + + readv, a vectored read + + + + + writev, a vectored write + + + + + + + + + off int8 + + + Offset of the I/O operation. + + + + + + length int8 + + + Length of the I/O operation. + + + + + + target text + + + What kind of object is the I/O targeting: + + + + smgr, I/O on relations + + + + + + + + + handle_data_len int2 + + + Length of the data associated with the I/O operation. 
For I/O to/from + shared buffers and temporary buffers, this indicates the number of buffers the + I/O is operating on. + + + + + + raw_result int4 + + + Low-level result of the I/O operation, or NULL if the operation has not + yet completed. + + + + + + result text + + + High-level result of the I/O operation: + + + + UNKNOWN means that the result of the + operation is not yet known. + + + + + OK means the I/O completed successfully. + + + + + PARTIAL means that the I/O completed without + error, but did not process all data. Commonly callers will need to + retry and perform the remainder of the work in a separate I/O. + + + + + WARNING means that the I/O completed without + error, but that execution of the I/O triggered a warning, e.g., when + encountering a corrupted buffer with zero_damaged_pages enabled. + + + + + ERROR means the I/O failed with an error. + + + + + + + + + target_desc text + + + Description of what the I/O operation is targeting. + + + + + + f_sync bool + + + Flag indicating whether the I/O is executed synchronously. + + + + + + f_localmem bool + + + Flag indicating whether the I/O references process local memory. + + + + + + f_buffered bool + + + Flag indicating whether the I/O is buffered I/O. + + + + + +
+ + + The pg_aios view is read-only. + + + + By default, the pg_aios view can be read only by + superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_available_extensions</structname> diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0d..e554504e1f0d 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -361,8 +361,15 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) BaseInit(); bootstrap_signals(); + + /* need a resowner for IO during BootStrapXLOG() */ + CreateAuxProcessResourceOwner(); + BootStrapXLOG(bootstrap_data_checksum_version); + ReleaseAuxProcessResources(true); + CurrentResourceOwner = NULL; + /* * To ensure that src/common/link-canary.c is linked into the backend, we * must call it from somewhere. Here is as good as anywhere. diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 31d269b7ee0c..64a7240aa772 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1391,3 +1391,10 @@ CREATE VIEW pg_stat_subscription_stats AS CREATE VIEW pg_wait_events AS SELECT * FROM pg_get_wait_events(); + +CREATE VIEW pg_aios AS + SELECT * FROM pg_get_aios(); +REVOKE ALL ON pg_aios FROM PUBLIC; +GRANT SELECT ON pg_aios TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_aios() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_aios() TO pg_read_all_stats; diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..a700b02d5a18 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -620,7 +620,7 @@ CreateAnonymousSegment(Size *size) allocsize += hugepagesize - (allocsize % hugepagesize); ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | mmap_flags, -1, 0); + PG_MMAP_FLAGS | MAP_POPULATE | mmap_flags, -1, 0); mmap_errno = errno; if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 72f5acceec78..6e8801a39e32 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -38,11 +38,13 @@ #include "postmaster/auxprocess.h" #include "postmaster/bgwriter.h" #include "postmaster/interrupt.h" +#include "storage/aio.h" #include "storage/aio_subsys.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/lwlock.h" #include "storage/proc.h" #include "storage/procsignal.h" @@ -90,6 +92,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; + IOQueue *ioq; WritebackContext wb_context; Assert(startup_data_len == 0); @@ -131,6 +134,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(bgwriter_context); + ioq = io_queue_create(128, 0); WritebackContextInit(&wb_context, &bgwriter_flush_after); /* @@ -228,12 +232,22 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) /* Clear any already-pending wakeups */ ResetLatch(MyLatch); + /* + * FIXME: this is theoretically racy, but I didn't want to copy + * ProcessMainLoopInterrupts() remaining body here. + */ + if (ShutdownRequestPending) + { + io_queue_wait_all(ioq); + io_queue_free(ioq); + } + ProcessMainLoopInterrupts(); /* * Do one cycle of dirty-buffer writing. 
*/ - can_hibernate = BgBufferSync(&wb_context); + can_hibernate = BgBufferSync(ioq, &wb_context); /* Report pending statistics to the cumulative stats system */ pgstat_report_bgwriter(); @@ -250,6 +264,9 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) smgrdestroyall(); } + /* finish IO before sleeping, to avoid blocking other backends */ + io_queue_wait_all(ioq); + /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index fda91ffd1ce2..904fe167eb4d 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -49,10 +49,12 @@ #include "postmaster/bgwriter.h" #include "postmaster/interrupt.h" #include "replication/syncrep.h" +#include "storage/aio.h" #include "storage/aio_subsys.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pmsignal.h" @@ -766,7 +768,7 @@ ImmediateCheckpointRequested(void) * fraction between 0.0 meaning none, and 1.0 meaning all done. */ void -CheckpointWriteDelay(int flags, double progress) +CheckpointWriteDelay(IOQueue *ioq, int flags, double progress) { static int absorb_counter = WRITES_PER_ABSORB; @@ -800,6 +802,13 @@ CheckpointWriteDelay(int flags, double progress) /* Report interim statistics to the cumulative stats system */ pgstat_report_checkpointer(); + /* + * Ensure all pending IO is submitted to avoid unnecessary delays for + * other processes. + */ + io_queue_wait_all(ioq); + + /* * This sleep used to be connected to bgwriter_delay, typically 200ms. * That resulted in more frequent wakeups if not much work to do. diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index a8d2e024d344..72c32f5c32e8 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -386,6 +386,12 @@ CreateInitDecodingContext(const char *plugin, slot->data.plugin = plugin_name; SpinLockRelease(&slot->mutex); + if (CurrentResourceOwner == NULL) + { + Assert(am_walsender); + CurrentResourceOwner = AuxProcessResourceOwner; + } + if (XLogRecPtrIsInvalid(restart_lsn)) ReplicationSlotReserveWal(); else diff --git a/src/backend/storage/aio/Makefile b/src/backend/storage/aio/Makefile index c06c50771e02..86fa4276fda9 100644 --- a/src/backend/storage/aio/Makefile +++ b/src/backend/storage/aio/Makefile @@ -11,9 +11,11 @@ include $(top_builddir)/src/Makefile.global OBJS = \ aio.o \ aio_callback.o \ + aio_funcs.o \ aio_init.o \ aio_io.o \ aio_target.o \ + io_queue.o \ method_io_uring.o \ method_sync.o \ method_worker.o \ diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md new file mode 100644 index 000000000000..534aaad22bc3 --- /dev/null +++ b/src/backend/storage/aio/README.md @@ -0,0 +1,451 @@ +# Asynchronous & Direct IO + +## Motivation + +### Why Asynchronous IO + +Until the introduction of asynchronous IO Postgres relied on the operating +system to hide the cost of synchronous IO from Postgres. While this worked +surprisingly well in a lot of workloads, it does not do as good a job on +prefetching and controlled writeback as we would like. + +There are important expensive operations like `fdatasync()` where the operating +system cannot hide the storage latency. 
This is particularly important for WAL +writes, where the ability to asynchronously issue `fdatasync()` or O_DSYNC +writes can yield significantly higher throughput. + + +### Why Direct / unbuffered IO + +The main reasons to want to use Direct IO are: + +- Lower CPU usage / higher throughput. Particularly on modern storage buffered + writes are bottlenecked by the operating system having to copy data from the + kernel's page cache to postgres' buffer pool using the CPU, whereas direct IO + can often move the data directly between the storage devices and postgres' + buffer cache, using DMA. While that transfer is ongoing, the CPU is free to + perform other work. +- Reduced latency - Direct IO can have substantially lower latency than + buffered IO, which can be impactful for OLTP workloads bottlenecked by WAL + write latency. +- Avoiding double buffering between operating system cache and postgres' + shared_buffers. +- Better control over the timing and pace of dirty data writeback. + + +The main reasons *not* to use Direct IO are: + +- Without AIO, Direct IO is unusably slow for most purposes. +- Even with AIO, many parts of postgres need to be modified to perform + explicit prefetching. +- In situations where shared_buffers cannot be set appropriately large, + e.g. because there are many different postgres instances hosted on shared + hardware, performance will often be worse than when using buffered IO. + + +## AIO Usage Example + +In many cases code that can benefit from AIO does not directly have to +interact with the AIO interface, but can use AIO via higher-level +abstractions. See [Helpers](#helpers). + +In this example, a buffer will be read into shared buffers. + +```C +/* + * Result of the operation, only to be accessed in this backend. + */ +PgAioReturn ioret; + +/* + * Acquire an AIO Handle, ioret will get result upon completion. + * + * Note that ioret needs to stay alive until the IO completes or + * CurrentResourceOwner is released (i.e. an error is thrown). + */ +PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret); + +/* + * Reference that can be used to wait for the IO we initiate below. This + * reference can reside in local or shared memory and be waited upon by any + * process. An arbitrary number of references can be made for each IO. + */ +PgAioWaitRef iow; + +pgaio_io_get_wref(ioh, &iow); + +/* + * Arrange for shared buffer completion callbacks to be called upon completion + * of the IO. This callback will update the buffer descriptors associated with + * the AioHandle, which e.g. allows other backends to access the buffer. + * + * A callback can be passed a small bit of data, e.g. to indicate whether to + * zero a buffer if it is invalid. + * + * Multiple completion callbacks can be registered for each handle. + */ +pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); + +/* + * The completion callback needs to know which buffers to update when the IO + * completes. As the AIO subsystem does not know about buffers, we have to + * associate this information with the AioHandle, for use by the completion + * callback registered above. + * + * In this example we're reading only a single buffer, hence the 1. + */ +pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); + +/* + * Pass the AIO handle to lower-level function. When operating on the level of + * buffers, we don't know how exactly the IO is performed, that is the + * responsibility of the storage manager implementation. + * + * E.g. 
md.c needs to translate block numbers into offsets in segments. + * + * Once the IO handle has been handed off to smgrstartreadv(), it may not + * be used further, as the IO may immediately get executed below + * smgrstartreadv() and the handle reused for another IO. + * + * To issue multiple IOs in an efficient way, a caller can call + * pgaio_enter_batchmode() before starting multiple IOs, and end that batch + * with pgaio_exit_batchmode(). Note that one needs to be careful while there + * may be unsubmitted IOs, as another backend may need to wait for one of the + * unsubmitted IOs. If this backend then had to wait for the other backend, + * it'd end in an undetected deadlock. See pgaio_enter_batchmode() for more + * details. + * + * Note that even while in batchmode an IO might get submitted immediately, + * e.g. due to reaching a limit on the number of unsubmitted IOs, and even + * complete before smgrstartreadv() returns. + */ +smgrstartreadv(ioh, operation->smgr, forknum, blkno, + BufferGetBlock(buffer), 1); + +/* + * To benefit from AIO, it is advantageous to perform other work, including + * submitting other IOs, before waiting for the IO to complete. Otherwise + * we could just have used synchronous, blocking IO. + */ +perform_other_work(); + +/* + * We did some other work and now need the IO operation to have completed to + * continue. + */ +pgaio_wref_wait(&iow); + +/* + * At this point the IO has completed. We do not yet know whether it succeeded + * or failed, however. The buffer's state has been updated, which allows other + * backends to use the buffer (if the IO succeeded), or retry the IO (if it + * failed). + * + * Note that in case the IO has failed, a LOG message may have been emitted, + * but no ERROR has been raised. This is crucial, as another backend waiting + * for this IO should not see an ERROR. + * + * To check whether the operation succeeded, and to raise an ERROR, or if more + * appropriate LOG, the PgAioReturn we passed to pgaio_io_acquire() is used. + */ +if (ioret.result.status == PGAIO_RS_ERROR) + pgaio_result_report(ioret.result, &ioret.target_data, ERROR); + +/* + * Besides having succeeded completely, the IO could also have a) partially + * completed or b) succeeded with a warning (e.g. due to zero_damaged_pages). + * If we e.g. tried to read many blocks at once, the read might have + * only succeeded for the first few blocks. + * + * If the IO partially succeeded and this backend needs all blocks to have + * completed, this backend needs to reissue the IO for the remaining buffers. + * The AIO subsystem cannot handle this retry transparently. + * + * As this example is already long, and we only read a single block, we'll just + * error out if there's a partial read or a warning. + */ +if (ioret.result.status != PGAIO_RS_OK) + pgaio_result_report(ioret.result, &ioret.target_data, ERROR); + +/* + * The IO succeeded, so we can use the buffer now. + */ +``` + + +## Design Criteria & Motivation + +### Deadlock and Starvation Dangers due to AIO + +Using AIO in a naive way can easily lead to deadlocks in an environment where +the source/target of AIO are shared resources, like pages in postgres' +shared_buffers. + +Consider one backend performing readahead on a table, initiating IO for a +number of buffers ahead of the current "scan position". If that backend then +performs some operation that blocks, or even just is slow, the IO completion +for the asynchronously initiated read may not be processed. 
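To make the hazard concrete, here is a schematic sketch. It is not code from this patch: the `pgaio_*` / `smgrstartreadv()` calls are the interfaces described in this README, while `reln`, `blkno`, `bufs`, `some_contended_lock` and `iow_for_block` are hypothetical.

```C
/*
 * Backend A: start reads for several blocks ahead of the scan position,
 * then block on an unrelated lock before those reads complete.
 */
for (int i = 0; i < 8; i++)
{
    PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, NULL);

    pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);
    smgrstartreadv(ioh, reln, MAIN_FORKNUM, blkno + i,
                   BufferGetBlock(bufs[i]), 1);
}
LWLockAcquire(some_contended_lock, LW_EXCLUSIVE);	/* A may sleep here */

/*
 * Backend B: waits on one of the buffers whose read A initiated. If
 * completions could only ever be processed by A, B would sleep until A
 * woke up - and if A's lock were held by B, neither would ever wake up.
 */
pgaio_wref_wait(&iow_for_block);
```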
+ +This AIO implementation solves this problem by requiring that AIO methods +either allow AIO completions to be processed by any backend in the system +(e.g. io_uring), or to guarantee that AIO processing will happen even when the +issuing backend is blocked (e.g. worker mode, which offloads completion +processing to the AIO workers). + + +### IO can be started in critical sections + +Using AIO for WAL writes can reduce the overhead of WAL logging substantially: + +- AIO allows starting WAL writes eagerly, so they complete before needing to + wait +- AIO allows multiple WAL flushes to be in progress at the same time +- AIO makes it more realistic to use O\_DIRECT + O\_DSYNC, which can reduce + the number of roundtrips to storage on some OSs and storage HW (buffered IO + and direct IO without O_DSYNC needs to issue a write and after the write's + completion a cache flush, whereas O\_DIRECT + O\_DSYNC can use a single + Force Unit Access (FUA) write). + +The need to be able to execute IO in critical sections has substantial design +implications for the AIO subsystem. Mainly because completing IOs (see prior +section) needs to be possible within a critical section, even if the +to-be-completed IO itself was not issued in a critical section. Consider +e.g. the case of a backend first starting a number of writes from shared +buffers and then starting to flush the WAL. Because only a limited amount of +IO can be in-progress at the same time, initiating IO for flushing the WAL may +require first completing IO that was started earlier. + + +### State for AIO needs to live in shared memory + +Because postgres uses a process model and because AIOs need to be +complete-able by any backend, much of the state of the AIO subsystem needs to +live in shared memory. + +In an `EXEC_BACKEND` build, a backend's executable code and other process +local state is not necessarily mapped to the same addresses in each process +due to ASLR. This means that the shared memory cannot contain pointers to +callbacks. + + +## Design of the AIO Subsystem + + +### AIO Methods + +To achieve portability and performance, multiple methods of performing AIO are +implemented and others are likely worth adding in the future. + + +#### Synchronous Mode + +`io_method=sync` does not actually perform AIO but allows the AIO API to be used +while performing synchronous IO. This can be useful for debugging. The code +for the synchronous mode is also used as a fallback; e.g., the [worker +mode](#worker) uses it to execute IO that cannot be executed by workers. + + +#### Worker + +`io_method=worker` is available on every platform postgres runs on, and +implements asynchronous IO - from the view of the issuing process - by +dispatching the IO to one of several worker processes performing the IO in a +synchronous manner. + + +#### io_uring + +`io_method=io_uring` is available on Linux 5.1+. In contrast to worker mode it +dispatches all IO from within the process, lowering context switch rate / +latency. + + +### AIO Handles + +The central API piece of postgres' AIO abstraction is the AIO handle. To +execute an IO one first has to acquire an IO handle (`pgaio_io_acquire()`) and +then "define" it, i.e. associate an IO operation with the handle. + +Often AIO handles are acquired on a higher level and then passed to a lower +level to be fully defined. E.g., for IO to/from shared buffers, bufmgr.c +routines acquire the handle, which is then passed through smgr.c, md.c to be +finally fully defined in fd.c. 
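In code, that layering looks roughly like the following sketch (condensed to the handle flow; `ioret`, `pages` and `nblocks` are placeholders):

```C
/* bufmgr.c - knows about buffers: acquires the handle ... */
PgAioHandle *ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);
pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0);

/* ... and passes it down, without knowing how the IO will be performed */
smgrstartreadv(ioh, operation->smgr, forknum, blkno, pages, nblocks);
/*  -> smgr.c: dispatches to the storage manager implementation       */
/*  -> md.c:   maps (fork, block) to a segment file and an offset     */
/*  -> fd.c:   fully defines the IO, e.g. via pgaio_io_start_readv()  */
```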
+ +The functions used at the lowest level to define the operation are +`pgaio_io_start_*()`. + +Because acquisition of an IO handle +[must always succeed](#io-can-be-started-in-critical-sections) +and the number of AIO Handles +[has to be limited](#state-for-aio-needs-to-live-in-shared-memory), +AIO handles can be reused as soon as they have completed. Obviously code needs +to be able to react to IO completion. State can be updated using +[AIO Completion callbacks](#aio-callbacks) +and the issuing backend can provide a backend local variable to receive the +result of the IO, as described in +[AIO Result](#aio-results). +An IO can be waited for, by both the issuing and any other backend, using +[AIO References](#aio-wait-references). + + +Because an AIO Handle is not executable just after calling +`pgaio_io_acquire()` and because `pgaio_io_acquire()` needs to always succeed +(absent a PANIC), only a single AIO Handle may be acquired (i.e. returned by +`pgaio_io_acquire()`) without causing the IO to have been defined (by, +potentially indirectly, causing `pgaio_io_start_*()` to have been +called). Otherwise a backend could trivially self-deadlock by using up all AIO +Handles without the ability to wait for some of the IOs to complete. + +If it turns out that an AIO Handle is not needed, e.g., because the handle was +acquired before holding a contended lock, it can be released without being +defined using `pgaio_io_release()`. + + +### AIO Callbacks + +Commonly several layers need to react to completion of an IO. E.g. for a read +md.c needs to check if the IO outright failed or was shorter than needed, +bufmgr.c needs to verify the page looks valid and bufmgr.c needs to update the +BufferDesc to update the buffer's state. + +The fact that several layers / subsystems need to react to IO completion poses +a few challenges: + +- Upper layers should not need to know details of lower layers. E.g. bufmgr.c + should not assume the IO will pass through md.c. Therefore upper levels + cannot know what lower layers would consider an error. + +- Lower layers should not need to know about upper layers. E.g. smgr APIs are + used going through shared buffers but are also used bypassing shared + buffers. This means that e.g. md.c is not in a position to validate + checksums. + +- Having code in the AIO subsystem for every possible combination of layers + would lead to a lot of duplication. + +The "solution" to this is the ability to associate multiple completion +callbacks with a handle. E.g. bufmgr.c can have a callback to update the +BufferDesc state and to verify the page and md.c can have another callback to +check if the IO operation was successful. + +As [mentioned](#state-for-aio-needs-to-live-in-shared-memory), shared memory +currently cannot contain function pointers. Because of that, completion +callbacks are not directly identified by function pointers but by IDs +(`PgAioHandleCallbackID`). A substantial added benefit is that it +allows callbacks to be identified by a much smaller amount of memory (a single +byte currently). + +In addition to completion, AIO callbacks also are called to "stage" an +IO. This is, e.g., used to increase buffer reference counts to account for the +AIO subsystem referencing the buffer, which is required to handle the case +where the issuing backend errors out and releases its own pins while the IO is +still ongoing. + +As [explained earlier](#io-can-be-started-in-critical-sections) IO completions +need to be safe to execute in critical sections. 
To allow the backend that +issued the IO to error out in case of failure [AIO Result](#aio-results) can +be used. + + +### AIO Targets + +In addition to the completion callbacks describe above, each AIO Handle has +exactly one "target". Each target has some space inside an AIO Handle with +information specific to the target and can provide callbacks to allow to +reopen the underlying file (required for worker mode) and to describe the IO +operation (used for debug logging and error messages). + +I.e., if two different uses of AIO can describe the identity of the file being +operated on the same way, it likely makes sense to use the same +target. E.g. different smgr implementations can describe IO with +RelFileLocator, ForkNumber and BlockNumber and can thus share a target. In +contrast, IO for a WAL file would be described with TimeLineID and XLogRecPtr +and it would not make sense to use the same target for smgr and WAL. + + +### AIO Wait References + +As [described above](#aio-handles), AIO Handles can be reused immediately +after completion and therefore cannot be used to wait for completion of the +IO. Waiting is enabled using AIO wait references, which do not just identify +an AIO Handle but also include the handles "generation". + +A reference to an AIO Handle can be acquired using `pgaio_io_get_wref()` and +then waited upon using `pgaio_wref_wait()`. + + +### AIO Results + +As AIO completion callbacks +[are executed in critical sections](#io-can-be-started-in-critical-sections) +and [may be executed by any backend](#deadlock-and-starvation-dangers-due-to-aio) +completion callbacks cannot be used to, e.g., make the query that triggered an +IO ERROR out. + +To allow to react to failing IOs the issuing backend can pass a pointer to a +`PgAioReturn` in backend local memory. Before an AIO Handle is reused the +`PgAioReturn` is filled with information about the IO. This includes +information about whether the IO was successful (as a value of +`PgAioResultStatus`) and enough information to raise an error in case of a +failure (via `pgaio_result_report()`, with the error details encoded in +`PgAioResult`). + + +### AIO Errors + +It would be very convenient to have shared completion callbacks encode the +details of errors as an `ErrorData` that could be raised at a later +time. Unfortunately doing so would require allocating memory. While elog.c can +guarantee (well, kinda) that logging a message will not run out of memory, +that only works because a very limited number of messages are in the process +of being logged. With AIO a large number of concurrently issued AIOs might +fail. + +To avoid the need for preallocating a potentially large amount of memory (in +shared memory no less!), completion callbacks instead have to encode errors in +a more compact format that can be converted into an error message. + + +### AIO Bounce Buffers + +For some uses of AIO there is no convenient memory location as the source / +destination of an AIO. E.g. when data checksums are enabled, writes from +shared buffers currently cannot be done directly from shared buffers, as a +shared buffer lock still allows some modification, e.g., for hint bits (see +`FlushBuffer()`). If the write were done in-place, such modifications can +cause the checksum to fail. + +For synchronous IO this is solved by copying the buffer to separate memory +before computing the checksum and using that copy as the source buffer for the +AIO. 
However, for AIO that is not a workable solution: +- Instead of a single buffer, many buffers are required, as many IOs might be + in flight +- When using the [worker method](#worker), the source/target of IO needs to be + in shared memory, otherwise the workers won't be able to access the memory. + +The AIO subsystem addresses this by providing a limited number of bounce +buffers that can be used as the source / target for IO. A bounce buffer can be +acquired with `pgaio_bounce_buffer_get()` and multiple bounce buffers can be +associated with an AIO Handle with `pgaio_io_assoc_bounce_buffer()`. + +Bounce buffers are automatically released when the IO completes. + + +## Helpers + +Using the low-level AIO API directly all over the tree would introduce too +much complexity. Most uses of AIO should be done via reusable, higher-level +helpers. + + +### Read Stream + +A common and very beneficial use of AIO is reads where a substantial number +of to-be-read locations are known ahead of time. E.g., for a sequential scan +the set of blocks that need to be read can be determined solely by knowing the +current position and checking the buffer mapping table. + +The [Read Stream](../../../include/storage/read_stream.h) interface makes it +comparatively easy to use AIO for such use cases. diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index e3ed087e8a2b..cff48964d076 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -24,6 +24,8 @@ * * - read_stream.c - helper for reading buffered relation data * + * - README.md - higher-level overview over AIO + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -60,6 +62,8 @@ static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation static const char *pgaio_io_state_get_name(PgAioHandleState s); static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation); +static void pgaio_bounce_buffer_wait_for_free(void); + /* Options for io_method. 
*/ const struct config_enum_entry io_method_options[] = { @@ -74,6 +78,7 @@ const struct config_enum_entry io_method_options[] = { /* GUCs */ int io_method = DEFAULT_IO_METHOD; int io_max_concurrency = -1; +int io_bounce_buffers = -1; /* global control for AIO */ PgAioCtl *pgaio_ctl; @@ -660,6 +665,21 @@ pgaio_io_reclaim(PgAioHandle *ioh) if (ioh->state != PGAIO_HS_HANDED_OUT) dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node); + /* reclaim all associated bounce buffers */ + if (!slist_is_empty(&ioh->bounce_buffers)) + { + slist_mutable_iter it; + + slist_foreach_modify(it, &ioh->bounce_buffers) + { + PgAioBounceBuffer *bb = slist_container(PgAioBounceBuffer, node, it.cur); + + slist_delete_current(&it); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + } + } + if (ioh->resowner) { ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); @@ -1044,6 +1064,166 @@ pgaio_submit_staged(void) +/* -------------------------------------------------------------------------------- + * Functions primarily related to PgAioBounceBuffer + * -------------------------------------------------------------------------------- + */ + +PgAioBounceBuffer * +pgaio_bounce_buffer_get(void) +{ + PgAioBounceBuffer *bb = NULL; + slist_node *node; + + if (pgaio_my_backend->handed_out_bb != NULL) + elog(ERROR, "can only hand out one BB"); + + /* + * XXX: It probably is not a good idea to have bounce buffers be per + * backend, that's a fair bit of memory. + */ + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + pgaio_bounce_buffer_wait_for_free(); + } + + node = slist_pop_head_node(&pgaio_my_backend->idle_bbs); + bb = slist_container(PgAioBounceBuffer, node, node); + + pgaio_my_backend->handed_out_bb = bb; + + bb->resowner = CurrentResourceOwner; + ResourceOwnerRememberAioBounceBuffer(bb->resowner, &bb->resowner_node); + + return bb; +} + +void +pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only assign handed out BB"); + pgaio_my_backend->handed_out_bb = NULL; + + /* + * There can be many bounce buffers assigned in case of vectorized IOs. 
+ */ + slist_push_head(&ioh->bounce_buffers, &bb->node); + + /* once associated with an IO, the IO has ownership */ + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +uint32 +pgaio_bounce_buffer_id(PgAioBounceBuffer *bb) +{ + return bb - pgaio_ctl->bounce_buffers; +} + +void +pgaio_bounce_buffer_release(PgAioBounceBuffer *bb) +{ + if (pgaio_my_backend->handed_out_bb != bb) + elog(ERROR, "can only release handed out BB"); + + slist_push_head(&pgaio_my_backend->idle_bbs, &bb->node); + pgaio_my_backend->handed_out_bb = NULL; + + ResourceOwnerForgetAioBounceBuffer(bb->resowner, &bb->resowner_node); + bb->resowner = NULL; +} + +void +pgaio_bounce_buffer_release_resowner(dlist_node *bb_node, bool on_error) +{ + PgAioBounceBuffer *bb = dlist_container(PgAioBounceBuffer, resowner_node, bb_node); + + Assert(bb->resowner); + + if (!on_error) + elog(WARNING, "leaked AIO bounce buffer"); + + pgaio_bounce_buffer_release(bb); +} + +char * +pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb) +{ + return bb->buffer; +} + +static void +pgaio_bounce_buffer_wait_for_free(void) +{ + static uint32 lastpos = 0; + + if (pgaio_my_backend->num_staged_ios > 0) + { + pgaio_debug(DEBUG2, "submitting %d, while acquiring free bb", + pgaio_my_backend->num_staged_ios); + pgaio_submit_staged(); + } + + for (uint32 i = lastpos; i < lastpos + io_max_concurrency; i++) + { + uint32 thisoff = pgaio_my_backend->io_handle_off + (i % io_max_concurrency); + PgAioHandle *ioh = &pgaio_ctl->io_handles[thisoff]; + + switch (ioh->state) + { + case PGAIO_HS_IDLE: + case PGAIO_HS_HANDED_OUT: + continue; + case PGAIO_HS_DEFINED: /* should have been submitted above */ + case PGAIO_HS_STAGED: + elog(ERROR, "shouldn't get here with io:%d in state %d", + pgaio_io_get_id(ioh), ioh->state); + break; + case PGAIO_HS_COMPLETED_IO: + case PGAIO_HS_SUBMITTED: + if (!slist_is_empty(&ioh->bounce_buffers)) + { + pgaio_debug_io(DEBUG2, ioh, + "waiting for IO to reclaim BB with %d in flight", + dclist_count(&pgaio_my_backend->in_flight_ios)); + + /* see comment in pgaio_io_wait_for_free() about raciness */ + pgaio_io_wait(ioh, ioh->generation); + + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + elog(WARNING, "empty after wait"); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + } + break; + case PGAIO_HS_COMPLETED_SHARED: + case PGAIO_HS_COMPLETED_LOCAL: + /* reclaim */ + pgaio_io_reclaim(ioh); + + if (!slist_is_empty(&pgaio_my_backend->idle_bbs)) + { + lastpos = i; + return; + } + break; + } + } + + /* + * The submission above could have caused the IO to complete at any time. 
+ */ + if (slist_is_empty(&pgaio_my_backend->idle_bbs)) + elog(PANIC, "no more bbs"); +} + + + /* -------------------------------------------------------------------------------- * Other * -------------------------------------------------------------------------------- diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index bf42778a48c7..21665e7eccb3 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -41,10 +41,13 @@ static const PgAioHandleCallbacksEntry aio_handle_cbs[] = { CALLBACK_ENTRY(PGAIO_HCB_INVALID, aio_invalid_cb), CALLBACK_ENTRY(PGAIO_HCB_MD_READV, aio_md_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_MD_WRITEV, aio_md_writev_cb), CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_READV, aio_shared_buffer_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_SHARED_BUFFER_WRITEV, aio_shared_buffer_writev_cb), CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_READV, aio_local_buffer_readv_cb), + CALLBACK_ENTRY(PGAIO_HCB_LOCAL_BUFFER_WRITEV, aio_local_buffer_writev_cb), #undef CALLBACK_ENTRY }; diff --git a/src/backend/storage/aio/aio_funcs.c b/src/backend/storage/aio/aio_funcs.c new file mode 100644 index 000000000000..584e683371a3 --- /dev/null +++ b/src/backend/storage/aio/aio_funcs.c @@ -0,0 +1,230 @@ +/*------------------------------------------------------------------------- + * + * aio_funcs.c + * AIO - SQL interface for AIO + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/aio/aio_funcs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "funcapi.h" +#include "nodes/execnodes.h" +#include "port/atomics.h" +#include "storage/aio_internal.h" +#include "storage/lock.h" +#include "storage/proc.h" +#include "storage/procnumber.h" +#include "utils/builtins.h" +#include "utils/fmgrprotos.h" +#include "utils/tuplestore.h" + + +/* + * Byte length of an iovec. + */ +static size_t +iov_byte_length(const struct iovec *iov, int cnt) +{ + size_t len = 0; + + for (int i = 0; i < cnt; i++) + { + len += iov[i].iov_len; + } + + return len; +} + +Datum +pg_get_aios(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + +#define PG_GET_AIOS_COLS 15 + + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *live_ioh = &pgaio_ctl->io_handles[i]; + uint32 ioh_id = pgaio_io_get_id(live_ioh); + Datum values[PG_GET_AIOS_COLS] = {0}; + bool nulls[PG_GET_AIOS_COLS] = {0}; + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandleState start_state; + uint64 start_generation; + PgAioHandle ioh_copy; + struct iovec iov_copy[PG_IOV_MAX]; + + + /* + * There is no lock that could prevent the state of the IO to advance + * concurrently - and we don't want to introduce one, as that would + * introduce atomics into a very common path. Instead we + * + * 1) Determine the state + generation of the IO. + * + * 2) Copy the IO to local memory. + * + * 3) Check if state or generation of the IO changed. If the state + * changed, retry, if the generation changed don't display the IO. + */ + + /* 1) from above */ + start_generation = live_ioh->generation; + + /* + * Retry at this point, so we can accept changing states, but not + * changing generations. 
+ */ +retry: + pg_read_barrier(); + start_state = live_ioh->state; + + if (start_state == PGAIO_HS_IDLE) + continue; + + /* 2) from above */ + memcpy(&ioh_copy, live_ioh, sizeof(PgAioHandle)); + + /* + * Safe to copy even if no iovec is used - we always reserve the + * required space. + */ + memcpy(&iov_copy, &pgaio_ctl->iovecs[ioh_copy.iovec_off], + PG_IOV_MAX * sizeof(struct iovec)); + + /* + * Copy information about owner before 3) below, if the process exited + * it'd have to wait for the IO to finish first, which we would detect + * in 3). + */ + owner = ioh_copy.owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + /* 3) from above */ + pg_read_barrier(); + + /* + * The IO completed and a new one was started with the same ID. Don't + * display it - it really started after this function was called. + * There be a risk of a livelock if we just retried endlessly, if IOs + * complete very quickly. + */ + if (live_ioh->generation != start_generation) + continue; + + /* + * The IO's state changed while we were "rendering" it. Just start + * from scratch. There's no risk of a livelock here, as an IO has a + * limited sets of states it can be in, and state changes go only in a + * single direction. + */ + if (live_ioh->state != start_state) + goto retry; + + /* + * Now that we have copied the IO into local memory and checked that + * it's still in the same state, we are not allowed to access "live" + * memory anymore. To make it slightly easier to catch such cases, set + * the "live" pointers to NULL. + */ + live_ioh = NULL; + owner_proc = NULL; + + + /* column: owning pid */ + if (owner_pid != 0) + values[0] = Int32GetDatum(owner_pid); + else + nulls[0] = false; + + /* column: IO's id */ + values[1] = ioh_id; + + /* column: IO's generation */ + values[2] = Int64GetDatum(start_generation); + + /* column: IO's state */ + values[3] = CStringGetTextDatum(pgaio_io_get_state_name(&ioh_copy)); + + /* + * If the IO is in PGAIO_HS_HANDED_OUT state, none of the following + * fields are valid yet (or are in the process of being set). + * Therefore we don't want to display any other columns. + */ + if (start_state == PGAIO_HS_HANDED_OUT) + { + memset(nulls + 4, 1, (lengthof(nulls) - 4) * sizeof(bool)); + goto display; + } + + /* column: IO's operation */ + values[4] = CStringGetTextDatum(pgaio_io_get_op_name(&ioh_copy)); + + /* columns: details about the IO's operation (offset, length) */ + switch (ioh_copy.op) + { + case PGAIO_OP_INVALID: + nulls[5] = true; + nulls[6] = true; + break; + case PGAIO_OP_READV: + values[5] = Int64GetDatum(ioh_copy.op_data.read.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.read.iov_length)); + break; + case PGAIO_OP_WRITEV: + values[5] = Int64GetDatum(ioh_copy.op_data.write.offset); + values[6] = + Int64GetDatum(iov_byte_length(iov_copy, ioh_copy.op_data.write.iov_length)); + break; + } + + /* column: IO's target */ + values[7] = CStringGetTextDatum(pgaio_io_get_target_name(&ioh_copy)); + + /* column: length of IO's data array */ + values[8] = Int16GetDatum(ioh_copy.handle_data_len); + + /* column: raw result (i.e. 
some form of syscall return value) */ + if (start_state == PGAIO_HS_COMPLETED_IO + || start_state == PGAIO_HS_COMPLETED_SHARED + || start_state == PGAIO_HS_COMPLETED_LOCAL) + values[9] = Int32GetDatum(ioh_copy.result); + else + nulls[9] = true; + + /* + * column: result in the higher level representation (unknown if not + * finished) + */ + values[10] = + CStringGetTextDatum(pgaio_result_status_string(ioh_copy.distilled_result.status)); + + /* column: target description */ + values[11] = CStringGetTextDatum(pgaio_io_get_target_description(&ioh_copy)); + + /* columns: one for each flag */ + values[12] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_SYNCHRONOUS); + values[13] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_REFERENCES_LOCAL); + values[14] = BoolGetDatum(ioh_copy.flags & PGAIO_HF_BUFFERED); + +display: + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index 885c3940c662..95b10933fedb 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -88,6 +88,32 @@ AioHandleDataShmemSize(void) io_max_concurrency)); } +static Size +AioBounceBufferDescShmemSize(void) +{ + Size sz; + + /* PgAioBounceBuffer itself */ + sz = mul_size(sizeof(PgAioBounceBuffer), + mul_size(AioProcs(), io_bounce_buffers)); + + return sz; +} + +static Size +AioBounceBufferDataShmemSize(void) +{ + Size sz; + + /* and the associated buffer */ + sz = mul_size(BLCKSZ, + mul_size(io_bounce_buffers, AioProcs())); + /* memory for alignment */ + sz += BLCKSZ; + + return sz; +} + /* * Choose a suitable value for io_max_concurrency. * @@ -113,6 +139,33 @@ AioChooseMaxConcurrency(void) return Min(max_proportional_pins, 64); } +/* + * Choose a suitable value for io_bounce_buffers. + * + * It's very unlikely to be useful to allocate more bounce buffers for each + * backend than the backend is allowed to pin. Additionally, bounce buffers + * currently are used for writes, and it seems very uncommon for more than 10% + * of shared_buffers to be written out concurrently. + * + * XXX: This quickly can take up significant amounts of memory, the logic + * should probably be fine-tuned. + */ +static int +AioChooseBounceBuffers(void) +{ + uint32 max_backends; + int max_proportional_pins; + + /* Similar logic to LimitAdditionalPins() */ + max_backends = MaxBackends + NUM_AUXILIARY_PROCS; + max_proportional_pins = (NBuffers / 10) / max_backends; + + max_proportional_pins = Max(max_proportional_pins, 1); + + /* apply upper limit */ + return Min(max_proportional_pins, 256); +} + Size AioShmemSize(void) { @@ -136,11 +189,31 @@ AioShmemSize(void) PGC_S_OVERRIDE); } + + /* + * If io_bounce_buffers is -1, we automatically choose a suitable value. + * + * See also comment above. + */ + if (io_bounce_buffers == -1) + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", AioChooseBounceBuffers()); + SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + if (io_bounce_buffers == -1) /* failed to apply it? */ + SetConfigOption("io_bounce_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + sz = add_size(sz, AioCtlShmemSize()); sz = add_size(sz, AioBackendShmemSize()); sz = add_size(sz, AioHandleShmemSize()); sz = add_size(sz, AioHandleIOVShmemSize()); sz = add_size(sz, AioHandleDataShmemSize()); + sz = add_size(sz, AioBounceBufferDescShmemSize()); + sz = add_size(sz, AioBounceBufferDataShmemSize()); /* Reserve space for method specific resources. 
*/ if (pgaio_method_ops->shmem_size) @@ -156,6 +229,9 @@ AioShmemInit(void) uint32 io_handle_off = 0; uint32 iovec_off = 0; uint32 per_backend_iovecs = io_max_concurrency * io_max_combine_limit; + uint32 bounce_buffers_off = 0; + uint32 per_backend_bb = io_bounce_buffers; + char *bounce_buffers_data; pgaio_ctl = (PgAioCtl *) ShmemInitStruct("AioCtl", AioCtlShmemSize(), &found); @@ -167,6 +243,7 @@ AioShmemInit(void) pgaio_ctl->io_handle_count = AioProcs() * io_max_concurrency; pgaio_ctl->iovec_count = AioProcs() * per_backend_iovecs; + pgaio_ctl->bounce_buffers_count = AioProcs() * per_backend_bb; pgaio_ctl->backend_state = (PgAioBackend *) ShmemInitStruct("AioBackend", AioBackendShmemSize(), &found); @@ -179,6 +256,40 @@ AioShmemInit(void) pgaio_ctl->handle_data = (uint64 *) ShmemInitStruct("AioHandleData", AioHandleDataShmemSize(), &found); + pgaio_ctl->bounce_buffers = (PgAioBounceBuffer *) + ShmemInitStruct("AioBounceBufferDesc", AioBounceBufferDescShmemSize(), + &found); + + bounce_buffers_data = + ShmemInitStruct("AioBounceBufferData", AioBounceBufferDataShmemSize(), + &found); + bounce_buffers_data = + (char *) TYPEALIGN(BLCKSZ, (uintptr_t) bounce_buffers_data); + pgaio_ctl->bounce_buffers_data = bounce_buffers_data; + + + /* Initialize IO handles. */ + for (uint64 i = 0; i < pgaio_ctl->io_handle_count; i++) + { + PgAioHandle *ioh = &pgaio_ctl->io_handles[i]; + + ioh->op = PGAIO_OP_INVALID; + ioh->target = PGAIO_TID_INVALID; + ioh->state = PGAIO_HS_IDLE; + + slist_init(&ioh->bounce_buffers); + } + + /* Initialize Bounce Buffers. */ + for (uint64 i = 0; i < pgaio_ctl->bounce_buffers_count; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[i]; + + bb->buffer = bounce_buffers_data; + bounce_buffers_data += BLCKSZ; + } + + for (int procno = 0; procno < AioProcs(); procno++) { PgAioBackend *bs = &pgaio_ctl->backend_state[procno]; @@ -186,9 +297,13 @@ AioShmemInit(void) bs->io_handle_off = io_handle_off; io_handle_off += io_max_concurrency; + bs->bounce_buffers_off = bounce_buffers_off; + bounce_buffers_off += per_backend_bb; + dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); dclist_init(&bs->in_flight_ios); + slist_init(&bs->idle_bbs); /* initialize per-backend IOs */ for (int i = 0; i < io_max_concurrency; i++) @@ -210,6 +325,14 @@ AioShmemInit(void) dclist_push_tail(&bs->idle_ios, &ioh->node); iovec_off += io_max_combine_limit; } + + /* initialize per-backend bounce buffers */ + for (int i = 0; i < per_backend_bb; i++) + { + PgAioBounceBuffer *bb = &pgaio_ctl->bounce_buffers[bs->bounce_buffers_off + i]; + + slist_push_head(&bs->idle_bbs, &bb->node); + } } out: diff --git a/src/backend/storage/aio/io_queue.c b/src/backend/storage/aio/io_queue.c new file mode 100644 index 000000000000..526aa1d5e06d --- /dev/null +++ b/src/backend/storage/aio/io_queue.c @@ -0,0 +1,204 @@ +/*------------------------------------------------------------------------- + * + * io_queue.c + * AIO - Mechanism for tracking many IOs + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/aio/io_queue.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/ilist.h" +#include "storage/aio.h" +#include "storage/io_queue.h" +#include "utils/resowner.h" + + + +typedef struct TrackedIO +{ + PgAioWaitRef iow; + dlist_node node; +} TrackedIO; + 
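/*
 * Example usage of an IOQueue, as an illustrative sketch only:
 * define_write() stands in for whatever defines the IO (e.g. an smgr
 * write); everything else is the io_queue API below.
 *
 *     IOQueue    *ioq = io_queue_create(128, 0);
 *
 *     while (have_more_work)
 *     {
 *         PgAioHandle *ioh = io_queue_acquire_io(ioq);
 *         PgAioWaitRef iow;
 *
 *         pgaio_io_get_wref(ioh, &iow);
 *         define_write(ioh, ...);
 *         io_queue_track(ioq, &iow);
 *     }
 *
 *     io_queue_wait_all(ioq);
 *     io_queue_free(ioq);
 *
 * io_queue_track() submits staged IO in small batches; io_queue_wait_all()
 * submits the remainder and waits for all tracked IOs to complete.
 */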
+struct IOQueue +{ + int depth; + int unsubmitted; + + bool has_reserved; + + dclist_head idle; + dclist_head in_progress; + + TrackedIO tracked_ios[FLEXIBLE_ARRAY_MEMBER]; +}; + + +IOQueue * +io_queue_create(int depth, int flags) +{ + size_t sz; + IOQueue *ioq; + + sz = offsetof(IOQueue, tracked_ios) + + sizeof(TrackedIO) * depth; + + ioq = palloc0(sz); + + ioq->depth = 0; + + for (int i = 0; i < depth; i++) + { + TrackedIO *tio = &ioq->tracked_ios[i]; + + pgaio_wref_clear(&tio->iow); + dclist_push_tail(&ioq->idle, &tio->node); + } + + return ioq; +} + +void +io_queue_wait_one(IOQueue *ioq) +{ + /* submit all pending IO before waiting */ + pgaio_submit_staged(); + + while (!dclist_is_empty(&ioq->in_progress)) + { + /* FIXME: Should we really pop here already? */ + dlist_node *node = dclist_pop_head_node(&ioq->in_progress); + TrackedIO *tio = dclist_container(TrackedIO, node, node); + + pgaio_wref_wait(&tio->iow); + dclist_push_head(&ioq->idle, &tio->node); + } +} + +void +io_queue_reserve(IOQueue *ioq) +{ + if (ioq->has_reserved) + return; + + if (dclist_is_empty(&ioq->idle)) + io_queue_wait_one(ioq); + + Assert(!dclist_is_empty(&ioq->idle)); + + ioq->has_reserved = true; +} + +PgAioHandle * +io_queue_acquire_io(IOQueue *ioq) +{ + PgAioHandle *ioh; + + io_queue_reserve(ioq); + + Assert(!dclist_is_empty(&ioq->idle)); + + if (!io_queue_is_empty(ioq)) + { + ioh = pgaio_io_acquire_nb(CurrentResourceOwner, NULL); + if (ioh == NULL) + { + /* + * Need to wait for all IOs, blocking might not be legal in the + * context. + * + * XXX: This doesn't make a whole lot of sense, we're also + * blocking here. What was I smoking when I wrote the above? + */ + io_queue_wait_all(ioq); + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + } + else + { + ioh = pgaio_io_acquire(CurrentResourceOwner, NULL); + } + + return ioh; +} + +void +io_queue_track(IOQueue *ioq, const struct PgAioWaitRef *iow) +{ + dlist_node *node; + TrackedIO *tio; + + Assert(ioq->has_reserved); + ioq->has_reserved = false; + + Assert(!dclist_is_empty(&ioq->idle)); + + node = dclist_pop_head_node(&ioq->idle); + tio = dclist_container(TrackedIO, node, node); + + tio->iow = *iow; + + dclist_push_tail(&ioq->in_progress, &tio->node); + + ioq->unsubmitted++; + + /* + * XXX: Should have some smarter logic here. We don't want to wait too + * long to submit, that'll mean we're more likely to block. But we also + * don't want to have the overhead of submitting every IO individually. 
+ */ + if (ioq->unsubmitted >= 4) + { + pgaio_submit_staged(); + ioq->unsubmitted = 0; + } +} + +void +io_queue_wait_all(IOQueue *ioq) +{ + /* submit all pending IO before waiting */ + pgaio_submit_staged(); + + while (!dclist_is_empty(&ioq->in_progress)) + { + /* wait for the last IO to minimize unnecessary wakeups */ + dlist_node *node = dclist_tail_node(&ioq->in_progress); + TrackedIO *tio = dclist_container(TrackedIO, node, node); + + if (!pgaio_wref_check_done(&tio->iow)) + { + ereport(DEBUG3, + errmsg("io_queue_wait_all for io:%d", + pgaio_wref_get_id(&tio->iow)), + errhidestmt(true), + errhidecontext(true)); + + pgaio_wref_wait(&tio->iow); + } + + dclist_delete_from(&ioq->in_progress, &tio->node); + dclist_push_head(&ioq->idle, &tio->node); + } +} + +bool +io_queue_is_empty(IOQueue *ioq) +{ + return dclist_is_empty(&ioq->in_progress); +} + +void +io_queue_free(IOQueue *ioq) +{ + io_queue_wait_all(ioq); + + pfree(ioq); +} diff --git a/src/backend/storage/aio/meson.build b/src/backend/storage/aio/meson.build index 2f0f03d80712..270c4a64428b 100644 --- a/src/backend/storage/aio/meson.build +++ b/src/backend/storage/aio/meson.build @@ -3,9 +3,11 @@ backend_sources += files( 'aio.c', 'aio_callback.c', + 'aio_funcs.c', 'aio_init.c', 'aio_io.c', 'aio_target.c', + 'io_queue.c', 'method_io_uring.c', 'method_sync.c', 'method_worker.c', diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 0bcdab14ae7e..c719ba2727a8 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -302,14 +302,41 @@ pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios) return num_staged_ios; } +static void +pgaio_uring_completion_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + /* No need for context if a backend is completing the IO for itself */ + if (ioh->owner_procno == MyProcNumber) + return; + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("completing I/O on behalf of process %d", owner_pid); +} + static void pgaio_uring_drain_locked(PgAioUringContext *context) { int ready; int orig_ready; + ErrorContextCallback errcallback = {0}; Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE)); + errcallback.callback = pgaio_uring_completion_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* * Don't drain more events than available right now. 
Otherwise it's * plausible that one backend could get stuck, for a while, receiving CQEs @@ -337,9 +364,11 @@ pgaio_uring_drain_locked(PgAioUringContext *context) PgAioHandle *ioh; ioh = io_uring_cqe_get_data(cqe); + errcallback.arg = ioh; io_uring_cqe_seen(&context->io_uring_ring, cqe); pgaio_io_process_completion(ioh, cqe->res); + errcallback.arg = NULL; } END_CRIT_SECTION(); @@ -348,6 +377,8 @@ pgaio_uring_drain_locked(PgAioUringContext *context) "drained %d/%d, now expecting %d", ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring)); } + + error_context_stack = errcallback.previous; } static void diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 4a7853d13fac..31d94ac82c54 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -357,11 +357,33 @@ pgaio_worker_register(void) on_shmem_exit(pgaio_worker_die, 0); } +static void +pgaio_worker_error_callback(void *arg) +{ + ProcNumber owner; + PGPROC *owner_proc; + int32 owner_pid; + PgAioHandle *ioh = arg; + + if (!ioh) + return; + + Assert(ioh->owner_procno != MyProcNumber); + Assert(MyBackendType == B_IO_WORKER); + + owner = ioh->owner_procno; + owner_proc = GetPGProcByNumber(owner); + owner_pid = owner_proc->pid; + + errcontext("I/O worker executing I/O on behalf of process %d", owner_pid); +} + void IoWorkerMain(const void *startup_data, size_t startup_data_len) { sigjmp_buf local_sigjmp_buf; PgAioHandle *volatile error_ioh = NULL; + ErrorContextCallback errcallback = {0}; volatile int error_errno = 0; char cmd[128]; @@ -388,6 +410,10 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) sprintf(cmd, "%d", MyIoWorkerId); set_ps_display(cmd); + errcallback.callback = pgaio_worker_error_callback; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + /* see PostgresMain() */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { @@ -471,6 +497,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) ioh = &pgaio_ctl->io_handles[io_index]; error_ioh = ioh; + errcallback.arg = ioh; pgaio_debug_io(DEBUG4, ioh, "worker %d processing IO", @@ -511,6 +538,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) pgaio_io_perform_synchronously(ioh); RESUME_INTERRUPTS(); + errcallback.arg = NULL; } else { @@ -522,6 +550,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) CHECK_FOR_INTERRUPTS(); } + error_context_stack = errcallback.previous; proc_exit(0); } diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 36c54fb695b0..cec93129f582 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -404,6 +404,30 @@ read_stream_start_pending_read(ReadStream *stream) static void read_stream_look_ahead(ReadStream *stream) { + /* + * Batch-submitting multiple IOs is more efficient than doing so + * one-by-one. If we just ramp up to the max, we'll only be allowed to + * submit one io_combine_limit sized IO. Defer submitting IO in that case. + * + * FIXME: This needs better heuristics. 
+ */ +#if 1 + if (!stream->sync_mode && stream->distance > (io_combine_limit * 8)) + { + if (stream->pinned_buffers + stream->pending_read_nblocks > ((stream->distance * 3) / 4)) + { +#if 0 + ereport(LOG, + errmsg("reduce reduce reduce: pinned: %d, pending: %d, distance: %d", + stream->pinned_buffers, + stream->pending_read_nblocks, + stream->distance)); +#endif + return; + } + } +#endif + /* * Allow amortizing the cost of submitting IO over multiple IOs. This * requires that we don't do any operations that could lead to a deadlock diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f9681d09e1e7..d9a362d75539 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -52,6 +52,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "storage/fd.h" +#include "storage/io_queue.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" @@ -76,6 +77,7 @@ /* Bits in SyncOneBuffer's return value */ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 +#define BUF_CANT_MERGE 0x04 #define RELS_BSEARCH_THRESHOLD 20 @@ -515,13 +517,7 @@ static void UnpinBuffer(BufferDesc *buf); static void UnpinBufferNoOwner(BufferDesc *buf); static void BufferSync(int flags); static uint32 WaitBufHdrUnlocked(BufferDesc *buf); -static int SyncOneBuffer(int buf_id, bool skip_recently_used, - WritebackContext *wb_context); static void WaitIO(BufferDesc *buf); -static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); -static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, - uint32 set_flag_bits, bool forget_owner, - bool release_aio); static void AbortBufferIO(Buffer buffer); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); @@ -536,6 +532,7 @@ static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_c static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context); + static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, @@ -3325,6 +3322,57 @@ UnpinBufferNoOwner(BufferDesc *buf) } } +typedef struct BuffersToWrite +{ + int nbuffers; + BufferTag start_at_tag; + uint32 max_combine; + + XLogRecPtr max_lsn; + + PgAioHandle *ioh; + PgAioWaitRef iow; + + uint64 total_writes; + + Buffer buffers[IOV_MAX]; + PgAioBounceBuffer *bounce_buffers[IOV_MAX]; + const void *data_ptrs[IOV_MAX]; +} BuffersToWrite; + +static int PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf, + bool skip_recently_used, + IOQueue *ioq, WritebackContext *wb_context); + +static void WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context); + +static void +BuffersToWriteInit(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + to_write->total_writes = 0; + to_write->nbuffers = 0; + to_write->ioh = NULL; + pgaio_wref_clear(&to_write->iow); + to_write->max_lsn = InvalidXLogRecPtr; + + pgaio_enter_batchmode(); +} + +static void +BuffersToWriteEnd(BuffersToWrite *to_write) +{ + if (to_write->ioh != NULL) + { + pgaio_io_release(to_write->ioh); + to_write->ioh = NULL; + } + + pgaio_exit_batchmode(); +} + + #define ST_SORT sort_checkpoint_bufferids #define ST_ELEMENT_TYPE CkptSortItem #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) @@ -3356,7 +3404,10 @@ BufferSync(int flags) binaryheap *ts_heap; int 
i;
	int			mask = BM_DIRTY;
+	IOQueue    *ioq;
 	WritebackContext wb_context;
+	BuffersToWrite to_write;
+	int			max_combine;
 
 	/*
 	 * Unless this is a shutdown checkpoint or we have been explicitly told,
@@ -3418,7 +3469,9 @@ BufferSync(int flags)
 	if (num_to_scan == 0)
 		return;					/* nothing to do */
 
+	ioq = io_queue_create(512, 0);
 	WritebackContextInit(&wb_context, &checkpoint_flush_after);
+	max_combine = Min(io_bounce_buffers, io_combine_limit);
 
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
 
@@ -3526,48 +3579,91 @@
 	 */
 	num_processed = 0;
 	num_written = 0;
+
+	BuffersToWriteInit(&to_write, ioq, &wb_context);
+
 	while (!binaryheap_empty(ts_heap))
 	{
 		BufferDesc *bufHdr = NULL;
 		CkptTsStatus *ts_stat = (CkptTsStatus *)
 			DatumGetPointer(binaryheap_first(ts_heap));
+		bool		batch_continue = true;
 
-		buf_id = CkptBufferIds[ts_stat->index].buf_id;
-		Assert(buf_id != -1);
-
-		bufHdr = GetBufferDescriptor(buf_id);
-
-		num_processed++;
+		Assert(ts_stat->num_scanned <= ts_stat->num_to_scan);
 
 		/*
-		 * We don't need to acquire the lock here, because we're only looking
-		 * at a single bit. It's possible that someone else writes the buffer
-		 * and clears the flag right after we check, but that doesn't matter
-		 * since SyncOneBuffer will then do nothing. However, there is a
-		 * further race condition: it's conceivable that between the time we
-		 * examine the bit here and the time SyncOneBuffer acquires the lock,
-		 * someone else not only wrote the buffer but replaced it with another
-		 * page and dirtied it. In that improbable case, SyncOneBuffer will
-		 * write the buffer though we didn't need to. It doesn't seem worth
-		 * guarding against this, though.
+		 * Collect a batch of buffers to write out from the current
+		 * tablespace. That causes some imbalance between the tablespaces, but
+		 * that's more than outweighed by the efficiency gain due to batching.
 		 */
-		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+		while (batch_continue &&
+			   to_write.nbuffers < max_combine &&
+			   ts_stat->num_scanned < ts_stat->num_to_scan)
 		{
-			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+			buf_id = CkptBufferIds[ts_stat->index].buf_id;
+			Assert(buf_id != -1);
+
+			bufHdr = GetBufferDescriptor(buf_id);
+
+			num_processed++;
+
+			/*
+			 * We don't need to acquire the lock here, because we're only
+			 * looking at a single bit. It's possible that someone else writes
+			 * the buffer and clears the flag right after we check, but that
+			 * doesn't matter since PrepareToWriteBuffer will then do nothing.
+			 * However, there is a further race condition: it's conceivable
+			 * that between the time we examine the bit here and the time
+			 * PrepareToWriteBuffer acquires the lock, someone else not only
+			 * wrote the buffer but replaced it with another page and dirtied
+			 * it. In that improbable case, PrepareToWriteBuffer will write
+			 * the buffer though we didn't need to. It doesn't seem worth
+			 * guarding against this, though.
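+			 *
+			 * (With batched writes there is a second line of defense:
+			 * PrepareToWriteBuffer re-checks, after pinning the buffer,
+			 * whether the write can still be merged; see CanMergeWrite().)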
+			 */
+			if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
 			{
-				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
-				PendingCheckpointerStats.buffers_written++;
-				num_written++;
+				int			result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+														  ioq, &wb_context);
+
+				if (result & BUF_CANT_MERGE)
+				{
+					Assert(to_write.nbuffers > 0);
+					WriteBuffers(&to_write, ioq, &wb_context);
+
+					result = PrepareToWriteBuffer(&to_write, buf_id + 1, false,
+												  ioq, &wb_context);
+					Assert(result != BUF_CANT_MERGE);
+				}
+
+				if (result & BUF_WRITTEN)
+				{
+					TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
+					PendingCheckpointerStats.buffers_written++;
+					num_written++;
+				}
+				else
+				{
+					batch_continue = false;
+				}
 			}
+			else
+			{
+				if (to_write.nbuffers > 0)
+					WriteBuffers(&to_write, ioq, &wb_context);
+			}
+
+			/*
+			 * Measure progress independent of actually having to flush the
+			 * buffer - otherwise writing becomes unbalanced.
+			 */
+			ts_stat->progress += ts_stat->progress_slice;
+			ts_stat->num_scanned++;
+			ts_stat->index++;
 		}
 
-		/*
-		 * Measure progress independent of actually having to flush the buffer
-		 * - otherwise writing become unbalanced.
-		 */
-		ts_stat->progress += ts_stat->progress_slice;
-		ts_stat->num_scanned++;
-		ts_stat->index++;
+		if (to_write.nbuffers > 0)
+			WriteBuffers(&to_write, ioq, &wb_context);
+
 		/* Have all the buffers from the tablespace been processed? */
 		if (ts_stat->num_scanned == ts_stat->num_to_scan)
@@ -3585,15 +3681,23 @@
 		 *
 		 * (This will check for barrier events even if it doesn't sleep.)
 		 */
-		CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
+		CheckpointWriteDelay(ioq, flags, (double) num_processed / num_to_scan);
 	}
 
+	Assert(to_write.nbuffers == 0);
+	io_queue_wait_all(ioq);
+
 	/*
 	 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
 	 * IOContext will always be IOCONTEXT_NORMAL.
 	 */
 	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
 
+	io_queue_wait_all(ioq);		/* IssuePendingWritebacks might have added
+								 * more */
+	io_queue_free(ioq);
+
+	BuffersToWriteEnd(&to_write);
+
 	pfree(per_ts_stat);
 	per_ts_stat = NULL;
 	binaryheap_free(ts_heap);
@@ -3619,7 +3723,7 @@
  * bgwriter_lru_maxpages to 0.)
  */
 bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSync(IOQueue *ioq, WritebackContext *wb_context)
 {
 	/* info obtained from freelist.c */
 	int			strategy_buf_id;
@@ -3662,6 +3766,9 @@ BgBufferSync(WritebackContext *wb_context)
 	long		new_strategy_delta;
 	uint32		new_recent_alloc;
 
+	BuffersToWrite to_write;
+	int			max_combine;
+
 	/*
 	 * Find out where the freelist clock sweep currently is, and how many
 	 * buffer allocations have happened since our last call.
@@ -3682,6 +3789,8 @@ BgBufferSync(WritebackContext *wb_context)
 		return true;
 	}
 
+	max_combine = Min(io_bounce_buffers, io_combine_limit);
+
 	/*
 	 * Compute strategy_delta = how many buffers have been scanned by the
	 * clock sweep since last time. If first time through, assume none. Then
@@ -3838,11 +3947,25 @@
 	num_written = 0;
 	reusable_buffers = reusable_buffers_est;
 
+	BuffersToWriteInit(&to_write, ioq, wb_context);
+
 	/* Execute the LRU scan */
 	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
 	{
-		int			sync_state = SyncOneBuffer(next_to_clean, true,
-											   wb_context);
+		int			sync_state;
+
+		sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1,
+										  true, ioq, wb_context);
+		if (sync_state & BUF_CANT_MERGE)
+		{
+			Assert(to_write.nbuffers > 0);
+
+			WriteBuffers(&to_write, ioq, wb_context);
+
+			sync_state = PrepareToWriteBuffer(&to_write, next_to_clean + 1,
+											  true, ioq, wb_context);
+			Assert(sync_state != BUF_CANT_MERGE);
+		}
 
 		if (++next_to_clean >= NBuffers)
 		{
@@ -3853,6 +3976,13 @@
 
 		if (sync_state & BUF_WRITTEN)
 		{
+			Assert(sync_state & BUF_REUSABLE);
+
+			if (to_write.nbuffers == max_combine)
+			{
+				WriteBuffers(&to_write, ioq, wb_context);
+			}
+
 			reusable_buffers++;
 			if (++num_written >= bgwriter_lru_maxpages)
 			{
@@ -3864,6 +3994,11 @@
 			reusable_buffers++;
 	}
 
+	if (to_write.nbuffers > 0)
+		WriteBuffers(&to_write, ioq, wb_context);
+
+	BuffersToWriteEnd(&to_write);
+
 	PendingBgWriterStats.buf_written_clean += num_written;
 
 #ifdef BGW_DEBUG
@@ -3902,8 +4037,66 @@
 	return (bufs_to_lap == 0 && recent_alloc == 0);
 }
 
+static inline bool
+BufferTagsSameRel(const BufferTag *tag1, const BufferTag *tag2)
+{
+	return (tag1->spcOid == tag2->spcOid) &&
+		(tag1->dbOid == tag2->dbOid) &&
+		(tag1->relNumber == tag2->relNumber) &&
+		(tag1->forkNum == tag2->forkNum)
+		;
+}
+
+static bool
+CanMergeWrite(BuffersToWrite *to_write, BufferDesc *cur_buf_hdr)
+{
+	BlockNumber cur_block = cur_buf_hdr->tag.blockNum;
+
+	Assert(to_write->nbuffers > 0); /* can't merge with nothing */
+	Assert(to_write->start_at_tag.relNumber != InvalidOid);
+	Assert(to_write->start_at_tag.blockNum != InvalidBlockNumber);
+
+	Assert(to_write->ioh != NULL);
+
+	/*
+	 * First check if the block number is one that we could actually merge;
+	 * that's cheaper than checking the tablespace/db/relnumber/fork match.
+	 */
+	if (to_write->start_at_tag.blockNum + to_write->nbuffers != cur_block)
+		return false;
+
+	if (!BufferTagsSameRel(&to_write->start_at_tag, &cur_buf_hdr->tag))
+		return false;
+
+	/*
+	 * Need to check with smgr how large a write we're allowed to make. To
+	 * reduce the overhead of the smgr check, only inquire once, when
+	 * processing the first to-be-merged buffer. That avoids the overhead in
+	 * the common case of writing out buffers that are definitely not
+	 * mergeable.
+	 */
+	if (to_write->nbuffers == 1)
+	{
+		SMgrRelation smgr;
+
+		smgr = smgropen(BufTagGetRelFileLocator(&to_write->start_at_tag), INVALID_PROC_NUMBER);
+
+		to_write->max_combine = smgrmaxcombine(smgr,
+											   to_write->start_at_tag.forkNum,
+											   to_write->start_at_tag.blockNum);
+	}
+	else
+	{
+		Assert(to_write->max_combine > 0);
+	}
+
+	if (to_write->start_at_tag.blockNum + to_write->max_combine <= cur_block)
+		return false;
+
+	return true;
+}
+
 /*
- * SyncOneBuffer -- process a single buffer during syncing.
+ * PrepareToWriteBuffer -- process a single buffer during syncing.
  *
  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
  * buffers marked recently used, as these are not replacement candidates.
  *
@@ -3912,22 +4105,50 @@ BgBufferSync(WritebackContext *wb_context)
 *	BUF_WRITTEN: we wrote the buffer.
* BUF_REUSABLE: buffer is available for replacement, ie, it has * pin count 0 and usage count 0. + * BUF_CANT_MERGE: can't combine this write with prior writes, caller needs + * to issue those first * * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean * after locking it, but we don't care all that much.) */ static int -SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) +PrepareToWriteBuffer(BuffersToWrite *to_write, Buffer buf, + bool skip_recently_used, + IOQueue *ioq, WritebackContext *wb_context) { - BufferDesc *bufHdr = GetBufferDescriptor(buf_id); - int result = 0; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(buf - 1); uint32 buf_state; - BufferTag tag; + int result = 0; + XLogRecPtr cur_buf_lsn; + LWLock *content_lock; + bool may_block; + + /* + * Check if this buffer can be written out together with already prepared + * writes. We check before we have pinned the buffer, so the buffer can be + * written out and replaced between this check and us pinning the buffer - + * we'll recheck below. The reason for the pre-check is that we don't want + * to pin the buffer just to find out that we can't merge the IO. + */ + if (to_write->nbuffers != 0) + { + if (!CanMergeWrite(to_write, cur_buf_hdr)) + { + result |= BUF_CANT_MERGE; + return result; + } + } + else + { + to_write->start_at_tag = cur_buf_hdr->tag; + } /* Make sure we can handle the pin */ ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); + /* XXX: Should also check if we are allowed to pin one more buffer */ + /* * Check whether buffer needs writing. * @@ -3937,7 +4158,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) * don't worry because our checkpoint.redo points before log record for * upcoming changes and so we are not required to write such dirty buffer. */ - buf_state = LockBufHdr(bufHdr); + buf_state = LockBufHdr(cur_buf_hdr); if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 && BUF_STATE_GET_USAGECOUNT(buf_state) == 0) @@ -3946,40 +4167,300 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) } else if (skip_recently_used) { +#if 0 + elog(LOG, "at block %d: skip recent with nbuffers %d", + cur_buf_hdr->tag.blockNum, to_write->nbuffers); +#endif /* Caller told us not to write recently-used buffers */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(cur_buf_hdr, buf_state); return result; } if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY)) { /* It's clean, so nothing to do */ - UnlockBufHdr(bufHdr, buf_state); + UnlockBufHdr(cur_buf_hdr, buf_state); return result; } + /* pin the buffer, from now on its identity can't change anymore */ + PinBuffer_Locked(cur_buf_hdr); + /* - * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the - * buffer is clean by the time we've locked it.) + * Acquire IO, if needed, now that it's likely that we'll need to write. */ - PinBuffer_Locked(bufHdr); - LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + if (to_write->ioh == NULL) + { + /* otherwise we should already have acquired a handle */ + Assert(to_write->nbuffers == 0); - FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + to_write->ioh = io_queue_acquire_io(ioq); + pgaio_io_get_wref(to_write->ioh, &to_write->iow); + } - LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + /* + * If we are merging, check if the buffer's identity possibly changed + * while we hadn't yet pinned it. 
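+	 *
+	 * (Once we hold the pin the buffer's tag can no longer change, so if
+	 * the re-check succeeds it stays valid until we unpin.)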
+	 *
+	 * XXX: It might be worth checking if we still want to write the buffer
+	 * out, e.g. it could have been replaced with a buffer that doesn't have
+	 * BM_CHECKPOINT_NEEDED set.
+	 */
+	if (to_write->nbuffers != 0)
+	{
+		if (!CanMergeWrite(to_write, cur_buf_hdr))
+		{
+			elog(LOG, "changed identity");
+			UnpinBuffer(cur_buf_hdr);
-	tag = bufHdr->tag;
+			result |= BUF_CANT_MERGE;
-	UnpinBuffer(bufHdr);
+			return result;
+		}
+	}
+
+	may_block = to_write->nbuffers == 0
+		&& !pgaio_have_staged()
+		&& io_queue_is_empty(ioq)
+		;
+	content_lock = BufferDescriptorGetContentLock(cur_buf_hdr);
+
+	if (!may_block)
+	{
+		if (LWLockConditionalAcquire(content_lock, LW_SHARED))
+		{
+			/* done */
+		}
+		else if (to_write->nbuffers == 0)
+		{
+			/*
+			 * Need to wait for all prior IO to finish before blocking for
+			 * lock acquisition, to avoid the risk of a deadlock due to us
+			 * waiting for another backend that is waiting for our
+			 * unsubmitted IO to complete.
+			 */
+			pgaio_submit_staged();
+			io_queue_wait_all(ioq);
+
+			elog(DEBUG2, "at block %u: can't block, nbuffers = 0",
+				 cur_buf_hdr->tag.blockNum
+				);
+
+			may_block = to_write->nbuffers == 0
+				&& !pgaio_have_staged()
+				&& io_queue_is_empty(ioq)
+				;
+			Assert(may_block);
+
+			LWLockAcquire(content_lock, LW_SHARED);
+		}
+		else
+		{
+			elog(DEBUG2, "at block %d: can't block, nbuffers = %d",
+				 cur_buf_hdr->tag.blockNum,
+				 to_write->nbuffers);
+
+			UnpinBuffer(cur_buf_hdr);
+			result |= BUF_CANT_MERGE;
+			Assert(to_write->nbuffers > 0);
+
+			return result;
+		}
+	}
+	else
+	{
+		LWLockAcquire(content_lock, LW_SHARED);
+	}
+
+	if (!may_block)
+	{
+		if (!StartBufferIO(cur_buf_hdr, false, !may_block))
+		{
+			pgaio_submit_staged();
+			io_queue_wait_all(ioq);
+
+			may_block = io_queue_is_empty(ioq) && to_write->nbuffers == 0 && !pgaio_have_staged();
+
+			if (!StartBufferIO(cur_buf_hdr, false, !may_block))
+			{
+				elog(DEBUG2, "at block %d: non-waitable StartBufferIO returns false, %d",
+					 cur_buf_hdr->tag.blockNum,
+					 may_block);
+
+				/*
+				 * FIXME: can't tell whether this is because the buffer has
+				 * been cleaned
+				 */
+				if (!may_block)
+				{
+					result |= BUF_CANT_MERGE;
+					Assert(to_write->nbuffers > 0);
+				}
+				LWLockRelease(content_lock);
+				UnpinBuffer(cur_buf_hdr);
+
+				return result;
+			}
+		}
+	}
+	else
+	{
+		if (!StartBufferIO(cur_buf_hdr, false, false))
+		{
+			elog(DEBUG2, "waitable StartBufferIO returns false");
+			LWLockRelease(content_lock);
+			UnpinBuffer(cur_buf_hdr);
+
+			/*
+			 * FIXME: Historically we returned BUF_WRITTEN in this case,
+			 * which seems wrong
+			 */
+			return result;
+		}
+	}
 
 	/*
-	 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
-	 * IOContext will always be IOCONTEXT_NORMAL.
+	 * Run PageGetLSN while holding header lock, since we don't have the
+	 * buffer locked exclusively in all cases.
 	 */
-	ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
+	buf_state = LockBufHdr(cur_buf_hdr);
 
-	return result | BUF_WRITTEN;
+	cur_buf_lsn = BufferGetLSN(cur_buf_hdr);
+
+	/* To check if block content changes while flushing.
- vadim 01/17/97 */ + buf_state &= ~BM_JUST_DIRTIED; + + UnlockBufHdr(cur_buf_hdr, buf_state); + + to_write->buffers[to_write->nbuffers] = buf; + to_write->nbuffers++; + + if (buf_state & BM_PERMANENT && + (to_write->max_lsn == InvalidXLogRecPtr || to_write->max_lsn < cur_buf_lsn)) + { + to_write->max_lsn = cur_buf_lsn; + } + + result |= BUF_WRITTEN; + + return result; +} + +static void +WriteBuffers(BuffersToWrite *to_write, + IOQueue *ioq, WritebackContext *wb_context) +{ + SMgrRelation smgr; + Buffer first_buf; + BufferDesc *first_buf_hdr; + bool needs_checksum; + + Assert(to_write->nbuffers > 0 && to_write->nbuffers <= io_combine_limit); + + first_buf = to_write->buffers[0]; + first_buf_hdr = GetBufferDescriptor(first_buf - 1); + + smgr = smgropen(BufTagGetRelFileLocator(&first_buf_hdr->tag), INVALID_PROC_NUMBER); + + /* + * Force XLOG flush up to buffer's LSN. This implements the basic WAL + * rule that log updates must hit disk before any of the data-file changes + * they describe do. + * + * However, this rule does not apply to unlogged relations, which will be + * lost after a crash anyway. Most unlogged relation pages do not bear + * LSNs since we never emit WAL records for them, and therefore flushing + * up through the buffer LSN would be useless, but harmless. However, + * GiST indexes use LSNs internally to track page-splits, and therefore + * unlogged GiST pages bear "fake" LSNs generated by + * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake + * LSN counter could advance past the WAL insertion point; and if it did + * happen, attempting to flush WAL through that location would fail, with + * disastrous system-wide consequences. To make sure that can't happen, + * skip the flush if the buffer isn't permanent. + */ + if (to_write->max_lsn != InvalidXLogRecPtr) + XLogFlush(to_write->max_lsn); + + /* + * Now it's safe to write the buffer to disk. Note that no one else should + * have been able to write it, while we were busy with log flushing, + * because we got the exclusive right to perform I/O by setting the + * BM_IO_IN_PROGRESS bit. + */ + + for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++) + { + Buffer cur_buf = to_write->buffers[nbuf]; + BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1); + Block bufBlock; + char *bufToWrite; + + bufBlock = BufHdrGetBlock(cur_buf_hdr); + needs_checksum = PageNeedsChecksumCopy((Page) bufBlock); + + /* + * Update page checksum if desired. Since we have only shared lock on + * the buffer, other processes might be updating hint bits in it, so + * we must copy the page to a bounce buffer if we do checksumming. 
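+		 * Computing the checksum directly on the shared page while hint
+		 * bits may be changing underneath us could leave an on-disk
+		 * checksum that does not match the written bytes; cf.
+		 * PageSetChecksumCopy().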
+		 */
+		if (needs_checksum)
+		{
+			PgAioBounceBuffer *bb = pgaio_bounce_buffer_get();
+
+			pgaio_io_assoc_bounce_buffer(to_write->ioh, bb);
+
+			bufToWrite = pgaio_bounce_buffer_buffer(bb);
+			memcpy(bufToWrite, bufBlock, BLCKSZ);
+			PageSetChecksumInplace((Page) bufToWrite, cur_buf_hdr->tag.blockNum);
+		}
+		else
+		{
+			bufToWrite = bufBlock;
+		}
+
+		to_write->data_ptrs[nbuf] = bufToWrite;
+	}
+
+	pgaio_io_set_handle_data_32(to_write->ioh,
+								(uint32 *) to_write->buffers,
+								to_write->nbuffers);
+	pgaio_io_register_callbacks(to_write->ioh, PGAIO_HCB_SHARED_BUFFER_WRITEV, 0);
+
+	smgrstartwritev(to_write->ioh, smgr,
+					BufTagGetForkNum(&first_buf_hdr->tag),
+					first_buf_hdr->tag.blockNum,
+					to_write->data_ptrs,
+					to_write->nbuffers,
+					false);
+	pgstat_count_io_op(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+					   IOOP_WRITE, 1, BLCKSZ * to_write->nbuffers);
+
+
+	for (int nbuf = 0; nbuf < to_write->nbuffers; nbuf++)
+	{
+		Buffer		cur_buf = to_write->buffers[nbuf];
+		BufferDesc *cur_buf_hdr = GetBufferDescriptor(cur_buf - 1);
+
+		UnpinBuffer(cur_buf_hdr);
+	}
+
+	io_queue_track(ioq, &to_write->iow);
+	to_write->total_writes++;
+
+	/* clear state for next write */
+	to_write->nbuffers = 0;
+	to_write->start_at_tag.relNumber = InvalidOid;
+	to_write->start_at_tag.blockNum = InvalidBlockNumber;
+	to_write->max_combine = 0;
+	to_write->max_lsn = InvalidXLogRecPtr;
+	to_write->ioh = NULL;
+	pgaio_wref_clear(&to_write->iow);
+
+	/*
+	 * FIXME: Implement issuing writebacks (note wb_context isn't used here).
+	 * Possibly needs to be integrated with io_queue.c.
+	 */
+}
 
 /*
@@ -4353,6 +4834,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	error_context_stack = errcallback.previous;
 }
 
+
 /*
  * RelationGetNumberOfBlocksInFork
  *	Determines the current number of pages in the specified relation fork.
@@ -5540,7 +6022,15 @@ LockBuffer(Buffer buffer, int mode)
 	else if (mode == BUFFER_LOCK_SHARE)
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
+	{
+		/*
+		 * FIXME: Wait for AIO writes, otherwise there would be a risk of
+		 * deadlock. This isn't entirely trivial to do in a race-free way; IO
+		 * could be started between us checking whether there is IO and
+		 * enqueueing ourselves for the lock.
+		 */
 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+	}
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -5555,6 +6045,19 @@ ConditionalLockBuffer(Buffer buffer)
 {
 	BufferDesc *buf;
 
+	/*
+	 * FIXME: Wait for AIO writes. Some code does not deal well with
+	 * ConditionalLockBuffer() continuously failing, e.g.
+	 * spginsert()->spgdoinsert() ends up busy-looping (easily reproducible by
+	 * just making this function always fail and running the regression
+	 * tests). While that code could be fixed, it'd be hard to find all
+	 * problematic places.
+	 *
+	 * It would be OK to wait for the IO as waiting for IO completion does not
+	 * need to wait for any locks that could lead to an undetected deadlock or
+	 * such.
+	 */
+
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
 		return true;			/* act as though we got it */
@@ -5618,10 +6121,8 @@ LockBufferForCleanup(Buffer buffer)
 	CheckBufferIsPinnedOnce(buffer);
 
 	/*
-	 * We do not yet need to be worried about in-progress AIOs holding a pin,
-	 * as we, so far, only support doing reads via AIO and this function can
-	 * only be called once the buffer is valid (i.e. no read can be in
-	 * flight).
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
 	 */
 
 	/* Nobody else to wait for */
@@ -5634,6 +6135,11 @@
 	{
 		uint32		buf_state;
 
+		/*
+		 * FIXME: LockBuffer()'s handling of in-progress writes (once
+		 * implemented) should suffice to deal with deadlock risk.
+		 */
+
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		buf_state = LockBufHdr(bufHdr);
@@ -5781,7 +6287,13 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: Should wait for IO for the same reason as in
+	 * ConditionalLockBuffer(). Needs to happen before the
+	 * ConditionalLockBuffer() call below, as we'd never reach the
+	 * ConditionalLockBuffer() call due to the buffer pin held for the
+	 * duration of the IO.
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -5838,7 +6350,10 @@ IsBufferCleanupOK(Buffer buffer)
 
 	Assert(BufferIsValid(buffer));
 
-	/* see AIO related comment in LockBufferForCleanup() */
+	/*
+	 * FIXME: See AIO related comments in LockBuffer() and
+	 * ConditionalLockBuffer()
+	 */
 
 	if (BufferIsLocal(buffer))
 	{
@@ -5962,7 +6477,7 @@ WaitIO(BufferDesc *buf)
 * find out if they can perform the I/O as part of a larger operation, without
 * waiting for the answer or distinguishing the reasons why not.
 */
-static bool
+bool
 StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 {
 	uint32		buf_state;
@@ -6019,7 +6534,7 @@
 * resource owner. (forget_owner=false is used when the resource owner itself
 * is being released)
 */
-static void
+void
 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
 				  bool forget_owner, bool release_aio)
 {
@@ -7144,12 +7659,129 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
 			affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
 }
 
+/*
+ * Helper for AIO writev completion callbacks, supporting both shared and temp
+ * buffers. Gets called once for each buffer in a multi-page write.
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete_one(uint8 buf_off, Buffer buffer, uint8 flags,
+						   bool failed, bool is_temp)
+{
+	BufferDesc *buf_hdr = is_temp ?
+		GetLocalBufferDescriptor(-buffer - 1)
+		: GetBufferDescriptor(buffer - 1);
+	PgAioResult result = {.status = PGAIO_RS_OK};
+	bool		clear_dirty;
+	uint32		set_flag_bits;
+
+#ifdef USE_ASSERT_CHECKING
+	{
+		uint32		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+		Assert(buf_state & BM_VALID);
+		Assert(buf_state & BM_TAG_VALID);
+		/* temp buffers don't use BM_IO_IN_PROGRESS */
+		if (!is_temp)
+			Assert(buf_state & BM_IO_IN_PROGRESS);
+		Assert(buf_state & BM_DIRTY);
+	}
+#endif
+
+	clear_dirty = failed ? false : true;
+	set_flag_bits = failed ? BM_IO_ERROR : 0;
+
+	if (is_temp)
+		TerminateLocalBufferIO(buf_hdr, clear_dirty, set_flag_bits, true);
+	else
+		TerminateBufferIO(buf_hdr, clear_dirty, set_flag_bits, false, true);
+
+	/*
+	 * The initiator of the IO is not managing the lock (it called
+	 * LWLockDisown()); we are.
+	 */
+	if (!is_temp)
+		LWLockReleaseDisowned(BufferDescriptorGetContentLock(buf_hdr),
+							  LW_SHARED);
+
+	/* FIXME: tracepoint */
+
+	return result;
+}
+
+/*
+ * Perform completion handling of a single AIO write. This write may cover
+ * multiple blocks / buffers.
+ *
+ * Shared between shared and local buffers, to reduce code duplication.
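+ *
+ * As with the readv counterpart, the constant is_temp argument is expected
+ * to be constant-folded away - hence pg_attribute_always_inline.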
+ */
+static pg_attribute_always_inline PgAioResult
+buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result,
+					   uint8 cb_data, bool is_temp)
+{
+	PgAioResult result = prior_result;
+	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
+	uint64	   *io_data;
+	uint8		handle_data_len;
+
+	if (is_temp)
+	{
+		Assert(td->smgr.is_temp);
+		Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
+	}
+	else
+		Assert(!td->smgr.is_temp);
+
+	/*
+	 * Iterate over all the buffers affected by this IO and call the
+	 * appropriate per-buffer completion function for each buffer.
+	 */
+	io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
+	for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
+	{
+		Buffer		buf = io_data[buf_off];
+		PgAioResult buf_result;
+		bool		failed;
+
+		Assert(BufferIsValid(buf));
+
+		/*
+		 * If the entire IO failed at a lower level, each buffer needs to be
+		 * marked as failed. In case of a partial write, some buffers may be
+		 * ok.
+		 */
+		failed =
+			prior_result.status == PGAIO_RS_ERROR
+			|| prior_result.result <= buf_off;
+
+		buf_result = buffer_writev_complete_one(buf_off, buf, cb_data, failed,
+												is_temp);
+
+		/*
+		 * If there wasn't any prior error and the IO for this page failed in
+		 * some form, set the whole IO's result to the page's result.
+		 */
+		if (result.status != PGAIO_RS_ERROR && buf_result.status != PGAIO_RS_OK)
+		{
+			result = buf_result;
+			pgaio_result_report(result, td, LOG);
+		}
+	}
+
+	return result;
+}
+
 static void
 shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 {
 	buffer_stage_common(ioh, false, false);
 }
 
+static void
+shared_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data)
+{
+	buffer_stage_common(ioh, true, false);
+}
+
 static PgAioResult
 shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 							 uint8 cb_data)
@@ -7195,6 +7827,13 @@ shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
 	return prior_result;
 }
 
+static PgAioResult
+shared_buffer_writev_complete(PgAioHandle *ioh, PgAioResult prior_result,
+							  uint8 cb_data)
+{
+	return buffer_writev_complete(ioh, prior_result, cb_data, false);
+}
+
 static void
 local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
 {
@@ -7208,6 +7847,17 @@ local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
 	return buffer_readv_complete(ioh, prior_result, cb_data, true);
 }
 
+static void
+local_buffer_writev_stage(PgAioHandle *ioh, uint8 cb_data)
+{
+	/*
+	 * Currently this is unreachable as the only write support is for
+	 * checkpointer / bgwriter, which don't deal with local buffers.
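+	 * If local-buffer writes are ever issued asynchronously, this will
+	 * presumably need to mirror shared_buffer_writev_stage() and call
+	 * buffer_stage_common(ioh, true, true).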
+ */ + elog(ERROR, "should be unreachable"); +} + + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .stage = shared_buffer_readv_stage, @@ -7217,6 +7867,11 @@ const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { .report = buffer_readv_report, }; +const PgAioHandleCallbacks aio_shared_buffer_writev_cb = { + .stage = shared_buffer_writev_stage, + .complete_shared = shared_buffer_writev_complete, +}; + /* readv callback is passed READ_BUFFERS_* flags as callback data */ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .stage = local_buffer_readv_stage, @@ -7230,3 +7885,7 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +const PgAioHandleCallbacks aio_local_buffer_writev_cb = { + .stage = local_buffer_writev_stage, +}; diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 336715b6c634..b72a5957a206 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -555,7 +555,12 @@ GetAccessStrategy(BufferAccessStrategyType btype) return NULL; case BAS_BULKREAD: - ring_size_kb = 256; + + /* + * FIXME: Temporary increase to allow large enough streaming reads + * to actually benefit from AIO. This needs a better solution. + */ + ring_size_kb = 2 * 1024; break; case BAS_BULKWRITE: ring_size_kb = 16 * 1024; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index bf89076bb109..ed56202af14f 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -57,7 +57,6 @@ static int NLocalPinnedBuffers = 0; static void InitLocalBuffers(void); static Block GetLocalBufferStorage(void); static Buffer GetLocalVictimBuffer(void); -static void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); /* @@ -597,7 +596,7 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit * * See also InvalidateBuffer(). 
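+ *
+ * No longer static, presumably so that the new AIO write paths and the
+ * test_aio module can invalidate a local buffer directly.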
 */
-static void
+void
 InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
 {
 	Buffer		buffer = BufferDescriptorGetBuffer(bufHdr);
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e8299dd5564..2138d47dab93 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2348,6 +2348,34 @@ FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
 	return returnCode;
 }
 
+int
+FileStartWriteV(PgAioHandle *ioh, File file,
+				int iovcnt, off_t offset,
+				uint32 wait_event_info)
+{
+	int			returnCode;
+	Vfd		   *vfdP;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileStartWriteV: %d (%s) " INT64_FORMAT " %d",
+			   file, VfdCache[file].fileName,
+			   (int64) offset,
+			   iovcnt));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	vfdP = &VfdCache[file];
+
+	/* FIXME: think about / reimplement temp_file_limit */
+
+	pgaio_io_start_writev(ioh, vfdP->fd, iovcnt, offset);
+
+	return 0;
+}
+
 int
 FileSync(File file, uint32 wait_event_info)
 {
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 82457bacc62f..d4bbd2cdf00e 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -1490,6 +1490,16 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 	return true;
 }
 
+bool
+PageNeedsChecksumCopy(Page page)
+{
+	if (PageIsNew(page))
+		return false;
+
+	/* otherwise a copy is needed only if checksums are enabled */
+	return DataChecksumsEnabled();
+}
+
 /*
  * Set checksum for a page in shared buffers.
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index b55892b94051..bacfe5f121ba 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -155,12 +155,19 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
 static void md_readv_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
+static PgAioResult md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
+static void md_writev_report(PgAioResult result, const PgAioTargetData *target_data, int elevel);
 
 const PgAioHandleCallbacks aio_md_readv_cb = {
 	.complete_shared = md_readv_complete,
 	.report = md_readv_report,
 };
 
+const PgAioHandleCallbacks aio_md_writev_cb = {
+	.complete_shared = md_writev_complete,
+	.report = md_writev_report,
+};
+
 
 static inline int
 _mdfd_open_flags(void)
@@ -910,9 +917,30 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 * is ON or we are InRecovery, we should instead return zeroes
 				 * without complaining. This allows, for example, the case of
 				 * trying to update a block that was later truncated away.
+				 *
+				 * NB: We think that this codepath is unreachable in recovery
+				 * and incomplete with zero_damaged_pages, as missing segments
+				 * are not created. Putting blocks into the buffer pool that
+				 * do not exist on disk is rather problematic, as they will
+				 * not be found by scans that rely on smgrnblocks(), since
+				 * they are beyond EOF. It also can cause weird problems with
+				 * relation extension, as relation extension does not expect
+				 * blocks beyond EOF to exist.
+				 *
+				 * Therefore we do not want to copy the logic into
+				 * mdstartreadv(), where it would have to be more complicated
+				 * due to potential differences in the zero_damaged_pages
+				 * setting between the definer and completor of IO.
+				 *
+				 * For PG 18, we are putting an Assert(false) into
+				 * mdreadv() (triggering failures in assertion-enabled builds,
+				 * but continuing to work in production builds). Afterwards we
+				 * plan to remove this code entirely.
 				 */
 				if (zero_damaged_pages || InRecovery)
 				{
+					Assert(false);	/* see comment above */
+
 					for (BlockNumber i = transferred_this_segment / BLCKSZ;
 						 i < nblocks_this_segment;
 						 ++i)
@@ -1007,6 +1035,13 @@ mdstartreadv(PgAioHandle *ioh,
 	/*
 	 * The error checks corresponding to the post-read checks in mdreadv() are
 	 * in md_readv_complete().
+	 *
+	 * However, we chose, at least for now, not to implement the
+	 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv(),
+	 * that logic is rather problematic, and we want to get rid of it. Here
+	 * equivalent logic would have to be more complicated due to potential
+	 * differences in the zero_damaged_pages setting between the definer and
+	 * completor of IO.
 	 */
 }
 
@@ -1115,6 +1150,64 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	}
 }
 
+/*
+ * mdstartwritev() -- Asynchronous version of mdwritev().
+ */
+void
+mdstartwritev(PgAioHandle *ioh,
+			  SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			  const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+	off_t		seekpos;
+	MdfdVec    *v;
+	BlockNumber nblocks_this_segment;
+	struct iovec *iov;
+	int			iovcnt;
+	int			ret;
+
+	v = _mdfd_getseg(reln, forknum, blocknum, false,
+					 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+	nblocks_this_segment =
+		Min(nblocks,
+			RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+
+	if (nblocks_this_segment != nblocks)
+		elog(ERROR, "write crossing segment boundary");
+
+	iovcnt = pgaio_io_get_iovec(ioh, &iov);
+
+	Assert(nblocks <= iovcnt);
+
+	iovcnt = buffers_to_iovec(iov, unconstify(void **, buffers), nblocks_this_segment);
+
+	Assert(iovcnt <= nblocks_this_segment);
+
+	if (!(io_direct_flags & IO_DIRECT_DATA))
+		pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
+
+	pgaio_io_set_target_smgr(ioh,
+							 reln,
+							 forknum,
+							 blocknum,
+							 nblocks,
+							 skipFsync);
+	pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_WRITEV, 0);
+
+	ret = FileStartWriteV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
+	if (ret != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not start writing blocks %u..%u in file \"%s\": %m",
+						blocknum,
+						blocknum + nblocks_this_segment - 1,
+						FilePathName(v->mdfd_vfd))));
+}
+
 
 /*
  * mdwriteback() -- Tell the kernel to write pages back to storage.
@@ -1503,6 +1596,40 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	}
 }
 
+/*
+ * Like register_dirty_segment(), except for use by AIO. In the completion
+ * callback we don't have access to the MdfdVec (the completion callback might
+ * be executed in a different backend than the issuing backend), therefore we
+ * have to implement this slightly differently.
+ */
+static void
+register_dirty_segment_aio(RelFileLocator locator, ForkNumber forknum, uint64 segno)
+{
+	FileTag		tag;
+
+	INIT_MD_FILETAG(tag, locator, forknum, segno);
+
+	/*
+	 * Can't block here waiting for checkpointer to accept our sync request,
+	 * as checkpointer might be waiting for this AIO to finish if offloaded to
+	 * a worker.
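+	 *
+	 * Hence retryOnError = false below: if the request queue is full we
+	 * fall back to performing the fsync ourselves, via mdsyncfiletag().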
+	 */
+	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
+	{
+		char		path[MAXPGPATH];
+
+		ereport(DEBUG1,
+				(errmsg_internal("could not forward fsync request because request queue is full")));
+
+		/* reuse mdsyncfiletag() to avoid duplicating code */
+		if (mdsyncfiletag(&tag, path))
+			ereport(data_sync_elevel(ERROR),
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							path)));
+	}
+}
+
 /*
  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
  */
@@ -1953,7 +2080,7 @@ md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 		 * might not process the query result immediately (because it is busy
 		 * doing another part of query processing) or at all (e.g. if it was
 		 * cancelled or errored out due to another IO also failing). The
-		 * issuer of the IO will emit an ERROR when processing the IO's
+		 * definer of the IO will emit an ERROR when processing the IO's
 		 * results
 		 */
 		pgaio_result_report(result, td, LOG_SERVER_ONLY);
@@ -2037,3 +2164,103 @@ md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
 						   td->smgr.nblocks * (size_t) BLCKSZ));
 	}
 }
+
+/*
+ * AIO completion callback for mdstartwritev().
+ */
+static PgAioResult
+md_writev_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
+{
+	PgAioTargetData *td = pgaio_io_get_target_data(ioh);
+	PgAioResult result = prior_result;
+
+	if (prior_result.result < 0)
+	{
+		result.status = PGAIO_RS_ERROR;
+		result.id = PGAIO_HCB_MD_WRITEV;
+		/* For "hard" errors, track the error number in error_data */
+		result.error_data = -prior_result.result;
+		result.result = 0;
+
+		pgaio_result_report(result, td, LOG);
+
+		return result;
+	}
+
+	/*
+	 * As explained above smgrstartwritev(), the smgr API operates on the
+	 * level of blocks, rather than bytes. Convert.
+	 */
+	result.result /= BLCKSZ;
+
+	Assert(result.result <= td->smgr.nblocks);
+
+	if (result.result == 0)
+	{
+		/* consider 0 blocks written a failure */
+		result.status = PGAIO_RS_ERROR;
+		result.id = PGAIO_HCB_MD_WRITEV;
+		result.error_data = 0;
+
+		pgaio_result_report(result, td, LOG);
+
+		return result;
+	}
+
+	if (result.status != PGAIO_RS_ERROR &&
+		result.result < td->smgr.nblocks)
+	{
+		/* partial writes should be retried at upper level */
+		result.status = PGAIO_RS_PARTIAL;
+		result.id = PGAIO_HCB_MD_WRITEV;
+	}
+
+	if (!td->smgr.skip_fsync)
+		register_dirty_segment_aio(td->smgr.rlocator, td->smgr.forkNum,
+								   td->smgr.blockNum / ((BlockNumber) RELSEG_SIZE));
+
+	return result;
+}
+
+/*
+ * AIO error reporting callback for mdstartwritev().
+ */
+static void
+md_writev_report(PgAioResult result, const PgAioTargetData *td, int elevel)
+{
+	RelPathStr	path;
+
+	path = relpathbackend(td->smgr.rlocator,
+						  td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
+						  td->smgr.forkNum);
+
+	if (result.error_data != 0)
+	{
+		errno = result.error_data;	/* for errcode_for_file_access() */
+
+		ereport(elevel,
+				errcode_for_file_access(),
+				errmsg("could not write blocks %u..%u in file \"%s\": %m",
+					   td->smgr.blockNum,
+					   td->smgr.blockNum + td->smgr.nblocks - 1,
+					   path.str)
+			);
+	}
+	else
+	{
+		/*
+		 * NB: This will typically only be output in debug messages, while
+		 * retrying a partial IO.
+ */ + ereport(elevel, + errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not write blocks %u..%u in file \"%s\": wrote only %zu of %zu bytes", + td->smgr.blockNum, + td->smgr.blockNum + td->smgr.nblocks - 1, + path.str, + result.result * (size_t) BLCKSZ, + td->smgr.nblocks * (size_t) BLCKSZ + ) + ); + } +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 320dc04d83ae..85daf686e08a 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -115,6 +115,11 @@ typedef struct f_smgr BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); + void (*smgr_startwritev) (PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); @@ -142,6 +147,7 @@ static const f_smgr smgrsw[] = { .smgr_readv = mdreadv, .smgr_startreadv = mdstartreadv, .smgr_writev = mdwritev, + .smgr_startwritev = mdstartwritev, .smgr_writeback = mdwriteback, .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, @@ -738,6 +744,14 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * blocks not successfully read might bear unspecified modifications, up to * the full nblocks). This maintains the abstraction that smgr operates on the * level of blocks, rather than bytes. + * + * Compared to smgrreadv(), more responsibilities fall on the caller: + * - Partial reads need to be handle by the caller re-issuing IO for the + * unread blocks + * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are + * responsible for pgaio_result_report() to mirror that news to the user (if + * the IO results in PGAIO_RS_WARNING) or abort the (sub)transaction (if + * PGAIO_RS_ERROR). */ void smgrstartreadv(PgAioHandle *ioh, @@ -787,6 +801,29 @@ smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, RESUME_INTERRUPTS(); } +/* + * smgrstartwritev() -- asynchronous version of smgrwritev() + * + * This starts an asynchronous writev IO using the IO handle `ioh`. Other than + * `ioh` all parameters are the same as smgrwritev(). + * + * Completion callbacks above smgr will be passed the result as the number of + * successfully written blocks if the write [partially] succeeds. This + * maintains the abstraction that smgr operates on the level of blocks, rather + * than bytes. + */ +void +smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) +{ + HOLD_INTERRUPTS(); + smgrsw[reln->smgr_which].smgr_startwritev(ioh, + reln, forknum, blocknum, buffers, + nblocks, skipFsync); + RESUME_INTERRUPTS(); +} + /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of * blocks. diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 7958ea11b735..222e24bcb08b 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -785,8 +785,12 @@ InitPostgres(const char *in_dbname, Oid dboid, * We don't yet have an aux-process resource owner, but StartupXLOG * and ShutdownXLOG will need one. Hence, create said resource owner * (and register a callback to clean it up after ShutdownXLOG runs). + * + * In bootstrap mode CreateAuxProcessResourceOwner() was already + * called in BootstrapModeMain(). 
 	 */
-	CreateAuxProcessResourceOwner();
+	if (!bootstrap)
+		CreateAuxProcessResourceOwner();
 
 	StartupXLOG();
 
 	/* Release (and warn about) any buffer pins leaked in StartupXLOG */
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 4eaeca89f2c7..bd49b302293c 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3293,6 +3293,19 @@ struct config_int ConfigureNamesInt[] =
 		check_io_max_concurrency, NULL, NULL
 	},
 
+	{
+		{"io_bounce_buffers",
+			PGC_POSTMASTER,
+			RESOURCES_IO,
+			gettext_noop("Number of IO bounce buffers reserved for each backend."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&io_bounce_buffers,
+		-1, -1, 4096,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"io_workers",
 			PGC_SIGHUP,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ff56a1f0732c..2c6456e907f5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -211,6 +211,8 @@
 					# -1 sets based on shared_buffers
 					# (change requires restart)
 #io_workers = 3				# 1-32;
+#io_bounce_buffers = -1			# -1 sets based on shared_buffers
+					# (change requires restart)
 
 # - Worker Processes -
 
diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c
index d39f3e1b655c..81e7e27965a6 100644
--- a/src/backend/utils/resowner/resowner.c
+++ b/src/backend/utils/resowner/resowner.c
@@ -159,10 +159,11 @@ struct ResourceOwnerData
 	LOCALLOCK  *locks[MAX_RESOWNER_LOCKS];	/* list of owned locks */
 
 	/*
-	 * AIO handles need be registered in critical sections and therefore
-	 * cannot use the normal ResourceElem mechanism.
+	 * AIO handles & bounce buffers need to be registered in critical
+	 * sections and therefore cannot use the normal ResourceElem mechanism.
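+	 *
+	 * (dlist manipulation does not allocate memory, which is what makes
+	 * these lists usable in critical sections.)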
 	 */
 	dlist_head	aio_handles;
+	dlist_head	aio_bounce_buffers;
 };
 
@@ -434,6 +435,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name)
 	}
 
 	dlist_init(&owner->aio_handles);
+	dlist_init(&owner->aio_bounce_buffers);
 
 	return owner;
 }
@@ -742,6 +744,13 @@ ResourceOwnerReleaseInternal(ResourceOwner owner,
 
 			pgaio_io_release_resowner(node, !isCommit);
 		}
+
+		while (!dlist_is_empty(&owner->aio_bounce_buffers))
+		{
+			dlist_node *node = dlist_head_node(&owner->aio_bounce_buffers);
+
+			pgaio_bounce_buffer_release_resowner(node, !isCommit);
+		}
 	}
 	else if (phase == RESOURCE_RELEASE_LOCKS)
 	{
@@ -1111,3 +1120,15 @@ ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
 {
 	dlist_delete_from(&owner->aio_handles, ioh_node);
 }
+
+void
+ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node)
+{
+	dlist_push_tail(&owner->aio_bounce_buffers, bb_node);
+}
+
+void
+ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node)
+{
+	dlist_delete_from(&owner->aio_bounce_buffers, bb_node);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 8b68b16d79da..d9c41fa426b3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12493,4 +12493,13 @@
   proargtypes => 'int4',
   prosrc => 'gist_stratnum_common' },
 
+# AIO related functions
+{ oid => '9200', descr => 'information about in-progress asynchronous IOs',
+  proname => 'pg_get_aios', prorows => '100', proretset => 't',
+  provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int4,int4,int8,text,text,int8,int8,text,int2,int4,text,text,bool,bool,bool}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}',
+  prosrc => 'pg_get_aios' },
+
 ]
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 800ecbfd13b3..a8081d411b68 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -31,7 +31,8 @@ pg_noreturn extern void BackgroundWriterMain(const void *startup_data, size_t st
 pg_noreturn extern void CheckpointerMain(const void *startup_data, size_t startup_data_len);
 
 extern void RequestCheckpoint(int flags);
-extern void CheckpointWriteDelay(int flags, double progress);
+struct IOQueue;
+extern void CheckpointWriteDelay(struct IOQueue *ioq, int flags, double progress);
 
 extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
 
diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h
index 9fe9d9ad9fa4..72d5680e767e 100644
--- a/src/include/storage/aio.h
+++ b/src/include/storage/aio.h
@@ -194,13 +194,16 @@ typedef enum PgAioHandleCallbackID
 	PGAIO_HCB_INVALID = 0,
 
 	PGAIO_HCB_MD_READV,
+	PGAIO_HCB_MD_WRITEV,
 
 	PGAIO_HCB_SHARED_BUFFER_READV,
+	PGAIO_HCB_SHARED_BUFFER_WRITEV,
 
 	PGAIO_HCB_LOCAL_BUFFER_READV,
+	PGAIO_HCB_LOCAL_BUFFER_WRITEV,
 } PgAioHandleCallbackID;
 
-#define PGAIO_HCB_MAX	PGAIO_HCB_LOCAL_BUFFER_READV
+#define PGAIO_HCB_MAX	PGAIO_HCB_LOCAL_BUFFER_WRITEV
 StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS),
 				 "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS");
 
@@ -352,6 +355,20 @@
 extern bool pgaio_have_staged(void);
 
 
+/* --------------------------------------------------------------------------------
+ * Bounce Buffers
+ * --------------------------------------------------------------------------------
+ */
+
+extern PgAioBounceBuffer *pgaio_bounce_buffer_get(void);
+extern void pgaio_io_assoc_bounce_buffer(PgAioHandle *ioh, PgAioBounceBuffer *bb);
+extern uint32 pgaio_bounce_buffer_id(PgAioBounceBuffer *bb);
+extern void pgaio_bounce_buffer_release(PgAioBounceBuffer *bb);
+extern char *pgaio_bounce_buffer_buffer(PgAioBounceBuffer *bb);
+extern void pgaio_bounce_buffer_release_resowner(struct dlist_node *bb_node, bool on_error);
+
+
 /* --------------------------------------------------------------------------------
  * Other
  * --------------------------------------------------------------------------------
@@ -364,6 +381,7 @@ extern void pgaio_closing_fd(int fd);
 /* GUCs */
 extern PGDLLIMPORT int io_method;
 extern PGDLLIMPORT int io_max_concurrency;
+extern PGDLLIMPORT int io_bounce_buffers;
 
 #endif							/* AIO_H */
diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h
index 7f18da2c8565..833f97361a1f 100644
--- a/src/include/storage/aio_internal.h
+++ b/src/include/storage/aio_internal.h
@@ -127,6 +127,12 @@ struct PgAioHandle
 	/* raw result of the IO operation */
 	int32		result;
 
+	/*
+	 * List of bounce_buffers owned by IO. It would suffice to use an
+	 * index-based linked list here.
+	 */
+	slist_head	bounce_buffers;
+
 	/**
 	 * In which list the handle is registered, depends on the state:
 	 * - IDLE, in per-backend list
@@ -182,11 +188,24 @@
 };
 
 
+/* typedef is in aio_types.h */
+struct PgAioBounceBuffer
+{
+	slist_node	node;
+	struct ResourceOwnerData *resowner;
+	dlist_node	resowner_node;
+	char	   *buffer;
+};
+
+
 typedef struct PgAioBackend
 {
 	/* index into PgAioCtl->io_handles */
 	uint32		io_handle_off;
 
+	/* index into PgAioCtl->bounce_buffers */
+	uint32		bounce_buffers_off;
+
 	/* IO Handles that currently are not used */
 	dclist_head idle_ios;
 
@@ -217,6 +236,12 @@
 	 * IOs being appended at the end.
 	 */
 	dclist_head in_flight_ios;
+
+	/* Bounce Buffers that currently are not used */
+	slist_head	idle_bbs;
+
+	/* see handed_out_io */
+	PgAioBounceBuffer *handed_out_bb;
 } PgAioBackend;
 
 
@@ -244,6 +269,15 @@ typedef struct PgAioCtl
 	uint32		io_handle_count;
 	PgAioHandle *io_handles;
+
+	/*
+	 * To perform AIO on pages that cannot be used directly from shared
+	 * memory (either because the buffer does not reside in shared memory or
+	 * because we need to operate on a copy, as is e.g.
the case for writes when checksums are in use) + */ + uint32 bounce_buffers_count; + PgAioBounceBuffer *bounce_buffers; + char *bounce_buffers_data; } PgAioCtl; diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h index 181833660778..3c18dade49cb 100644 --- a/src/include/storage/aio_types.h +++ b/src/include/storage/aio_types.h @@ -134,4 +134,6 @@ typedef struct PgAioReturn } PgAioReturn; +typedef struct PgAioBounceBuffer PgAioBounceBuffer; + #endif /* AIO_TYPES_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 72b36a4af26f..45c2b70b7360 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -21,6 +21,8 @@ #include "storage/buf.h" #include "storage/bufmgr.h" #include "storage/condition_variable.h" +#include "storage/io_queue.h" +#include "storage/latch.h" #include "storage/lwlock.h" #include "storage/procnumber.h" #include "storage/shmem.h" @@ -434,6 +436,12 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag); +/* solely to make it easier to write tests */ +extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait); +extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, + bool forget_owner, bool release_aio); + + /* freelist.c */ extern IOContext IOContextForStrategy(BufferAccessStrategy strategy); extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, @@ -478,6 +486,7 @@ extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio); extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait); extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln); +extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); extern void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f2192ceb2719..89e0ca11288b 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -174,7 +174,9 @@ extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; extern const PgAioHandleCallbacks aio_shared_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_shared_buffer_writev_cb; extern const PgAioHandleCallbacks aio_local_buffer_readv_cb; +extern const PgAioHandleCallbacks aio_local_buffer_writev_cb; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -295,7 +297,8 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); -extern bool BgBufferSync(struct WritebackContext *wb_context); +struct IOQueue; +extern bool BgBufferSync(struct IOQueue *ioq, struct WritebackContext *wb_context); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index aeb67c498c59..000a3ab23f90 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -507,5 +507,6 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize); extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); extern void PageSetChecksumInplace(Page page, BlockNumber blkno); +extern bool PageNeedsChecksumCopy(Page 
page); #endif /* BUFPAGE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index b77d8e5e30e3..2cc7c5a47617 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -112,6 +112,7 @@ extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event extern ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileStartReadV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); +extern int FileStartWriteV(struct PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info); extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); diff --git a/src/include/storage/io_queue.h b/src/include/storage/io_queue.h new file mode 100644 index 000000000000..92b1e9afe6f5 --- /dev/null +++ b/src/include/storage/io_queue.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * io_queue.h + * Mechanism for tracking many IOs + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/io_queue.h + * + *------------------------------------------------------------------------- + */ +#ifndef IO_QUEUE_H +#define IO_QUEUE_H + +#include "storage/aio_types.h" + +struct IOQueue; +typedef struct IOQueue IOQueue; + +struct PgAioWaitRef; + +extern IOQueue *io_queue_create(int depth, int flags); +extern void io_queue_track(IOQueue *ioq, const PgAioWaitRef *iow); +extern void io_queue_wait_one(IOQueue *ioq); +extern void io_queue_wait_all(IOQueue *ioq); +extern bool io_queue_is_empty(IOQueue *ioq); +extern void io_queue_reserve(IOQueue *ioq); +extern PgAioHandle *io_queue_acquire_io(IOQueue *ioq); +extern void io_queue_free(IOQueue *ioq); + +#endif /* IO_QUEUE_H */ diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 9d7131eff438..47ae6c36c94f 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -21,6 +21,7 @@ #include "storage/sync.h" extern const PgAioHandleCallbacks aio_md_readv_cb; +extern const PgAioHandleCallbacks aio_md_writev_cb; /* md storage manager functionality */ extern void mdinit(void); @@ -45,6 +46,10 @@ extern void mdstartreadv(PgAioHandle *ioh, extern void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void mdstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 856ebcda350e..f00b3763ac99 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -108,6 +108,11 @@ extern void smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync); +extern void smgrstartwritev(PgAioHandle *ioh, + SMgrRelation reln, ForkNumber forknum, + BlockNumber 
blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/utils/resowner.h b/src/include/utils/resowner.h index aede4bfc820a..7e2ec2241693 100644 --- a/src/include/utils/resowner.h +++ b/src/include/utils/resowner.h @@ -168,5 +168,7 @@ extern void ResourceOwnerForgetLock(ResourceOwner owner, struct LOCALLOCK *local struct dlist_node; extern void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node); extern void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node); +extern void ResourceOwnerRememberAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node); +extern void ResourceOwnerForgetAioBounceBuffer(ResourceOwner owner, struct dlist_node *bb_node); #endif /* RESOWNER_H */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 4e4be3fa511a..aa1d27bbed31 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -14,6 +14,7 @@ SUBDIRS = \ oauth_validator \ plsample \ spgist_name_ops \ + test_aio \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 2b0574514731..9de0057bd1d4 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -13,6 +13,7 @@ subdir('oauth_validator') subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') +subdir('test_aio') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_aio/.gitignore b/src/test/modules/test_aio/.gitignore new file mode 100644 index 000000000000..716e17f5a2ad --- /dev/null +++ b/src/test/modules/test_aio/.gitignore @@ -0,0 +1,2 @@ +# Generated subdirectories +/tmp_check/ diff --git a/src/test/modules/test_aio/Makefile b/src/test/modules/test_aio/Makefile new file mode 100644 index 000000000000..f53cc64671a8 --- /dev/null +++ b/src/test/modules/test_aio/Makefile @@ -0,0 +1,26 @@ +# src/test/modules/test_aio/Makefile + +PGFILEDESC = "test_aio - test code for AIO" + +MODULE_big = test_aio +OBJS = \ + $(WIN32RES) \ + test_aio.o + +EXTENSION = test_aio +DATA = test_aio--1.0.sql + +TAP_TESTS = 1 + +export enable_injection_points + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_aio +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_aio/meson.build b/src/test/modules/test_aio/meson.build new file mode 100644 index 000000000000..73d2fd68eaa1 --- /dev/null +++ b/src/test/modules/test_aio/meson.build @@ -0,0 +1,37 @@ +# Copyright (c) 2024-2025, PostgreSQL Global Development Group + +test_aio_sources = files( + 'test_aio.c', +) + +if host_system == 'windows' + test_aio_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_aio', + '--FILEDESC', 'test_aio - test code for AIO',]) +endif + +test_aio = shared_module('test_aio', + test_aio_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_aio + +test_install_data += files( + 'test_aio.control', + 'test_aio--1.0.sql', +) + +tests += { + 'name': 'test_aio', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_aio.pl', + 't/002_io_workers.pl', + ], + }, +} diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl new file mode 100644 index 000000000000..58a56c8cec24 --- /dev/null +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -0,0 +1,1504 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + + +### +# Test io_method=worker +### +my $node_worker = create_node('worker'); +$node_worker->start(); + +test_generic('worker', $node_worker); +SKIP: +{ + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject_worker('worker', $node_worker); +} + +$node_worker->stop(); + + +### +# Test io_method=io_uring +### + +if (have_io_uring()) +{ + my $node_uring = create_node('io_uring'); + $node_uring->start(); + test_generic('io_uring', $node_uring); + $node_uring->stop(); +} + + +### +# Test io_method=sync +### + +my $node_sync = create_node('sync'); + +# just to have one test not use the default auto-tuning + +$node_sync->append_conf( + 'postgresql.conf', qq( +io_max_concurrency=4 +)); + +$node_sync->start(); +test_generic('sync', $node_sync); +$node_sync->stop(); + +done_testing(); + + +### +# Test Helpers +### + +sub create_node +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + + my $io_method = shift; + + my $node = PostgreSQL::Test::Cluster->new($io_method); + + # Want to test initdb for each IO method; otherwise we could just reuse + # the cluster. + # + # Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the + # options specified by ->extra; if somebody puts -c io_method=xyz in + # PG_TEST_INITDB_EXTRA_OPTS, it would break this test. Fix that up if we + # detect it.
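+	#
+	# For example (hypothetical value): with
+	# PG_TEST_INITDB_EXTRA_OPTS='-c io_method=sync' in the environment, every
+	# node created here would otherwise silently run with io_method=sync
+	# instead of the method under test. Re-appending our own -c io_method
+	# relies on later duplicate settings taking precedence.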
+ local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS}; + if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS} + && $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/) + { + $ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method"; + } + + $node->init(extra => [ '-c', "io_method=$io_method" ]); + + $node->append_conf( + 'postgresql.conf', qq( +shared_preload_libraries=test_aio +log_min_messages = 'DEBUG3' +log_statement=all +log_error_verbosity=default +restart_after_crash=false +temp_buffers=100 +)); + + ok(1, "$io_method: initdb"); + + return $node; +} + +sub have_io_uring +{ + # To detect if io_uring is supported, we look at the error message for + # assigning an invalid value to an enum GUC, which lists all the valid + # options. We need to use -C to deal with running as administrator on + # Windows; the superuser check is omitted if -C is used. + my ($stdout, $stderr) = + run_command [qw(postgres -C invalid -c io_method=invalid)]; + die "can't determine supported io_method values" + unless $stderr =~ m/Available values: ([^\.]+)\./; + my $methods = $1; + note "supported io_method values are: $methods"; + + return ($methods =~ m/io_uring/) ? 1 : 0; +} + +sub psql_like +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $expected_stdout = shift; + my $expected_stderr = shift; + my ($cmdret, $output); + + ($output, $cmdret) = $psql->query($sql); + + like($output, $expected_stdout, "$io_method: $name: expected stdout"); + like($psql->{stderr}, $expected_stderr, + "$io_method: $name: expected stderr"); + $psql->{stderr} = ''; + + return $output; +} + +sub query_wait_block +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $io_method = shift; + my $node = shift; + my $psql = shift; + my $name = shift; + my $sql = shift; + my $waitfor = shift; + + my $pid = $psql->query_safe('SELECT pg_backend_pid()'); + + $psql->{stdin} .= qq($sql;\n); + $psql->{run}->pump_nb(); + ok(1, "$io_method: $name: issued sql"); + + $node->poll_query_until('postgres', + qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid), + $waitfor); + ok(1, "$io_method: $name: observed $waitfor wait event"); +} + +# Returns count of checksum failures for the specified database or for shared +# relations, if $datname is undefined.
+sub checksum_failures +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my $psql = shift; + my $datname = shift; + my $checksum_count; + my $checksum_last_failure; + + if (defined $datname) + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname'; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname'; +)); + } + else + { + $checksum_count = $psql->query_safe( + qq( +SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL; +)); + $checksum_last_failure = $psql->query_safe( + qq( +SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL; +)); + } + + return $checksum_count, $checksum_last_failure; +} + +### +# Sub-tests +### + +# Sanity checks for the IO handle API +sub test_handle +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning: implicit xact + psql_like( + $io_method, + $psql, + "handle_get() leak in implicit xact", + qq(SELECT handle_get()), + qr/^$/, + qr/leaked AIO handle/, + "$io_method: leaky handle_get() warns"); + + # leak warning: explicit xact + psql_like( + $io_method, $psql, + "handle_get() leak in explicit xact", + qq(BEGIN; SELECT handle_get(); COMMIT), + qr/^$/, qr/leaked AIO handle/); + + + # leak warning: explicit xact, rollback + psql_like( + $io_method, + $psql, + "handle_get() leak in explicit xact, rollback", + qq(BEGIN; SELECT handle_get(); ROLLBACK;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning: subtrans + psql_like( + $io_method, + $psql, + "handle_get() leak in subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;), + qr/^$/, + qr/leaked AIO handle/); + + # leak warning + error: released in different command (thus resowner) + psql_like( + $io_method, + $psql, + "handle_release() in different command", + qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/leaked AIO handle.*release in unexpected state/ms); + + # no leak, release in same command + psql_like( + $io_method, + $psql, + "handle_release() in same command", + qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;), + qr/^$/, + qr/^$/); + + # normal handle use + psql_like($io_method, $psql, "handle_get_release()", + qq(SELECT handle_get_release()), + qr/^$/, qr/^$/); + + # should error out, API violation + psql_like( + $io_method, + $psql, + "handle_get_twice()", + qq(SELECT handle_get_twice()), + qr/^$/, + qr/ERROR: API violation: Only one IO can be handed out$/); + + # recover after error in implicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in implicit xact", + qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in explicit xact + psql_like( + $io_method, + $psql, + "handle error recovery in explicit xact", + qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + # recover after error in subtrans + psql_like( + $io_method, + $psql, + "handle error recovery in explicit subxact", + qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;), + qr/^|ok$/, + qr/ERROR.*as you command/); + + $psql->quit(); +} + +# Sanity checks for the batchmode API +sub test_batchmode +{ + my $io_method = shift; + my $node
= shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # leak warning & recovery: implicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in implicit xact", + qq(SELECT batch_start()), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + # leak warning & recovery: explicit xact + psql_like( + $io_method, + $psql, + "batch_start() leak & cleanup in explicit xact", + qq(BEGIN; SELECT batch_start(); COMMIT;), + qr/^$/, + qr/open AIO batch at end/, + "$io_method: leaky batch_start() warns"); + + + # leak warning & recovery: explicit xact, rollback + # + # XXX: This doesn't fail right now, due to not getting a chance to do + # something at transaction command commit. That's not a correctness + # issue; it just means it's a bit harder to find buggy code. + #psql_like($io_method, $psql, + # "batch_start() leak & cleanup after abort", + # qq(BEGIN; SELECT batch_start(); ROLLBACK;), + # qr/^$/, + # qr/open AIO batch at end/, "$io_method: leaky batch_start() warns"); + + # no warning, batch closed in same command + psql_like( + $io_method, + $psql, + "batch_start(), batch_end() works", + qq(SELECT batch_start() UNION ALL SELECT batch_end()), + qr/^$/, + qr/^$/, + "$io_method: batch_start(), batch_end()"); + + $psql->quit(); +} + +# Test that simple cases of invalid pages are reported +sub test_io_error +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $psql->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_corr(data int not null); +INSERT INTO tmp_corr SELECT generate_series(1, 10000); +SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true); +)); + + foreach my $tblname (qw(tbl_corr tmp_corr)) + { + my $invalid_page_re = + $tblname eq 'tbl_corr' + ?
qr/invalid page in block 1 of relation base\/\d+\/\d+/ + : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/; + + # verify the error is reported in custom C code + psql_like( + $io_method, + $psql, + "read_rel_block_ll() of $tblname page", + qq(SELECT read_rel_block_ll('$tblname', 1)), + qr/^$/, + $invalid_page_re); + + # verify the error is reported for bufmgr reads, seq scan + psql_like( + $io_method, $psql, + "sequential scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname), + qr/^$/, $invalid_page_re); + + # verify the error is reported for bufmgr reads, tid scan + psql_like( + $io_method, + $psql, + "tid scan of $tblname block fails", + qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'), + qr/^$/, + $invalid_page_re); + } + + $psql->quit(); +} + +# Test interplay between StartBufferIO and TerminateBufferIO +sub test_startwait_io +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + + ### Verify behavior for normal tables + + # create a buffer we can play around with + my $buf_id = psql_like( + $io_method, $psql_a, + "creation of toy buffer succeeds", + qq(SELECT buffer_create_toy('tbl_ok', 1)), + qr/^\d+$/, qr/^$/); + + # check that one backend can perform StartBufferIO + psql_like( + $io_method, + $psql_a, + "first StartBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # but not twice on the same buffer (non-waiting) + psql_like( + $io_method, + $psql_a, + "second StartBufferIO fails, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + psql_like( + $io_method, + $psql_b, + "second StartBufferIO fails, other session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^f$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO without marking it as a success; this should allow the + # waiting session to start the IO + psql_like( + $io_method, + $psql_a, + "blocking start buffer io, terminating io, not valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + + # Because the IO was terminated, but not marked as valid, second session should get the right to start io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/); + ok(1, "$io_method: blocking start buffer io, can start io"); + + # terminate the IO again + $psql_b->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + + + # same as the above scenario, but mark IO as having succeeded + psql_like( + $io_method, + $psql_a, + "blocking buffer io w/ success: first start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^t$/, + qr/^$/); + + # start io in a different session, will block + query_wait_block( + $io_method, + $node, + $psql_b, + "blocking start buffer io", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + "BufferIo"); + + # Terminate the IO, marking it as success + psql_like( + $io_method, + $psql_a, +
"blocking start buffer io, terminating io, valid", + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)), + qr/^$/, + qr/^$/); + + # Because the IO was terminated, and marked as valid, second session should complete but not need io + pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/); + ok(1, "$io_method: blocking start buffer io, no need to start io"); + + # buffer is valid now, make it invalid again + $psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);)); + + + ### Verify behavior for temporary tables + + # Can't unfortunately share the code with the normal table case, there are + # too many behavioral differences. + + # create a buffer we can play around with + $psql_a->query_safe( + qq( +CREATE TEMPORARY TABLE tmp_ok(data int not null); +INSERT INTO tmp_ok SELECT generate_series(1, 10000); +)); + $buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);)); + + # check that one backend can perform StartLocalBufferIO + psql_like( + $io_method, + $psql_a, + "first StartLocalBufferIO", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer + # succeeds as well. This test mostly serves as a documentation of that + # fact. If we had actually started IO, it'd be different. + psql_like( + $io_method, + $psql_a, + "second StartLocalBufferIO succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, without marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);) + ); + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after not marking valid succeeds, same session", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);), + qr/^t$/, + qr/^$/); + + # Terminate the IO again, marking it as a success + $psql_a->query_safe( + qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);) + ); + + # Now another StartLocalBufferIO should fail, this time because the buffer + # is already valid. + psql_like( + $io_method, + $psql_a, + "StartLocalBufferIO after marking valid fails", + qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);), + qr/^f$/, + qr/^$/); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that if the backend issuing a read doesn't wait for the IO's +# completion, another backend can complete the IO +sub test_complete_foreign +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + # Issue IO without waiting for completion, then sleep + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql_b, + "completing read started by sleeping backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Issue IO without waiting for completion, then exit. + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);)); + $psql_a->reconnect_and_clear(); + + # Check that another backend can read the relevant block. 
This verifies + # that the exiting backend left the AIO in a sane state. + psql_like( + $io_method, + $psql_b, + "read buffer started by exited backend", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + # Read a tbl_corr block, then sleep. The other session will retry the IO + # and also fail. The easiest way to verify that seems to be to check + # that both messages are in the log. + my $log_location = -s $node->logfile; + $psql_a->query_safe( + qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);)); + + psql_like( + $io_method, + $psql_b, + "completing read of tbl_corr block started by other backend", + qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1), + qr/^$/, + qr/invalid page in block/); + + # The message issued for the read_rel_block_ll() should be logged at LOG level + $node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: LOG message for background read" + ); + + # But for the SELECT, it should be an ERROR + $log_location = + $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location); + ok(1, + "$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read" + ); + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we deal correctly with FDs being closed while IO is in progress +sub test_close_fd +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + psql_like( + $io_method, + $psql, + "close all FDs after read, waiting for results", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>true, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting", + qq( + SELECT read_rel_block_ll('tbl_ok', 1, + wait_complete=>false, + batchmode_enter=>true, + smgrreleaseall=>true, + batchmode_exit=>true + );), + qr/^$/, + qr/^$/); + + # Check that another backend can read the relevant block + psql_like( + $io_method, + $psql, + "close all FDs after read, no waiting, query works", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1), + qr/^1$/, + qr/^$/); + + $psql->quit(); +} + +# Tests using injection points. Mostly to exercise hard IO errors that are +# difficult to trigger without injection points.
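+#
+# Note on inj_io_short_read_attach(): judging by its uses below, its argument
+# is interpreted as an injected I/O result, i.e. a positive value is taken as
+# the number of bytes "read" and a negative value as a negated errno (see
+# errno_from_string()). This summary is inferred from the test code itself,
+# not from separate documentation.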
+sub test_inject +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # injected result matching a full-block read, so no failure expected + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, $psql, + "injection point not triggering failure", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^1$/, qr/^$/); + + # injected a read shorter than a single block, expecting error + $psql->query_safe(qq(SELECT inj_io_short_read_attach(17);)); + $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);)); + psql_like( + $io_method, + $psql, + "single block short read fails", + qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/ + ); + + # shorten multi-block read to a single block, should retry + my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0); +SELECT invalidate_rel_block('tbl_ok', 1); +SELECT invalidate_rel_block('tbl_ok', 2); +SELECT invalidate_rel_block('tbl_ok', 3); +/* gap */ +SELECT invalidate_rel_block('tbl_ok', 5); +SELECT invalidate_rel_block('tbl_ok', 6); +SELECT invalidate_rel_block('tbl_ok', 7); +SELECT invalidate_rel_block('tbl_ok', 8);); + + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);)); + psql_like( + $io_method, $psql, + "multi block short read (1 block) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # shorten multi-block read to two blocks, should retry + $psql->query_safe($inval_query); + $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);)); + + psql_like( + $io_method, $psql, + "multi block short read (2 blocks) is retried", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # verify that page verification errors are detected even as part of a + # shortened multi-block read (tbl_corr, block 1 is corrupted) + $psql->query_safe( + qq( +SELECT invalidate_rel_block('tbl_corr', 0); +SELECT invalidate_rel_block('tbl_corr', 1); +SELECT invalidate_rel_block('tbl_corr', 2); +SELECT inj_io_short_read_attach(8192); + )); + + psql_like( + $io_method, + $psql, + "shortened multi-block read detects invalid page", + qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'), + qr/^$/, + qr/ERROR:.*invalid page in block 1 of relation base\/.*/); + + # trigger a hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EIO')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + + psql_like( + $io_method, + $psql, + "first hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + psql_like( + $io_method, + $psql, + "second hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/ + ); + + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + # now the IO should be ok.
+ psql_like( + $io_method, $psql, + "recovers after hard error", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + # trigger a different hard error, should error out + $psql->query_safe( + qq( +SELECT inj_io_short_read_attach(-errno_from_string('EROFS')); +SELECT invalidate_rel_block('tbl_ok', 2); + )); + psql_like( + $io_method, + $psql, + "different hard IO error is reported", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Read-only file system/ + ); + $psql->query_safe(qq(SELECT inj_io_short_read_detach())); + + $psql->quit(); +} + +# Tests using injection points, only for io_method=worker. +# +# io_method=worker has the special case of needing to reopen files. That can +# in theory fail, because the file could be gone. That's a hard path to test +# for real, so we use an injection point to trigger it. +sub test_inject_worker +{ + my $io_method = shift; + my $node = shift; + my ($ret, $output); + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # trigger a failure to reopen, should error out, but should recover + $psql->query_safe( + qq( +SELECT inj_io_reopen_attach(); +SELECT invalidate_rel_block('tbl_ok', 1); + )); + + psql_like( + $io_method, + $psql, + "failure to open: detected", + qq(SELECT count(*) FROM tbl_ok), + qr/^$/, + qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/ + ); + + $psql->query_safe(qq(SELECT inj_io_reopen_detach();)); + + # check that we indeed recover + psql_like( + $io_method, $psql, + "failure to open: recovers", + qq(SELECT count(*) FROM tbl_ok), + qr/^10000$/, qr/^$/); + + $psql->quit(); +} + +# Verify that we handle a relation getting removed (due to a rollback or a +# DROP TABLE) while IO is ongoing for that table. +sub test_invalidate +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal unlogged temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + my $tblname = $persistency . '_transactional'; + + my $create_sql = qq( +CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200); +); + + # Verify that outstanding read IO does not cause problems with + # AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. + $psql->query_safe("BEGIN; $create_sql;"); + $psql->query_safe( + qq( +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + psql_like( + $io_method, + $psql, + "rollback of newly created $persistency table with outstanding IO", + qq(ROLLBACK), + qr/^$/, + qr/^$/); + + # Verify that outstanding read IO does not cause problems with + # CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ... + # -> Invalidate[Local]Buffer. 
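+		# (The read below is started with wait_complete=>false, so the IO
+		# can still be in flight when the relation's buffers are
+		# invalidated; that is exactly the window this test is after.)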
+ $psql->query_safe("BEGIN; $create_sql; COMMIT;"); + $psql->query_safe( + qq( +BEGIN; +SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false); +)); + + psql_like( + $io_method, $psql, + "drop $persistency table with outstanding IO", + qq(DROP TABLE $tblname), + qr/^$/, qr/^$/); + + psql_like($io_method, $psql, + "commit after drop $persistency table with outstanding IO", + qq(COMMIT), qr/^$/, qr/^$/); + } + + $psql->quit(); +} + +# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages +sub test_zero +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + my $psql_b = $node->background_psql('postgres', on_error_stop => 0); + + foreach my $persistency (qw(normal temporary)) + { + my $sql_persistency = $persistency eq 'normal' ? '' : $persistency; + + $psql_a->query_safe( + qq( +CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_zero SELECT generate_series(1, 10000); +)); + + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true); +)); + + # Check that page validity errors are detected + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: invalid page in block 0 of relation base\/.*\/.*$/ + ); + + # Check that page validity errors are zeroed + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/ + ); + + # And that once the corruption is fixed, we can read again + $psql_a->query( + qq( +SELECT modify_rel_block('tbl_zero', 0, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: test re-read of block 0", + qq( +SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)), + qr/^$/, + qr/^$/); + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 3", + qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);), + qr/^$/, + qr/^psql::\d+: WARNING: invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/ + ); + + + # Check a page validity error in another block, to ensure we report + # the correct block number + $psql_a->query_safe( + qq( +SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true); +SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true); +)); + # First test error + psql_like( + $io_method, + $psql_a, + "$persistency: test reading of invalid block 2,3 in larger read", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)), + qr/^$/, + qr/^psql::\d+: ERROR: 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via ZERO_ON_ERROR flag + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR", + qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)), + qr/^$/, + qr/^psql::\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first 
zeroed page\.\nHINT:[^\n]+$/ + ); + + # Then test zeroing via zero_damaged_pages + psql_like( + $io_method, + $psql_a, + "$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false); +), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/ + ); + + $psql_a->query_safe(qq(COMMIT)); + + + # Verify that bufmgr.c IO detects page validity errors + $psql_a->query( + qq( +SELECT invalidate_rel_block('tbl_zero', g.i) +FROM generate_series(0, 15) g(i); +SELECT modify_rel_block('tbl_zero', 3, zero=>true); +)); + $psql_a->{stderr} = ''; + + psql_like( + $io_method, + $psql_a, + "$persistency: verify reading zero_damaged_pages=off", + qq( +SELECT count(*) FROM tbl_zero), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Verify that bufmgr.c IO zeroes out pages with page validity errors + psql_like( + $io_method, + $psql_a, + "$persistency: verify zero_damaged_pages=on", + qq( +BEGIN; +SET LOCAL zero_damaged_pages = true; +SELECT count(*) FROM tbl_zero; +COMMIT; +), + qr/^\d+$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 2 of relation base\/.*\/.*$/ + ); + + # Check that warnings/errors about page validity in an IO started by + # session A that session B might complete aren't logged visibly to + # session B. + # + # This will only ever trigger for io_methods like io_uring, which can + # complete IOs in a client backend. But it doesn't seem worth + # restricting to that. + # + # This requires cross-session access to the same relation, hence the + # restriction to non-temporary tables. + if ($sql_persistency ne 'temporary') + { + # Create a corruption and then read the block without waiting for + # completion. + $psql_a->query(qq( +SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true); +SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true) +)); + + psql_like( + $io_method, + $psql_b, + "$persistency: test completing read by other session doesn't generate warning", + qq(SELECT count(*) > 0 FROM tbl_zero;), + qr/^t$/, qr/^$/); + } + + # Clean up + $psql_a->query_safe( + qq( +DROP TABLE tbl_zero; +)); + } + + $psql_a->{stderr} = ''; + + $psql_a->quit(); + $psql_b->quit(); +} + +# Test that we detect checksum failures and report them +sub test_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql_a = $node->background_psql('postgres', on_error_stop => 0); + + $psql_a->query_safe( + qq( +CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_normal SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true); + +CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_temp SELECT generate_series(1, 5000); +SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true); +)); + + # To be able to test checksum failures on shared rels we need a shared rel + # with invalid pages - which is a bit scary. pg_shseclabel seems like a + # good bet, as it's not accessed in a default configuration.
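+	# (Checksum failures of shared relations are reported in the
+	# pg_stat_database row whose datname IS NULL, which is why the
+	# checksum_failures() helper is called without a database name for the
+	# pg_shseclabel checks below.)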
$psql_a->query_safe( + qq( +SELECT grow_rel('pg_shseclabel', 4); +SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true); +SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true); +)); + + + # Check that page validity errors are detected, checksum stats increase, normal rel + my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "normal rel: test reading of invalid block 3", + qq( +SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation base\/\d+\/\d+$/ + ); + + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: normal rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: normal rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksum stats increase, temp rel + ($cs_count_before, $cs_ts_before) = checksum_failures($psql_a, 'postgres'); + psql_like( + $io_method, + $psql_a, + "temp rel: test reading of invalid block 4, valid block 5", + qq( +SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres'); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: temp rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: temp rel: checksum timestamp is not null"); + + + # Check that page validity errors are detected, checksum stats increase, shared rel + ($cs_count_before, $cs_ts_before) = checksum_failures($psql_a); + psql_like( + $io_method, + $psql_a, + "shared rel: reading of invalid blocks 2+3", + qq( +SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/ + ); + + ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a); + + cmp_ok($cs_count_before + 1, + '<=', $cs_count_after, + "$io_method: shared rel: checksum count increased"); + cmp_ok($cs_ts_after, 'ne', '', + "$io_method: shared rel: checksum timestamp is not null"); + + + # and restore sanity + $psql_a->query( + qq( +SELECT modify_rel_block('pg_shseclabel', 2, zero=>true); +SELECT modify_rel_block('pg_shseclabel', 3, zero=>true); +DROP TABLE tbl_normal; +)); + $psql_a->{stderr} = ''; + + $psql_a->quit(); +} + +# Verify checksum handling when creating database from an invalid database. +# This also serves as a minimal check that cross-database IO is handled +# reasonably. +sub test_checksum_createdb +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + $node->safe_psql('postgres', + 'CREATE DATABASE regression_createdb_source'); + + $node->safe_psql( + 'regression_createdb_source', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000); +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +)); + + my $createdb_sql = qq( +CREATE DATABASE regression_createdb_target +TEMPLATE regression_createdb_source +STRATEGY wal_log; +); + + # Verify that CREATE DATABASE of an invalid database fails and is + # accounted for accurately.
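+	# (STRATEGY wal_log copies the source database block by block through
+	# shared buffers, so it is the copy's read of the corrupted block that
+	# is expected to fail here.)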
my ($cs_count_before, $cs_ts_before) = + checksum_failures($psql, 'regression_createdb_source'); + psql_like( + $io_method, + $psql, + "create database w/ wal strategy, invalid source", + $createdb_sql, + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 1 of relation base\/\d+\/\d+$/ + ); + my ($cs_count_after, $cs_ts_after) = + checksum_failures($psql, 'regression_createdb_source'); + cmp_ok($cs_count_before + 1, '<=', $cs_count_after, + "$io_method: create database w/ wal strategy, invalid source: checksum count increased" + ); + + # Verify that CREATE DATABASE of the fixed database succeeds. + $node->safe_psql( + 'regression_createdb_source', qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +)); + psql_like($io_method, $psql, + "create database w/ wal strategy, valid source", + $createdb_sql, qr/^$/, qr/^$/); + + $psql->quit(); +} + +# Test the interaction of ignore_checksum_failure with checksum failures +# +# In several places we make sure that the server log actually contains +# individual information for each block involved in the IO. +sub test_ignore_checksum +{ + my $io_method = shift; + my $node = shift; + + my $psql = $node->background_psql('postgres', on_error_stop => 0); + + # Test setup + $psql->query_safe( + qq( +CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false); +INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000); +)); + + my $count_sql = "SELECT count(*) FROM tbl_cs_fail"; + my $invalidate_sql = qq( +SELECT invalidate_rel_block('tbl_cs_fail', g.i) +FROM generate_series(0, 6) g(i); +); + + my $expect = $psql->query_safe($count_sql); + + + # Very basic tests for ignore_checksum_failure=off / on + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true); +)); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=off fails", + $count_sql, qr/^$/, qr/ERROR: invalid page in block/); + + $psql->query_safe("SET ignore_checksum_failure=on"); + + $psql->query_safe($invalidate_sql); + psql_like($io_method, $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=on succeeds", + $count_sql, + qr/^$expect$/, + qr/WARNING: ignoring (checksum failure|\d checksum failures)/); + + + # Verify that ignore_checksum_failure=on works in multi-block reads + + $psql->query_safe( + qq( +SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +)); + + my $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of checksum failed block 3, with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: ignoring checksum failure in block 3/ + ); + + # Check that the log contains a LOG message about the failure + $log_location = + $node->wait_for_log(qr/LOG: ignoring checksum failure/, $log_location); + + # check that we error out + psql_like( + $io_method, + $psql, + "test reading of valid block 2, checksum failed 3, invalid 4, zero=false with ignore", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/\d+$/ + ); + + # Test multi-block read with different problems in
different blocks + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true); +SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true); +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true); +SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + $log_location = -s $node->logfile; + psql_like( + $io_method, + $psql, + "test reading of valid block 1, checksum failed 2, 3, invalid 3-5, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..5 of relation/ + ); + + + # Unfortunately each of the following checks has to scan the whole log + # starting at the $log_location determined above, rather than advancing + # it per test, as wait_for_log() returns the size of the file. + + $node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/, + $log_location); + ok(1, "$io_method: found information about checksum failure in block 2"); + + $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 3"); + + $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 4"); + + $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, + $log_location); + ok(1, "$io_method: found information about invalid page in block 5"); + + + # Reading a page with both an invalid header and an invalid checksum + $psql->query( + qq( +SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true); +)); + $psql->{stderr} = ''; + + psql_like( + $io_method, + $psql, + "test reading of block with both invalid header and invalid checksum, zero=false", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), + qr/^$/, + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/ + ); + + psql_like( + $io_method, + $psql, + "test reading of block 3 with both invalid header and invalid checksum, zero=true", + qq( +SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);), + qr/^$/, + qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*; zeroing out page/ + ); + + + $psql->quit(); +} + + +# Run all tests that are supported for all io_methods +sub test_generic +{ + my $io_method = shift; + my $node = shift; + + is($node->safe_psql('postgres', 'SHOW io_method'), + $io_method, "$io_method: io_method set correctly"); + + $node->safe_psql( + 'postgres', qq( +CREATE EXTENSION test_aio; +CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false); +CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false); + +INSERT INTO tbl_corr SELECT generate_series(1, 10000); +INSERT INTO tbl_ok SELECT generate_series(1, 10000); +SELECT grow_rel('tbl_corr', 16); +SELECT grow_rel('tbl_ok', 16); + +SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true); +CHECKPOINT; +)); + + test_handle($io_method, $node); + test_io_error($io_method, $node); + test_batchmode($io_method, $node); + test_startwait_io($io_method, $node); + test_complete_foreign($io_method, $node); + test_close_fd($io_method, $node); + test_invalidate($io_method, $node); + test_zero($io_method, $node); +
test_checksum($io_method, $node); + test_ignore_checksum($io_method, $node); + test_checksum_createdb($io_method, $node); + + SKIP: + { + skip 'Injection points not supported by this build', 1 + unless $ENV{enable_injection_points} eq 'yes'; + test_inject($io_method, $node); + } +} diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl new file mode 100644 index 000000000000..af5fae15ea78 --- /dev/null +++ b/src/test/modules/test_aio/t/002_io_workers.pl @@ -0,0 +1,125 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use List::Util qw(shuffle); + + +my $node = PostgreSQL::Test::Cluster->new('worker'); +$node->init(); +$node->append_conf( + 'postgresql.conf', qq( +io_method=worker +)); + +$node->start(); + +# Test changing the number of I/O worker processes while also evaluating the +# handling of their termination. +test_number_of_io_workers_dynamic($node); + +$node->stop(); + +done_testing(); + + +sub test_number_of_io_workers_dynamic +{ + my $node = shift; + + my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers'); + + # Verify that worker count can't be set to 0 + change_number_of_io_workers($node, 0, $prev_worker_count, 1); + + # Verify that worker count can't be set to 33 (above the max) + change_number_of_io_workers($node, 33, $prev_worker_count, 1); + + # Try changing IO workers to a random value and verify that the worker + # count ends up as expected. Always test the min/max of workers. + # + # Valid range for io_workers is [1, 32]. 8 tests in total seems + # reasonable. + my @io_workers_range = shuffle(1 ... 32); + foreach my $worker_count (1, 32, @io_workers_range[ 0 .. 5 ]) + { + $prev_worker_count = + change_number_of_io_workers($node, $worker_count, + $prev_worker_count, 0); + } +} + +sub change_number_of_io_workers +{ + my $node = shift; + my $worker_count = shift; + my $prev_worker_count = shift; + my $expect_failure = shift; + my ($result, $stdout, $stderr); + + ($result, $stdout, $stderr) = + $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count"); + $node->safe_psql('postgres', 'SELECT pg_reload_conf()'); + + if ($expect_failure) + { + ok( $stderr =~ + /$worker_count is outside the valid range for parameter "io_workers"/, + "updating number of io_workers to $worker_count failed, as expected" + ); + + return $prev_worker_count; + } + else + { + is( $node->safe_psql('postgres', 'SHOW io_workers'), + $worker_count, + "updating number of io_workers from $prev_worker_count to $worker_count" + ); + + check_io_worker_count($node, $worker_count); + terminate_io_worker($node, $worker_count); + check_io_worker_count($node, $worker_count); + + return $worker_count; + } +} + +sub terminate_io_worker +{ + my $node = shift; + my $worker_count = shift; + my ($pid, $ret); + + # Select a random io worker + $pid = $node->safe_psql( + 'postgres', + qq(SELECT pid FROM pg_stat_activity WHERE + backend_type = 'io worker' ORDER BY RANDOM() LIMIT 1)); + + # terminate IO worker with SIGINT + is(PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'INT', $pid), + 0, "random io worker process signalled with INT"); + + # Check that worker exits + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE pid = $pid), '0'), + "random io worker process exited after signal"); +} + +sub check_io_worker_count +{ + my $node = shift; + my $worker_count =
shift; + + ok( $node->poll_query_until( + 'postgres', + qq(SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'io worker'), + $worker_count), + "io worker count is $worker_count"); +} diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql new file mode 100644 index 000000000000..e2a812351662 --- /dev/null +++ b/src/test/modules/test_aio/test_aio--1.0.sql @@ -0,0 +1,129 @@ +/* src/test/modules/test_aio/test_aio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_aio" to load this file. \quit + + +CREATE FUNCTION errno_from_string(sym text) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION grow_rel(rel regclass, nblocks int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION modify_rel_block(rel regclass, blockno int, + zero bool DEFAULT false, + corrupt_header bool DEFAULT false, + corrupt_checksum bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION read_rel_block_ll( + rel regclass, + blockno int, + nblocks int DEFAULT 1, + wait_complete bool DEFAULT true, + batchmode_enter bool DEFAULT false, + smgrreleaseall bool DEFAULT false, + batchmode_exit bool DEFAULT false, + zero_on_error bool DEFAULT false) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_create_toy(rel regclass, blockno int4) +RETURNS pg_catalog.int4 STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_start_io(buffer int, for_input bool, nowait bool) +RETURNS pg_catalog.bool STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed bool, io_error bool, release_aio bool) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Handle related functions + */ +CREATE FUNCTION handle_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION handle_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +/* + * Batchmode related functions + */ +CREATE FUNCTION batch_start() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION batch_end() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + +CREATE FUNCTION bb_get_and_error() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get_twice() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_get_release() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION bb_release_last() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + + + +/* + * Injection point related functions + */ +CREATE FUNCTION inj_io_short_read_attach(result int) +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION 
inj_io_short_read_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_attach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION inj_io_reopen_detach() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c new file mode 100644 index 000000000000..2a7f4378ef37 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.c @@ -0,0 +1,859 @@ +/*------------------------------------------------------------------------- + * + * test_aio.c + * Helpers to write tests for AIO + * + * This module provides interface functions for C functionality to SQL, to + * make it possible to test AIO related behavior in a targeted way from SQL. + * It'd not generally be safe to export these functions to SQL, but for a test + * that's fine. + * + * Copyright (c) 2020-2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_aio/test_aio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "fmgr.h" +#include "storage/aio.h" +#include "storage/aio_internal.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/rel.h" + + +PG_MODULE_MAGIC; + + +typedef struct InjIoErrorState +{ + bool enabled_short_read; + bool enabled_reopen; + + bool short_read_result_set; + int short_read_result; +} InjIoErrorState; + +static InjIoErrorState * inj_io_error_state; + +/* Shared memory init callbacks */ +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + + +static PgAioHandle *last_handle; +static PgAioBounceBuffer *last_bb; + + + +static void +test_aio_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(sizeof(InjIoErrorState)); +} + +static void +test_aio_shmem_startup(void) +{ + bool found; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Create or attach to the shared memory state */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + inj_io_error_state = ShmemInitStruct("injection_points", + sizeof(InjIoErrorState), + &found); + + if (!found) + { + /* First time through, initialize */ + inj_io_error_state->enabled_short_read = false; + inj_io_error_state->enabled_reopen = false; + +#ifdef USE_INJECTION_POINTS + InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED", + "test_aio", + "inj_io_short_read", + NULL, + 0); + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + + InjectionPointAttach("AIO_WORKER_AFTER_REOPEN", + "test_aio", + "inj_io_reopen", + NULL, + 0); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + +#endif + } + else + { + /* + * Pre-load the injection points now, so we can call them in a + * critical section. 
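+		 * InjectionPointLoad() resolves the callback and caches it in backend-local memory, so that later invocations do not need to allocate, which would be unsafe inside a critical section.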
+ */ +#ifdef USE_INJECTION_POINTS + InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED"); + InjectionPointLoad("AIO_WORKER_AFTER_REOPEN"); + elog(LOG, "injection point loaded"); +#endif + } + + LWLockRelease(AddinShmemInitLock); +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = test_aio_shmem_request; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = test_aio_shmem_startup; +} + + +PG_FUNCTION_INFO_V1(errno_from_string); +Datum +errno_from_string(PG_FUNCTION_ARGS) +{ + const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (strcmp(sym, "EIO") == 0) + PG_RETURN_INT32(EIO); + else if (strcmp(sym, "EAGAIN") == 0) + PG_RETURN_INT32(EAGAIN); + else if (strcmp(sym, "EINTR") == 0) + PG_RETURN_INT32(EINTR); + else if (strcmp(sym, "ENOSPC") == 0) + PG_RETURN_INT32(ENOSPC); + else if (strcmp(sym, "EROFS") == 0) + PG_RETURN_INT32(EROFS); + + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("%s is not a supported errno value", sym)); + PG_RETURN_INT32(0); +} + +PG_FUNCTION_INFO_V1(grow_rel); +Datum +grow_rel(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + uint32 nblocks = PG_GETARG_UINT32(1); + Relation rel; +#define MAX_BUFFERS_TO_EXTEND_BY 64 + Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY]; + + rel = relation_open(relid, AccessExclusiveLock); + + while (nblocks > 0) + { + uint32 extend_by_pages; + + extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY); + + ExtendBufferedRelBy(BMR_REL(rel), + MAIN_FORKNUM, + NULL, + 0, + extend_by_pages, + victim_buffers, + &extend_by_pages); + + nblocks -= extend_by_pages; + + for (uint32 i = 0; i < extend_by_pages; i++) + { + ReleaseBuffer(victim_buffers[i]); + } + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(modify_rel_block); +Datum +modify_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + bool zero = PG_GETARG_BOOL(2); + bool corrupt_header = PG_GETARG_BOOL(3); + bool corrupt_checksum = PG_GETARG_BOOL(4); + Page page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + Relation rel; + Buffer buf; + PageHeader ph; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, NULL); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * copy the page to local memory, seems nicer than to directly modify in + * the buffer pool. + */ + memcpy(page, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + ReleaseBuffer(buf); + + /* + * Don't want to have a buffer in-memory that's marked valid where the + * on-disk contents are invalid. Particularly not if the in-memory buffer + * could be dirty... + * + * While we hold an AEL on the relation nobody else should be able to read + * the buffer in. + * + * NB: This is probably racy, better don't copy this to non-test code. + */ + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else + EvictUnpinnedBuffer(buf); + + /* + * Now modify the page as asked for by the caller. 
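+	 * Zeroing happens first; if the page is empty afterwards (or was to begin with) and corruption was requested, it is re-initialized so that there is a plausible header to corrupt.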
+	 */ + if (zero) + memset(page, 0, BufferGetPageSize(buf)); + + if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum)) + PageInit(page, BufferGetPageSize(buf), 0); + + ph = (PageHeader) page; + + if (corrupt_header) + ph->pd_special = BLCKSZ + 1; + + if (corrupt_checksum) + { + bool successfully_corrupted = false; + + /* + * Any single modification of the checksum could just end up being + * valid again. To be sure the page really fails verification, retry + * until the stored checksum differs from the computed one. + */ + for (int i = 0; i < 100; i++) + { + uint16 verify_checksum; + uint16 old_checksum; + + old_checksum = ph->pd_checksum; + ph->pd_checksum = old_checksum + 1; + + elog(LOG, "corrupting checksum of blk %u from %u to %u", + blkno, old_checksum, ph->pd_checksum); + + verify_checksum = pg_checksum_page(page, blkno); + if (verify_checksum != ph->pd_checksum) + { + successfully_corrupted = true; + break; + } + } + + if (!successfully_corrupted) + elog(ERROR, "could not corrupt checksum, what's going on?"); + } + else + { + PageSetChecksumInplace(page, blkno); + } + + smgrwrite(RelationGetSmgr(rel), + MAIN_FORKNUM, blkno, page, true); + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +/* + * Ensures a buffer for rel & blkno is in the buffer pool, without actually + * caring about the buffer contents. Used to set up test scenarios. + */ +static Buffer +create_toy_buffer(Relation rel, BlockNumber blkno) +{ + Buffer buf; + BufferDesc *buf_hdr; + uint32 buf_state; + bool was_pinned = false; + + /* place buffer in shared buffers without erroring out */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (RelationUsesLocalBuffers(rel)) + { + buf_hdr = GetLocalBufferDescriptor(-buf - 1); + buf_state = pg_atomic_read_u32(&buf_hdr->state); + } + else + { + buf_hdr = GetBufferDescriptor(buf - 1); + buf_state = LockBufHdr(buf_hdr); + } + + /* + * We should be the only backend accessing this buffer. This is just a + * small bit of belt-and-suspenders defense; none of this code should ever + * run in a cluster with real data. + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) > 1) + was_pinned = true; + else + buf_state &= ~(BM_VALID | BM_DIRTY); + + if (RelationUsesLocalBuffers(rel)) + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + else + UnlockBufHdr(buf_hdr, buf_state); + + if (was_pinned) + elog(ERROR, "toy buffer %d was already pinned", + buf); + + return buf; +} + +/* + * A "low level" read. This does similar things to what + * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and + * less sanity). + */ +PG_FUNCTION_INFO_V1(read_rel_block_ll); +Datum +read_rel_block_ll(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + int nblocks = PG_GETARG_INT32(2); + bool wait_complete = PG_GETARG_BOOL(3); + bool batchmode_enter = PG_GETARG_BOOL(4); + bool call_smgrreleaseall = PG_GETARG_BOOL(5); + bool batchmode_exit = PG_GETARG_BOOL(6); + bool zero_on_error = PG_GETARG_BOOL(7); + Relation rel; + Buffer bufs[PG_IOV_MAX]; + BufferDesc *buf_hdrs[PG_IOV_MAX]; + Page pages[PG_IOV_MAX]; + uint8 srb_flags = 0; + PgAioReturn ior; + PgAioHandle *ioh; + PgAioWaitRef iow; + SMgrRelation smgr; + + if (nblocks <= 0 || nblocks > PG_IOV_MAX) + elog(ERROR, "nblocks is out of range"); + + rel = relation_open(relid, AccessExclusiveLock); + + for (int i = 0; i < nblocks; i++) + { + bufs[i] = create_toy_buffer(rel, blkno + i); + pages[i] = BufferGetBlock(bufs[i]); + buf_hdrs[i] = BufferIsLocal(bufs[i]) ?
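+			/* local Buffer ids are negative, shared ones positive; both map onto 0-based descriptor arrays */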
+ GetLocalBufferDescriptor(-bufs[i] - 1) : + GetBufferDescriptor(bufs[i] - 1); + } + + smgr = RelationGetSmgr(rel); + + pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid); + + ioh = pgaio_io_acquire(CurrentResourceOwner, &ior); + pgaio_io_get_wref(ioh, &iow); + + if (RelationUsesLocalBuffers(rel)) + { + for (int i = 0; i < nblocks; i++) + StartLocalBufferIO(buf_hdrs[i], true, false); + pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL); + } + else + { + for (int i = 0; i < nblocks; i++) + StartBufferIO(buf_hdrs[i], true, false); + } + + pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks); + + if (zero_on_error | zero_damaged_pages) + srb_flags |= READ_BUFFERS_ZERO_ON_ERROR; + if (ignore_checksum_failure) + srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES; + + pgaio_io_register_callbacks(ioh, + RelationUsesLocalBuffers(rel) ? + PGAIO_HCB_LOCAL_BUFFER_READV : + PGAIO_HCB_SHARED_BUFFER_READV, + srb_flags); + + if (batchmode_enter) + pgaio_enter_batchmode(); + + smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno, + (void *) pages, nblocks); + + if (call_smgrreleaseall) + smgrreleaseall(); + + if (batchmode_exit) + pgaio_exit_batchmode(); + + for (int i = 0; i < nblocks; i++) + ReleaseBuffer(bufs[i]); + + if (wait_complete) + { + pgaio_wref_wait(&iow); + + if (ior.result.status != PGAIO_RS_OK) + pgaio_result_report(ior.result, + &ior.target_data, + ior.result.status == PGAIO_RS_ERROR ? + ERROR : WARNING); + } + + relation_close(rel, NoLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(invalidate_rel_block); +Datum +invalidate_rel_block(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + PrefetchBufferResult pr; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + /* + * This is a gross hack, but there's no other API exposed that allows to + * get a buffer ID without actually reading the block in. + */ + pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno); + buf = pr.recent_buffer; + + if (BufferIsValid(buf)) + { + /* if the buffer contents aren't valid, this'll return false */ + if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf)) + { + BufferDesc *buf_hdr = BufferIsLocal(buf) ? 
+ GetLocalBufferDescriptor(-buf - 1) + : GetBufferDescriptor(buf - 1); + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY) + { + if (BufferIsLocal(buf)) + FlushLocalBuffer(buf_hdr, NULL); + else + FlushOneBuffer(buf); + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + if (BufferIsLocal(buf)) + InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true); + else if (!EvictUnpinnedBuffer(buf)) + elog(ERROR, "couldn't evict"); + } + } + + relation_close(rel, AccessExclusiveLock); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(buffer_create_toy); +Datum +buffer_create_toy(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + BlockNumber blkno = PG_GETARG_UINT32(1); + Relation rel; + Buffer buf; + + rel = relation_open(relid, AccessExclusiveLock); + + buf = create_toy_buffer(rel, blkno); + ReleaseBuffer(buf); + + relation_close(rel, NoLock); + + PG_RETURN_INT32(buf); +} + +PG_FUNCTION_INFO_V1(buffer_call_start_io); +Datum +buffer_call_start_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool nowait = PG_GETARG_BOOL(2); + bool can_start; + + if (BufferIsLocal(buf)) + can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + for_input, nowait); + else + can_start = StartBufferIO(GetBufferDescriptor(buf - 1), + for_input, nowait); + + /* + * For tests we don't want the resowner release preventing us from + * orchestrating odd scenarios. + */ + if (can_start && !BufferIsLocal(buf)) + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + buf); + + ereport(LOG, + errmsg("buffer %d after StartBufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_BOOL(can_start); +} + +PG_FUNCTION_INFO_V1(buffer_call_terminate_io); +Datum +buffer_call_terminate_io(PG_FUNCTION_ARGS) +{ + Buffer buf = PG_GETARG_INT32(0); + bool for_input = PG_GETARG_BOOL(1); + bool succeed = PG_GETARG_BOOL(2); + bool io_error = PG_GETARG_BOOL(3); + bool release_aio = PG_GETARG_BOOL(4); + bool clear_dirty = false; + uint32 set_flag_bits = 0; + + if (io_error) + set_flag_bits |= BM_IO_ERROR; + + if (for_input) + { + clear_dirty = false; + + if (succeed) + set_flag_bits |= BM_VALID; + } + else + { + if (succeed) + clear_dirty = true; + } + + ereport(LOG, + errmsg("buffer %d before Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + if (BufferIsLocal(buf)) + TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1), + clear_dirty, set_flag_bits, release_aio); + else + TerminateBufferIO(GetBufferDescriptor(buf - 1), + clear_dirty, set_flag_bits, false, release_aio); + + ereport(LOG, + errmsg("buffer %d after Terminate[Local]BufferIO: %s", + buf, DebugPrintBufferRefcount(buf)), + errhidestmt(true), errhidecontext(true)); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get); +Datum +handle_get(PG_FUNCTION_ARGS) +{ + last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_release_last); +Datum +handle_release_last(PG_FUNCTION_ARGS) +{ + if (!last_handle) + elog(ERROR, "no handle"); + + pgaio_io_release(last_handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_and_error); +Datum +handle_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_io_acquire(CurrentResourceOwner, NULL); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_twice); +Datum +handle_get_twice(PG_FUNCTION_ARGS) +{ + 
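+	/* Acquiring a second handle while the first is still handed out is expected to be rejected by the AIO subsystem as an API violation. */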
pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_acquire(CurrentResourceOwner, NULL); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(handle_get_release); +Datum +handle_get_release(PG_FUNCTION_ARGS) +{ + PgAioHandle *handle; + + handle = pgaio_io_acquire(CurrentResourceOwner, NULL); + pgaio_io_release(handle); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_start); +Datum +batch_start(PG_FUNCTION_ARGS) +{ + pgaio_enter_batchmode(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(batch_end); +Datum +batch_end(PG_FUNCTION_ARGS) +{ + pgaio_exit_batchmode(); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get); +Datum +bb_get(PG_FUNCTION_ARGS) +{ + last_bb = pgaio_bounce_buffer_get(); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_release_last); +Datum +bb_release_last(PG_FUNCTION_ARGS) +{ + if (!last_bb) + elog(ERROR, "no bb"); + + pgaio_bounce_buffer_release(last_bb); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get_and_error); +Datum +bb_get_and_error(PG_FUNCTION_ARGS) +{ + pgaio_bounce_buffer_get(); + + elog(ERROR, "as you command"); + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(bb_get_twice); +Datum +bb_get_twice(PG_FUNCTION_ARGS) +{ + pgaio_bounce_buffer_get(); + pgaio_bounce_buffer_get(); + + PG_RETURN_VOID(); +} + + +PG_FUNCTION_INFO_V1(bb_get_release); +Datum +bb_get_release(PG_FUNCTION_ARGS) +{ + PgAioBounceBuffer *bb; + + bb = pgaio_bounce_buffer_get(); + pgaio_bounce_buffer_release(bb); + + PG_RETURN_VOID(); +} + +#ifdef USE_INJECTION_POINTS +extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data); +extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data); + +void +inj_io_short_read(const char *name, const void *private_data) +{ + PgAioHandle *ioh; + + ereport(LOG, + errmsg("short read injection point called, is enabled: %d", + inj_io_error_state->enabled_short_read), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_short_read) + { + ioh = pgaio_inj_io_get(); + + /* + * Only shorten reads whose result is at least as long as the target + * size, otherwise we would turn the short read into an over-read. + */ + if (inj_io_error_state->short_read_result_set + && ioh->op == PGAIO_OP_READV + && inj_io_error_state->short_read_result <= ioh->result) + { + struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off]; + int32 old_result = ioh->result; + int32 new_result = inj_io_error_state->short_read_result; + int32 processed = 0; + + ereport(LOG, + errmsg("short read injection point, changing result from %d to %d", + old_result, new_result), + errhidestmt(true), errhidecontext(true)); + + /* + * The underlying IO actually completed OK, and thus the "invalid" + * portion of the IOV actually contains valid data. That can hide + * a lot of problems, e.g. if we were to wrongly mark as valid a + * buffer that, according to the shortened read, was never read, + * its contents would look valid and we might miss a bug. + * + * To avoid that, iterate through the IOV and zero out the + * "failed" portion of the IO.
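+			 * Note that only the tail beyond the shortened result is zeroed; iovec entries lying entirely within the new result are left untouched.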
+ */ + for (int i = 0; i < ioh->op_data.read.iov_length; i++) + { + if (processed + iov[i].iov_len <= new_result) + processed += iov[i].iov_len; + else if (processed <= new_result) + { + uint32 ok_part = new_result - processed; + + memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part); + processed += iov[i].iov_len; + } + else + { + memset((char *) iov[i].iov_base, 0, iov[i].iov_len); + } + } + + ioh->result = new_result; + } + } +} + +void +inj_io_reopen(const char *name, const void *private_data) +{ + ereport(LOG, + errmsg("reopen injection point called, is enabled: %d", + inj_io_error_state->enabled_reopen), + errhidestmt(true), errhidecontext(true)); + + if (inj_io_error_state->enabled_reopen) + elog(ERROR, "injection point triggering failure to reopen "); +} +#endif + +PG_FUNCTION_INFO_V1(inj_io_short_read_attach); +Datum +inj_io_short_read_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = true; + inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0); + if (inj_io_error_state->short_read_result_set) + inj_io_error_state->short_read_result = PG_GETARG_INT32(0); +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_short_read_detach); +Datum +inj_io_short_read_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_short_read = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_attach); +Datum +inj_io_reopen_attach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = true; +#else + elog(ERROR, "injection points not supported"); +#endif + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(inj_io_reopen_detach); +Datum +inj_io_reopen_detach(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + inj_io_error_state->enabled_reopen = false; +#else + elog(ERROR, "injection points not supported"); +#endif + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_aio/test_aio.control b/src/test/modules/test_aio/test_aio.control new file mode 100644 index 000000000000..cd91c3ed16b9 --- /dev/null +++ b/src/test/modules/test_aio/test_aio.control @@ -0,0 +1,3 @@ +comment = 'Test code for AIO' +default_version = '1.0' +module_pathname = '$libdir/test_aio' diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 954f549555e2..5588d83e1bfb 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3132,6 +3132,12 @@ DROP USER regress_locktable_user; -- switch to superuser \c - CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no has_table_privilege --------------------- @@ -3145,6 +3151,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT (1 row) GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes has_table_privilege --------------------- @@ -3159,6 +3171,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT -- run query to ensure 
that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; + ok +---- + t +(1 row) + SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; ok ---- diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 474789691357..d9533deb04e8 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1286,6 +1286,22 @@ drop table cchild; SELECT viewname, definition FROM pg_views WHERE schemaname = 'pg_catalog' ORDER BY viewname; +pg_aios| SELECT pid, + io_id, + io_generation, + state, + operation, + off, + length, + target, + handle_data_len, + raw_result, + result, + target_desc, + f_sync, + f_localmem, + f_buffered + FROM pg_get_aios() pg_get_aios(pid, io_id, io_generation, state, operation, off, length, target, handle_data_len, raw_result, result, target_desc, f_sync, f_localmem, f_buffered); pg_available_extension_versions| SELECT e.name, e.version, (x.extname IS NOT NULL) AS installed, diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index b81694c24f28..286b1d037569 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1919,16 +1919,19 @@ DROP USER regress_locktable_user; CREATE ROLE regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; +SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; +SELECT COUNT(*) >= 0 AS ok FROM pg_aios; SELECT COUNT(*) >= 0 AS ok FROM pg_backend_memory_contexts; SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_allocations; RESET ROLE; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b66cecd87991..e89c501fa8af 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -349,6 +349,7 @@ BufferManagerRelation BufferStrategyControl BufferTag BufferUsage +BuffersToWrite BuildAccumulator BuiltinScript BulkInsertState @@ -1196,6 +1197,7 @@ IOContext IOFuncSelector IOObject IOOp +IOQueue IO_STATUS_BLOCK IPCompareMethod ITEM @@ -2137,6 +2139,7 @@ PermutationStep PermutationStepBlocker PermutationStepBlockerType PgAioBackend +PgAioBounceBuffer PgAioCtl PgAioHandle PgAioHandleCallbackID @@ -3021,6 +3024,7 @@ TocEntry TokenAuxData TokenizedAuthLine TrackItem +TrackedIO TransApplyAction TransInvalidationInfo TransState
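A minimal usage sketch of the SQL interface added above, assuming a scratch cluster with the test_aio module available; the table name and parameter choices here are illustrative only:

CREATE EXTENSION test_aio;
CREATE TABLE aio_demo (i int);
SELECT grow_rel('aio_demo'::regclass, 4);           -- ensure blocks 0..3 exist
SELECT modify_rel_block('aio_demo'::regclass, 1,
                        corrupt_checksum => true);  -- corrupt block 1 on disk
SELECT read_rel_block_ll('aio_demo'::regclass, 1,
                         zero_on_error => true);    -- low-level AIO read of that block
SELECT pid, state, operation, result
FROM pg_aios;                                       -- usually empty once the read has completed

The injection-point based functions additionally require the module in shared_preload_libraries and a build with injection points enabled, which the TAP tests above check via the enable_injection_points environment variable.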