--- /dev/null
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+
+###
+# Test io_method=worker
+###
+my $node_worker = create_node('worker');
+$node_worker->start();
+
+test_generic('worker', $node_worker);
+SKIP:
+{
+ skip 'Injection points not supported by this build', 1
+ unless $ENV{enable_injection_points} eq 'yes';
+ test_inject_worker('worker', $node_worker);
+}
+
+$node_worker->stop();
+
+
+###
+# Test io_method=io_uring
+###
+
+if (have_io_uring())
+{
+ my $node_uring = create_node('io_uring');
+ $node_uring->start();
+ test_generic('io_uring', $node_uring);
+ $node_uring->stop();
+}
+
+
+###
+# Test io_method=sync
+###
+
+my $node_sync = create_node('sync');
+
+# just to have one test not use the default auto-tuning
+
+$node_sync->append_conf(
+ 'postgresql.conf', qq(
+io_max_concurrency=4
+));
+
+$node_sync->start();
+test_generic('sync', $node_sync);
+$node_sync->stop();
+
+done_testing();
+
+
+###
+# Test Helpers
+###
+
+sub create_node
+{
+ local $Test::Builder::Level = $Test::Builder::Level + 1;
+
+ my $io_method = shift;
+
+ my $node = PostgreSQL::Test::Cluster->new($io_method);
+
+ # Want to test initdb for each IO method, otherwise we could just reuse
+ # the cluster.
+ #
+ # Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the
+ # options specified by ->extra, if somebody puts -c io_method=xyz in
+ # PG_TEST_INITDB_EXTRA_OPTS it would break this test. Fix that up if we
+ # detect it.
+ local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS};
+ if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS}
+ && $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/)
+ {
+ $ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method";
+ }
+
+ $node->init(extra => [ '-c', "io_method=$io_method" ]);
+
+ $node->append_conf(
+ 'postgresql.conf', qq(
+shared_preload_libraries=test_aio
+log_min_messages = 'DEBUG3'
+log_statement=all
+log_error_verbosity=default
+restart_after_crash=false
+temp_buffers=100
+));
+
+ ok(1, "$io_method: initdb");
+
+ return $node;
+}
+
+sub have_io_uring
+{
+ # To detect if io_uring is supported, we look at the error message for
+ # assigning an invalid value to an enum GUC, which lists all the valid
+ # options. We need to use -C to deal with running as administrator on
+ # windows, the superuser check is omitted if -C is used.
+ my ($stdout, $stderr) =
+ run_command [qw(postgres -C invalid -c io_method=invalid)];
+ die "can't determine supported io_method values"
+ unless $stderr =~ m/Available values: ([^\.]+)\./;
+ my $methods = $1;
+ note "supported io_method values are: $methods";
+
+ return ($methods =~ m/io_uring/) ? 1 : 0;
+}
+
+sub psql_like
+{
+ local $Test::Builder::Level = $Test::Builder::Level + 1;
+ my $io_method = shift;
+ my $psql = shift;
+ my $name = shift;
+ my $sql = shift;
+ my $expected_stdout = shift;
+ my $expected_stderr = shift;
+ my ($cmdret, $output);
+
+ ($output, $cmdret) = $psql->query($sql);
+
+ like($output, $expected_stdout, "$io_method: $name: expected stdout");
+ like($psql->{stderr}, $expected_stderr,
+ "$io_method: $name: expected stderr");
+ $psql->{stderr} = '';
+
+ return $output;
+}
+
+sub query_wait_block
+{
+ local $Test::Builder::Level = $Test::Builder::Level + 1;
+ my $io_method = shift;
+ my $node = shift;
+ my $psql = shift;
+ my $name = shift;
+ my $sql = shift;
+ my $waitfor = shift;
+
+ my $pid = $psql->query_safe('SELECT pg_backend_pid()');
+
+ $psql->{stdin} .= qq($sql;\n);
+ $psql->{run}->pump_nb();
+ ok(1, "$io_method: $name: issued sql");
+
+ $node->poll_query_until('postgres',
+ qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid),
+ $waitfor);
+ ok(1, "$io_method: $name: observed $waitfor wait event");
+}
+
+# Returns count of checksum failures for the specified database or for shared
+# relations, if $datname is undefined.
+sub checksum_failures
+{
+ local $Test::Builder::Level = $Test::Builder::Level + 1;
+ my $psql = shift;
+ my $datname = shift;
+ my $checksum_count;
+ my $checksum_last_failure;
+
+ if (defined $datname)
+ {
+ $checksum_count = $psql->query_safe(
+ qq(
+SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname';
+));
+ $checksum_last_failure = $psql->query_safe(
+ qq(
+SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname';
+));
+ }
+ else
+ {
+ $checksum_count = $psql->query_safe(
+ qq(
+SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL;
+));
+ $checksum_last_failure = $psql->query_safe(
+ qq(
+SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL;
+));
+ }
+
+ return $checksum_count, $checksum_last_failure;
+}
+
+###
+# Sub-tests
+###
+
+# Sanity checks for the IO handle API
+sub test_handle
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ # leak warning: implicit xact
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_get() leak in implicit xact",
+ qq(SELECT handle_get()),
+ qr/^$/,
+ qr/leaked AIO handle/,
+ "$io_method: leaky handle_get() warns");
+
+ # leak warning: explicit xact
+ psql_like(
+ $io_method, $psql,
+ "handle_get() leak in explicit xact",
+ qq(BEGIN; SELECT handle_get(); COMMIT),
+ qr/^$/, qr/leaked AIO handle/);
+
+
+ # leak warning: explicit xact, rollback
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_get() leak in explicit xact, rollback",
+ qq(BEGIN; SELECT handle_get(); ROLLBACK;),
+ qr/^$/,
+ qr/leaked AIO handle/);
+
+ # leak warning: subtrans
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_get() leak in subxact",
+ qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;),
+ qr/^$/,
+ qr/leaked AIO handle/);
+
+ # leak warning + error: released in different command (thus resowner)
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_release() in different command",
+ qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;),
+ qr/^$/,
+ qr/leaked AIO handle.*release in unexpected state/ms);
+
+ # no leak, release in same command
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_release() in same command",
+ qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;),
+ qr/^$/,
+ qr/^$/);
+
+ # normal handle use
+ psql_like($io_method, $psql, "handle_get_release()",
+ qq(SELECT handle_get_release()),
+ qr/^$/, qr/^$/);
+
+ # should error out, API violation
+ psql_like(
+ $io_method,
+ $psql,
+ "handle_get_twice()",
+ qq(SELECT handle_get_twice()),
+ qr/^$/,
+ qr/ERROR: API violation: Only one IO can be handed out$/);
+
+ # recover after error in implicit xact
+ psql_like(
+ $io_method,
+ $psql,
+ "handle error recovery in implicit xact",
+ qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()),
+ qr/^|ok$/,
+ qr/ERROR.*as you command/);
+
+ # recover after error in implicit xact
+ psql_like(
+ $io_method,
+ $psql,
+ "handle error recovery in explicit xact",
+ qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;),
+ qr/^|ok$/,
+ qr/ERROR.*as you command/);
+
+ # recover after error in subtrans
+ psql_like(
+ $io_method,
+ $psql,
+ "handle error recovery in explicit subxact",
+ qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;),
+ qr/^|ok$/,
+ qr/ERROR.*as you command/);
+
+ $psql->quit();
+}
+
+# Sanity checks for the batchmode API
+sub test_batchmode
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ # leak warning & recovery: implicit xact
+ psql_like(
+ $io_method,
+ $psql,
+ "batch_start() leak & cleanup in implicit xact",
+ qq(SELECT batch_start()),
+ qr/^$/,
+ qr/open AIO batch at end/,
+ "$io_method: leaky batch_start() warns");
+
+ # leak warning & recovery: explicit xact
+ psql_like(
+ $io_method,
+ $psql,
+ "batch_start() leak & cleanup in explicit xact",
+ qq(BEGIN; SELECT batch_start(); COMMIT;),
+ qr/^$/,
+ qr/open AIO batch at end/,
+ "$io_method: leaky batch_start() warns");
+
+
+ # leak warning & recovery: explicit xact, rollback
+ #
+ # XXX: This doesn't fail right now, due to not getting a chance to do
+ # something at transaction command commit. That's not a correctness issue,
+ # it just means it's a bit harder to find buggy code.
+ #psql_like($io_method, $psql,
+ # "batch_start() leak & cleanup after abort",
+ # qq(BEGIN; SELECT batch_start(); ROLLBACK;),
+ # qr/^$/,
+ # qr/open AIO batch at end/, "$io_method: leaky batch_start() warns");
+
+ # no warning, batch closed in same command
+ psql_like(
+ $io_method,
+ $psql,
+ "batch_start(), batch_end() works",
+ qq(SELECT batch_start() UNION ALL SELECT batch_end()),
+ qr/^$/,
+ qr/^$/,
+ "$io_method: batch_start(), batch_end()");
+
+ $psql->quit();
+}
+
+# Test that simple cases of invalid pages are reported
+sub test_io_error
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ $psql->query_safe(
+ qq(
+CREATE TEMPORARY TABLE tmp_corr(data int not null);
+INSERT INTO tmp_corr SELECT generate_series(1, 10000);
+SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
+));
+
+ foreach my $tblname (qw(tbl_corr tmp_corr))
+ {
+ my $invalid_page_re =
+ $tblname eq 'tbl_corr'
+ ? qr/invalid page in block 1 of relation base\/\d+\/\d+/
+ : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/;
+
+ # verify the error is reported in custom C code
+ psql_like(
+ $io_method,
+ $psql,
+ "read_rel_block_ll() of $tblname page",
+ qq(SELECT read_rel_block_ll('$tblname', 1)),
+ qr/^$/,
+ $invalid_page_re);
+
+ # verify the error is reported for bufmgr reads, seq scan
+ psql_like(
+ $io_method, $psql,
+ "sequential scan of $tblname block fails",
+ qq(SELECT count(*) FROM $tblname),
+ qr/^$/, $invalid_page_re);
+
+ # verify the error is reported for bufmgr reads, tid scan
+ psql_like(
+ $io_method,
+ $psql,
+ "tid scan of $tblname block fails",
+ qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'),
+ qr/^$/,
+ $invalid_page_re);
+ }
+
+ $psql->quit();
+}
+
+# Test interplay between StartBufferIO and TerminateBufferIO
+sub test_startwait_io
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+ my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+
+
+ ### Verify behavior for normal tables
+
+ # create a buffer we can play around with
+ my $buf_id = psql_like(
+ $io_method, $psql_a,
+ "creation of toy buffer succeeds",
+ qq(SELECT buffer_create_toy('tbl_ok', 1)),
+ qr/^\d+$/, qr/^$/);
+
+ # check that one backend can perform StartBufferIO
+ psql_like(
+ $io_method,
+ $psql_a,
+ "first StartBufferIO",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+ qr/^t$/,
+ qr/^$/);
+
+ # but not twice on the same buffer (non-waiting)
+ psql_like(
+ $io_method,
+ $psql_a,
+ "second StartBufferIO fails, same session",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+ qr/^f$/,
+ qr/^$/);
+ psql_like(
+ $io_method,
+ $psql_b,
+ "second StartBufferIO fails, other session",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+ qr/^f$/,
+ qr/^$/);
+
+ # start io in a different session, will block
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_b,
+ "blocking start buffer io",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+ "BufferIo");
+
+ # Terminate the IO, without marking it as success, this should trigger the
+ # waiting session to be able to start the io
+ psql_like(
+ $io_method,
+ $psql_a,
+ "blocking start buffer io, terminating io, not valid",
+ qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)),
+ qr/^$/,
+ qr/^$/);
+
+
+ # Because the IO was terminated, but not marked as valid, second session should get the right to start io
+ pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/);
+ ok(1, "$io_method: blocking start buffer io, can start io");
+
+ # terminate the IO again
+ $psql_b->query_safe(
+ qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);)
+ );
+
+
+ # same as the above scenario, but mark IO as having succeeded
+ psql_like(
+ $io_method,
+ $psql_a,
+ "blocking buffer io w/ success: first start buffer io",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+ qr/^t$/,
+ qr/^$/);
+
+ # start io in a different session, will block
+ query_wait_block(
+ $io_method,
+ $node,
+ $psql_b,
+ "blocking start buffer io",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+ "BufferIo");
+
+ # Terminate the IO, marking it as success
+ psql_like(
+ $io_method,
+ $psql_a,
+ "blocking start buffer io, terminating io, valid",
+ qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)),
+ qr/^$/,
+ qr/^$/);
+
+ # Because the IO was terminated, and marked as valid, second session should complete but not need io
+ pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/);
+ ok(1, "$io_method: blocking start buffer io, no need to start io");
+
+ # buffer is valid now, make it invalid again
+ $psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);));
+
+
+ ### Verify behavior for temporary tables
+
+ # Can't unfortunately share the code with the normal table case, there are
+ # too many behavioral differences.
+
+ # create a buffer we can play around with
+ $psql_a->query_safe(
+ qq(
+CREATE TEMPORARY TABLE tmp_ok(data int not null);
+INSERT INTO tmp_ok SELECT generate_series(1, 10000);
+));
+ $buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);));
+
+ # check that one backend can perform StartLocalBufferIO
+ psql_like(
+ $io_method,
+ $psql_a,
+ "first StartLocalBufferIO",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+ qr/^t$/,
+ qr/^$/);
+
+ # Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer
+ # succeeds as well. This test mostly serves as a documentation of that
+ # fact. If we had actually started IO, it'd be different.
+ psql_like(
+ $io_method,
+ $psql_a,
+ "second StartLocalBufferIO succeeds, same session",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+ qr/^t$/,
+ qr/^$/);
+
+ # Terminate the IO again, without marking it as a success
+ $psql_a->query_safe(
+ qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);)
+ );
+ psql_like(
+ $io_method,
+ $psql_a,
+ "StartLocalBufferIO after not marking valid succeeds, same session",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+ qr/^t$/,
+ qr/^$/);
+
+ # Terminate the IO again, marking it as a success
+ $psql_a->query_safe(
+ qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);)
+ );
+
+ # Now another StartLocalBufferIO should fail, this time because the buffer
+ # is already valid.
+ psql_like(
+ $io_method,
+ $psql_a,
+ "StartLocalBufferIO after marking valid fails",
+ qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+ qr/^f$/,
+ qr/^$/);
+
+ $psql_a->quit();
+ $psql_b->quit();
+}
+
+# Test that if the backend issuing a read doesn't wait for the IO's
+# completion, another backend can complete the IO
+sub test_complete_foreign
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+ my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+
+ # Issue IO without waiting for completion, then sleep
+ $psql_a->query_safe(
+ qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);));
+
+ # Check that another backend can read the relevant block
+ psql_like(
+ $io_method,
+ $psql_b,
+ "completing read started by sleeping backend",
+ qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+ qr/^1$/,
+ qr/^$/);
+
+ # Issue IO without waiting for completion, then exit.
+ $psql_a->query_safe(
+ qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);));
+ $psql_a->reconnect_and_clear();
+
+ # Check that another backend can read the relevant block. This verifies
+ # that the exiting backend left the AIO in a sane state.
+ psql_like(
+ $io_method,
+ $psql_b,
+ "read buffer started by exited backend",
+ qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+ qr/^1$/,
+ qr/^$/);
+
+ # Read a tbl_corr block, then sleep. The other session will retry the IO
+ # and also fail. The easiest thing to verify that seems to be to check
+ # that both are in the log.
+ my $log_location = -s $node->logfile;
+ $psql_a->query_safe(
+ qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);));
+
+ psql_like(
+ $io_method,
+ $psql_b,
+ "completing read of tbl_corr block started by other backend",
+ qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1),
+ qr/^$/,
+ qr/invalid page in block/);
+
+ # The log message issued for the read_rel_block_ll() should be logged as a LOG
+ $node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location);
+ ok(1,
+ "$io_method: completing read of tbl_corr block started by other backend: LOG message for background read"
+ );
+
+ # But for the SELECT, it should be an ERROR
+ $log_location =
+ $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location);
+ ok(1,
+ "$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read"
+ );
+
+ $psql_a->quit();
+ $psql_b->quit();
+}
+
+# Test that we deal correctly with FDs being closed while IO is in progress
+sub test_close_fd
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ psql_like(
+ $io_method,
+ $psql,
+ "close all FDs after read, waiting for results",
+ qq(
+ SELECT read_rel_block_ll('tbl_ok', 1,
+ wait_complete=>true,
+ batchmode_enter=>true,
+ smgrreleaseall=>true,
+ batchmode_exit=>true
+ );),
+ qr/^$/,
+ qr/^$/);
+
+ psql_like(
+ $io_method,
+ $psql,
+ "close all FDs after read, no waiting",
+ qq(
+ SELECT read_rel_block_ll('tbl_ok', 1,
+ wait_complete=>false,
+ batchmode_enter=>true,
+ smgrreleaseall=>true,
+ batchmode_exit=>true
+ );),
+ qr/^$/,
+ qr/^$/);
+
+ # Check that another backend can read the relevant block
+ psql_like(
+ $io_method,
+ $psql,
+ "close all FDs after read, no waiting, query works",
+ qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+ qr/^1$/,
+ qr/^$/);
+
+ $psql->quit();
+}
+
+# Tests using injection points. Mostly to exercise hard IO errors that are
+# hard to trigger without using injection points.
+sub test_inject
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ # injected what we'd expect
+ $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);));
+ $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);));
+ psql_like(
+ $io_method, $psql,
+ "injection point not triggering failure",
+ qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'),
+ qr/^1$/, qr/^$/);
+
+ # injected a read shorter than a single block, expecting error
+ $psql->query_safe(qq(SELECT inj_io_short_read_attach(17);));
+ $psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);));
+ psql_like(
+ $io_method,
+ $psql,
+ "single block short read fails",
+ qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'),
+ qr/^$/,
+ qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/
+ );
+
+ # shorten multi-block read to a single block, should retry
+ my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0);
+SELECT invalidate_rel_block('tbl_ok', 1);
+SELECT invalidate_rel_block('tbl_ok', 2);
+SELECT invalidate_rel_block('tbl_ok', 3);
+/* gap */
+SELECT invalidate_rel_block('tbl_ok', 5);
+SELECT invalidate_rel_block('tbl_ok', 6);
+SELECT invalidate_rel_block('tbl_ok', 7);
+SELECT invalidate_rel_block('tbl_ok', 8););
+
+ $psql->query_safe($inval_query);
+ $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);));
+ psql_like(
+ $io_method, $psql,
+ "multi block short read (1 block) is retried",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^10000$/, qr/^$/);
+
+ # shorten multi-block read to two blocks, should retry
+ $psql->query_safe($inval_query);
+ $psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);));
+
+ psql_like(
+ $io_method, $psql,
+ "multi block short read (2 blocks) is retried",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^10000$/, qr/^$/);
+
+ # verify that page verification errors are detected even as part of a
+ # shortened multi-block read (tbl_corr, block 1 is corrupted)
+ $psql->query_safe(
+ qq(
+SELECT invalidate_rel_block('tbl_corr', 0);
+SELECT invalidate_rel_block('tbl_corr', 1);
+SELECT invalidate_rel_block('tbl_corr', 2);
+SELECT inj_io_short_read_attach(8192);
+ ));
+
+ psql_like(
+ $io_method,
+ $psql,
+ "shortened multi-block read detects invalid page",
+ qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'),
+ qr/^$/,
+ qr/ERROR:.*invalid page in block 1 of relation base\/.*/);
+
+ # trigger a hard error, should error out
+ $psql->query_safe(
+ qq(
+SELECT inj_io_short_read_attach(-errno_from_string('EIO'));
+SELECT invalidate_rel_block('tbl_ok', 2);
+ ));
+
+ psql_like(
+ $io_method,
+ $psql,
+ "first hard IO error is reported",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^$/,
+ qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/
+ );
+
+ psql_like(
+ $io_method,
+ $psql,
+ "second hard IO error is reported",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^$/,
+ qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/
+ );
+
+ $psql->query_safe(qq(SELECT inj_io_short_read_detach()));
+
+ # now the IO should be ok.
+ psql_like(
+ $io_method, $psql,
+ "recovers after hard error",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^10000$/, qr/^$/);
+
+ # trigger a different hard error, should error out
+ $psql->query_safe(
+ qq(
+SELECT inj_io_short_read_attach(-errno_from_string('EROFS'));
+SELECT invalidate_rel_block('tbl_ok', 2);
+ ));
+ psql_like(
+ $io_method,
+ $psql,
+ "different hard IO error is reported",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^$/,
+ qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Read-only file system/
+ );
+ $psql->query_safe(qq(SELECT inj_io_short_read_detach()));
+
+ $psql->quit();
+}
+
+# Tests using injection points, only for io_method=worker.
+#
+# io_method=worker has the special case of needing to reopen files. That can
+# in theory fail, because the file could be gone. That's a hard path to test
+# for real, so we use an injection point to trigger it.
+sub test_inject_worker
+{
+ my $io_method = shift;
+ my $node = shift;
+ my ($ret, $output);
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ # trigger a failure to reopen, should error out, but should recover
+ $psql->query_safe(
+ qq(
+SELECT inj_io_reopen_attach();
+SELECT invalidate_rel_block('tbl_ok', 1);
+ ));
+
+ psql_like(
+ $io_method,
+ $psql,
+ "failure to open: detected",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^$/,
+ qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/
+ );
+
+ $psql->query_safe(qq(SELECT inj_io_reopen_detach();));
+
+ # check that we indeed recover
+ psql_like(
+ $io_method, $psql,
+ "failure to open: recovers",
+ qq(SELECT count(*) FROM tbl_ok),
+ qr/^10000$/, qr/^$/);
+
+ $psql->quit();
+}
+
+# Verify that we handle a relation getting removed (due to a rollback or a
+# DROP TABLE) while IO is ongoing for that table.
+sub test_invalidate
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ foreach my $persistency (qw(normal unlogged temporary))
+ {
+ my $sql_persistency = $persistency eq 'normal' ? '' : $persistency;
+ my $tblname = $persistency . '_transactional';
+
+ my $create_sql = qq(
+CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200);
+);
+
+ # Verify that outstanding read IO does not cause problems with
+ # AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ...
+ # -> Invalidate[Local]Buffer.
+ $psql->query_safe("BEGIN; $create_sql;");
+ $psql->query_safe(
+ qq(
+SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false);
+));
+ psql_like(
+ $io_method,
+ $psql,
+ "rollback of newly created $persistency table with outstanding IO",
+ qq(ROLLBACK),
+ qr/^$/,
+ qr/^$/);
+
+ # Verify that outstanding read IO does not cause problems with
+ # CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ...
+ # -> Invalidate[Local]Buffer.
+ $psql->query_safe("BEGIN; $create_sql; COMMIT;");
+ $psql->query_safe(
+ qq(
+BEGIN;
+SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false);
+));
+
+ psql_like(
+ $io_method, $psql,
+ "drop $persistency table with outstanding IO",
+ qq(DROP TABLE $tblname),
+ qr/^$/, qr/^$/);
+
+ psql_like($io_method, $psql,
+ "commit after drop $persistency table with outstanding IO",
+ qq(COMMIT), qr/^$/, qr/^$/);
+ }
+
+ $psql->quit();
+}
+
+# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages
+sub test_zero
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+ my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+
+ foreach my $persistency (qw(normal temporary))
+ {
+ my $sql_persistency = $persistency eq 'normal' ? '' : $persistency;
+
+ $psql_a->query_safe(
+ qq(
+CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_zero SELECT generate_series(1, 10000);
+));
+
+ $psql_a->query_safe(
+ qq(
+SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true);
+));
+
+ # Check that page validity errors are detected
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test reading of invalid block 0",
+ qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 0 of relation base\/.*\/.*$/
+ );
+
+ # Check that page validity errors are zeroed
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test zeroing of invalid block 0",
+ qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/
+ );
+
+ # And that once the corruption is fixed, we can read again
+ $psql_a->query(
+ qq(
+SELECT modify_rel_block('tbl_zero', 0, zero=>true);
+));
+ $psql_a->{stderr} = '';
+
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test re-read of block 0",
+ qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)),
+ qr/^$/,
+ qr/^$/);
+
+ # Check a page validity error in another block, to ensure we report
+ # the correct block number
+ $psql_a->query_safe(
+ qq(
+SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true);
+));
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test zeroing of invalid block 3",
+ qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/
+ );
+
+
+ # Check one read reporting multiple invalid blocks
+ $psql_a->query_safe(
+ qq(
+SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true);
+SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true);
+));
+ # First test error
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test reading of invalid block 2,3 in larger read",
+ qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/
+ );
+
+ # Then test zeroing via ZERO_ON_ERROR flag
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR",
+ qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/
+ );
+
+ # Then test zeroing via zero_damaged_pages
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages",
+ qq(
+BEGIN;
+SET LOCAL zero_damaged_pages = true;
+SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)
+COMMIT;
+),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL: Block 2 held first zeroed page\.\nHINT:[^\n]+$/
+ );
+
+ $psql_a->query_safe(qq(COMMIT));
+
+
+ # Verify that bufmgr.c IO detects page validity errors
+ $psql_a->query(
+ qq(
+SELECT invalidate_rel_block('tbl_zero', g.i)
+FROM generate_series(0, 15) g(i);
+SELECT modify_rel_block('tbl_zero', 3, zero=>true);
+));
+ $psql_a->{stderr} = '';
+
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: verify reading zero_damaged_pages=off",
+ qq(
+SELECT count(*) FROM tbl_zero),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 2 of relation base\/.*\/.*$/
+ );
+
+ # Verify that bufmgr.c IO zeroes out pages with page validity errors
+ psql_like(
+ $io_method,
+ $psql_a,
+ "$persistency: verify zero_damaged_pages=on",
+ qq(
+BEGIN;
+SET LOCAL zero_damaged_pages = true;
+SELECT count(*) FROM tbl_zero;
+COMMIT;
+),
+ qr/^\d+$/,
+ qr/^psql:<stdin>:\d+: WARNING: invalid page in block 2 of relation base\/.*\/.*$/
+ );
+
+ # Check that warnings/errors about page validity in an IO started by
+ # session A that session B might complete aren't logged visibly to
+ # session B.
+ #
+ # This will only ever trigger for io_method's like io_uring, that can
+ # complete IO's in a client backend. But it doesn't seem worth
+ # restricting to that.
+ #
+ # This requires cross-session access to the same relation, hence the
+ # restriction to non-temporary table.
+ if ($sql_persistency ne 'temporary')
+ {
+ # Create a corruption and then read the block without waiting for
+ # completion.
+ $psql_a->query(qq(
+SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true);
+SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true)
+));
+
+ psql_like(
+ $io_method,
+ $psql_b,
+ "$persistency: test completing read by other session doesn't generate warning",
+ qq(SELECT count(*) > 0 FROM tbl_zero;),
+ qr/^t$/, qr/^$/);
+ }
+
+ # Clean up
+ $psql_a->query_safe(
+ qq(
+DROP TABLE tbl_zero;
+));
+ }
+
+ $psql_a->{stderr} = '';
+
+ $psql_a->quit();
+ $psql_b->quit();
+}
+
+# Test that we detect checksum failures and report them
+sub test_checksum
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+
+ $psql_a->query_safe(
+ qq(
+CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_normal SELECT generate_series(1, 5000);
+SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true);
+
+CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_temp SELECT generate_series(1, 5000);
+SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true);
+));
+
+ # To be able to test checksum failures on shared rels we need a shared rel
+ # with invalid pages - which is a bit scary. pg_shseclabel seems like a
+ # good bet, as it's not accessed in a default configuration.
+ $psql_a->query_safe(
+ qq(
+SELECT grow_rel('pg_shseclabel', 4);
+SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true);
+SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true);
+));
+
+
+ # Check that page validity errors are detected, checksums stats increase, normal rel
+ my ($cs_count_before, $cs_ts_before) =
+ checksum_failures($psql_a, 'postgres');
+ psql_like(
+ $io_method,
+ $psql_a,
+ "normal rel: test reading of invalid block 3",
+ qq(
+SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation base\/\d+\/\d+$/
+ );
+
+ my ($cs_count_after, $cs_ts_after) =
+ checksum_failures($psql_a, 'postgres');
+
+ cmp_ok($cs_count_before + 1,
+ '<=', $cs_count_after,
+ "$io_method: normal rel: checksum count increased");
+ cmp_ok($cs_ts_after, 'ne', '',
+ "$io_method: normal rel: checksum timestamp is not null");
+
+
+ # Check that page validity errors are detected, checksums stats increase, temp rel
+ ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres');
+ psql_like(
+ $io_method,
+ $psql_a,
+ "temp rel: test reading of invalid block 4, valid block 5",
+ qq(
+SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/
+ );
+
+ ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres');
+
+ cmp_ok($cs_count_before + 1,
+ '<=', $cs_count_after,
+ "$io_method: temp rel: checksum count increased");
+ cmp_ok($cs_ts_after, 'ne', '',
+ "$io_method: temp rel: checksum timestamp is not null");
+
+
+ # Check that page validity errors are detected, checksums stats increase, shared rel
+ ($cs_count_before, $cs_ts_after) = checksum_failures($psql_a);
+ psql_like(
+ $io_method,
+ $psql_a,
+ "shared rel: reading of invalid blocks 2+3",
+ qq(
+SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: 2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL: Block 2 held first invalid page\.\nHINT:[^\n]+$/
+ );
+
+ ($cs_count_after, $cs_ts_after) = checksum_failures($psql_a);
+
+ cmp_ok($cs_count_before + 1,
+ '<=', $cs_count_after,
+ "$io_method: shared rel: checksum count increased");
+ cmp_ok($cs_ts_after, 'ne', '',
+ "$io_method: shared rel: checksum timestamp is not null");
+
+
+ # and restore sanity
+ $psql_a->query(
+ qq(
+SELECT modify_rel_block('pg_shseclabel', 1, zero=>true);
+DROP TABLE tbl_normal;
+));
+ $psql_a->{stderr} = '';
+
+ $psql_a->quit();
+}
+
+# Verify checksum handling when creating database from a database with an
+# invalid block. This also serves as a minimal check that cross-database IO is
+# handled reasonably.
+sub test_checksum_createdb
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ $node->safe_psql('postgres',
+ 'CREATE DATABASE regression_createdb_source');
+
+ $node->safe_psql(
+ 'regression_createdb_source', qq(
+CREATE EXTENSION test_aio;
+CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000);
+SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true);
+));
+
+ my $createdb_sql = qq(
+CREATE DATABASE regression_createdb_target
+TEMPLATE regression_createdb_source
+STRATEGY wal_log;
+);
+
+ # Verify that CREATE DATABASE of an invalid database fails and is
+ # accounted for accurately.
+ my ($cs_count_before, $cs_ts_before) =
+ checksum_failures($psql, 'regression_createdb_source');
+ psql_like(
+ $io_method,
+ $psql,
+ "create database w/ wal strategy, invalid source",
+ $createdb_sql,
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 1 of relation base\/\d+\/\d+$/
+ );
+ my ($cs_count_after, $cs_ts_after) =
+ checksum_failures($psql, 'regression_createdb_source');
+ cmp_ok($cs_count_before + 1, '<=', $cs_count_after,
+ "$io_method: create database w/ wal strategy, invalid source: checksum count increased"
+ );
+
+ # Verify that CREATE DATABASE of the fixed database succeeds.
+ $node->safe_psql(
+ 'regression_createdb_source', qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true);
+));
+ psql_like($io_method, $psql,
+ "create database w/ wal strategy, valid source",
+ $createdb_sql, qr/^$/, qr/^$/);
+
+ $psql->quit();
+}
+
+# Test that we detect checksum failures and report them
+#
+# In several places we make sure that the server log actually contains
+# individual information for each block involved in the IO.
+sub test_ignore_checksum
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+ # Test setup
+ $psql->query_safe(
+ qq(
+CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000);
+));
+
+ my $count_sql = "SELECT count(*) FROM tbl_cs_fail";
+ my $invalidate_sql = qq(
+SELECT invalidate_rel_block('tbl_cs_fail', g.i)
+FROM generate_series(0, 6) g(i);
+);
+
+ my $expect = $psql->query_safe($count_sql);
+
+
+ # Very basic tests for ignore_checksum_failure=off / on
+
+ $psql->query_safe(
+ qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true);
+));
+
+ $psql->query_safe($invalidate_sql);
+ psql_like($io_method, $psql,
+ "reading block w/ wrong checksum with ignore_checksum_failure=off fails",
+ $count_sql, qr/^$/, qr/ERROR: invalid page in block/);
+
+ $psql->query_safe("SET ignore_checksum_failure=on");
+
+ $psql->query_safe($invalidate_sql);
+ psql_like($io_method, $psql,
+ "reading block w/ wrong checksum with ignore_checksum_failure=off succeeds",
+ $count_sql,
+ qr/^$expect$/,
+ qr/WARNING: ignoring (checksum failure|\d checksum failures)/);
+
+
+ # Verify that ignore_checksum_failure=off works in multi-block reads
+
+ $psql->query_safe(
+ qq(
+SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true);
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true);
+));
+
+ my $log_location = -s $node->logfile;
+ psql_like(
+ $io_method,
+ $psql,
+ "test reading of checksum failed block 3, with ignore",
+ qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: ignoring checksum failure in block 3/
+ );
+
+ # Check that the log contains a LOG message about the failure
+ $log_location =
+ $node->wait_for_log(qr/LOG: ignoring checksum failure/, $log_location);
+
+ # check that we error
+ psql_like(
+ $io_method,
+ $psql,
+ "test reading of valid block 2, checksum failed 3, invalid 4, zero=false with ignore",
+ qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 4 of relation base\/\d+\/\d+$/
+ );
+
+ # Test multi-block read with different problems in different blocks
+ $psql->query(
+ qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true);
+SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true);
+SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true);
+SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true);
+));
+ $psql->{stderr} = '';
+
+ $log_location = -s $node->logfile;
+ psql_like(
+ $io_method,
+ $psql,
+ "test reading of valid block 1, checksum failed 2, 3, invalid 3-5, zero=true",
+ qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..5 of relation/
+ );
+
+
+ # Unfortunately have to scan the whole log since determining $log_location
+ # above in each of the tests, as wait_for_log() returns the size of the
+ # file.
+
+ $node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/,
+ $log_location);
+ ok(1, "$io_method: found information about checksum failure in block 2");
+
+ $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/,
+ $log_location);
+ ok(1, "$io_method: found information about invalid page in block 3");
+
+ $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/,
+ $log_location);
+ ok(1, "$io_method: found information about checksum failure in block 4");
+
+ $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/,
+ $log_location);
+ ok(1, "$io_method: found information about checksum failure in block 5");
+
+
+ # Reading a page with both an invalid header and an invalid checksum
+ $psql->query(
+ qq(
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true);
+));
+ $psql->{stderr} = '';
+
+ psql_like(
+ $io_method,
+ $psql,
+ "test reading of block with both invalid header and invalid checksum, zero=false",
+ qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/
+ );
+
+ psql_like(
+ $io_method,
+ $psql,
+ "test reading of block 3 with both invalid header and invalid checksum, zero=true",
+ qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);),
+ qr/^$/,
+ qr/^psql:<stdin>:\d+: WARNING: invalid page in block 3 of relation base\/.*; zeroing out page/
+ );
+
+
+ $psql->quit();
+}
+
+
+# Run all tests that are supported for all io_methods
+sub test_generic
+{
+ my $io_method = shift;
+ my $node = shift;
+
+ is($node->safe_psql('postgres', 'SHOW io_method'),
+ $io_method, "$io_method: io_method set correctly");
+
+ $node->safe_psql(
+ 'postgres', qq(
+CREATE EXTENSION test_aio;
+CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+
+INSERT INTO tbl_corr SELECT generate_series(1, 10000);
+INSERT INTO tbl_ok SELECT generate_series(1, 10000);
+SELECT grow_rel('tbl_corr', 16);
+SELECT grow_rel('tbl_ok', 16);
+
+SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true);
+CHECKPOINT;
+));
+
+ test_handle($io_method, $node);
+ test_io_error($io_method, $node);
+ test_batchmode($io_method, $node);
+ test_startwait_io($io_method, $node);
+ test_complete_foreign($io_method, $node);
+ test_close_fd($io_method, $node);
+ test_invalidate($io_method, $node);
+ test_zero($io_method, $node);
+ test_checksum($io_method, $node);
+ test_ignore_checksum($io_method, $node);
+ test_checksum_createdb($io_method, $node);
+
+ SKIP:
+ {
+ skip 'Injection points not supported by this build', 1
+ unless $ENV{enable_injection_points} eq 'yes';
+ test_inject($io_method, $node);
+ }
+}
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * test_aio.c
+ * Helpers to write tests for AIO
+ *
+ * This module provides interface functions for C functionality to SQL, to
+ * make it possible to test AIO related behavior in a targeted way from SQL.
+ * It'd not generally be safe to export these functions to SQL, but for a test
+ * that's fine.
+ *
+ * Copyright (c) 2020-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/test/modules/test_aio/test_aio.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relation.h"
+#include "fmgr.h"
+#include "storage/aio.h"
+#include "storage/aio_internal.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/injection_point.h"
+#include "utils/rel.h"
+
+
+PG_MODULE_MAGIC;
+
+
+typedef struct InjIoErrorState
+{
+ bool enabled_short_read;
+ bool enabled_reopen;
+
+ bool short_read_result_set;
+ int short_read_result;
+} InjIoErrorState;
+
+static InjIoErrorState * inj_io_error_state;
+
+/* Shared memory init callbacks */
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+
+static PgAioHandle *last_handle;
+
+
+
+static void
+test_aio_shmem_request(void)
+{
+ if (prev_shmem_request_hook)
+ prev_shmem_request_hook();
+
+ RequestAddinShmemSpace(sizeof(InjIoErrorState));
+}
+
+static void
+test_aio_shmem_startup(void)
+{
+ bool found;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Create or attach to the shared memory state */
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+ inj_io_error_state = ShmemInitStruct("injection_points",
+ sizeof(InjIoErrorState),
+ &found);
+
+ if (!found)
+ {
+ /* First time through, initialize */
+ inj_io_error_state->enabled_short_read = false;
+ inj_io_error_state->enabled_reopen = false;
+
+#ifdef USE_INJECTION_POINTS
+ InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED",
+ "test_aio",
+ "inj_io_short_read",
+ NULL,
+ 0);
+ InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED");
+
+ InjectionPointAttach("AIO_WORKER_AFTER_REOPEN",
+ "test_aio",
+ "inj_io_reopen",
+ NULL,
+ 0);
+ InjectionPointLoad("AIO_WORKER_AFTER_REOPEN");
+
+#endif
+ }
+ else
+ {
+ /*
+ * Pre-load the injection points now, so we can call them in a
+ * critical section.
+ */
+#ifdef USE_INJECTION_POINTS
+ InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED");
+ InjectionPointLoad("AIO_WORKER_AFTER_REOPEN");
+ elog(LOG, "injection point loaded");
+#endif
+ }
+
+ LWLockRelease(AddinShmemInitLock);
+}
+
+void
+_PG_init(void)
+{
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+
+ prev_shmem_request_hook = shmem_request_hook;
+ shmem_request_hook = test_aio_shmem_request;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = test_aio_shmem_startup;
+}
+
+
+PG_FUNCTION_INFO_V1(errno_from_string);
+Datum
+errno_from_string(PG_FUNCTION_ARGS)
+{
+ const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0));
+
+ if (strcmp(sym, "EIO") == 0)
+ PG_RETURN_INT32(EIO);
+ else if (strcmp(sym, "EAGAIN") == 0)
+ PG_RETURN_INT32(EAGAIN);
+ else if (strcmp(sym, "EINTR") == 0)
+ PG_RETURN_INT32(EINTR);
+ else if (strcmp(sym, "ENOSPC") == 0)
+ PG_RETURN_INT32(ENOSPC);
+ else if (strcmp(sym, "EROFS") == 0)
+ PG_RETURN_INT32(EROFS);
+
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_internal("%s is not a supported errno value", sym));
+ PG_RETURN_INT32(0);
+}
+
+PG_FUNCTION_INFO_V1(grow_rel);
+Datum
+grow_rel(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ uint32 nblocks = PG_GETARG_UINT32(1);
+ Relation rel;
+#define MAX_BUFFERS_TO_EXTEND_BY 64
+ Buffer victim_buffers[MAX_BUFFERS_TO_EXTEND_BY];
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ while (nblocks > 0)
+ {
+ uint32 extend_by_pages;
+
+ extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY);
+
+ ExtendBufferedRelBy(BMR_REL(rel),
+ MAIN_FORKNUM,
+ NULL,
+ 0,
+ extend_by_pages,
+ victim_buffers,
+ &extend_by_pages);
+
+ nblocks -= extend_by_pages;
+
+ for (uint32 i = 0; i < extend_by_pages; i++)
+ {
+ ReleaseBuffer(victim_buffers[i]);
+ }
+ }
+
+ relation_close(rel, NoLock);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(modify_rel_block);
+Datum
+modify_rel_block(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ BlockNumber blkno = PG_GETARG_UINT32(1);
+ bool zero = PG_GETARG_BOOL(2);
+ bool corrupt_header = PG_GETARG_BOOL(3);
+ bool corrupt_checksum = PG_GETARG_BOOL(4);
+ Page page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ Relation rel;
+ Buffer buf;
+ PageHeader ph;
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+ RBM_ZERO_ON_ERROR, NULL);
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * copy the page to local memory, seems nicer than to directly modify in
+ * the buffer pool.
+ */
+ memcpy(page, BufferGetPage(buf), BLCKSZ);
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ ReleaseBuffer(buf);
+
+ /*
+ * Don't want to have a buffer in-memory that's marked valid where the
+ * on-disk contents are invalid. Particularly not if the in-memory buffer
+ * could be dirty...
+ *
+ * While we hold an AEL on the relation nobody else should be able to read
+ * the buffer in.
+ *
+ * NB: This is probably racy, better don't copy this to non-test code.
+ */
+ if (BufferIsLocal(buf))
+ InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true);
+ else
+ EvictUnpinnedBuffer(buf);
+
+ /*
+ * Now modify the page as asked for by the caller.
+ */
+ if (zero)
+ memset(page, 0, BufferGetPageSize(buf));
+
+ if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum))
+ PageInit(page, BufferGetPageSize(buf), 0);
+
+ ph = (PageHeader) page;
+
+ if (corrupt_header)
+ ph->pd_special = BLCKSZ + 1;
+
+ if (corrupt_checksum)
+ {
+ bool successfully_corrupted = 0;
+
+ /*
+ * Any single modification of the checksum could just end up being
+ * valid again, due to e.g. corrupt_header changing the data in a way
+ * that'd result in the "corrupted" checksum, or the checksum already
+ * being invalid. Retry in that, unlikely, case.
+ */
+ for (int i = 0; i < 100; i++)
+ {
+ uint16 verify_checksum;
+ uint16 old_checksum;
+
+ old_checksum = ph->pd_checksum;
+ ph->pd_checksum = old_checksum + 1;
+
+ elog(LOG, "corrupting checksum of blk %u from %u to %u",
+ blkno, old_checksum, ph->pd_checksum);
+
+ verify_checksum = pg_checksum_page(page, blkno);
+ if (verify_checksum != ph->pd_checksum)
+ {
+ successfully_corrupted = true;
+ break;
+ }
+ }
+
+ if (!successfully_corrupted)
+ elog(ERROR, "could not corrupt checksum, what's going on?");
+ }
+ else
+ {
+ PageSetChecksumInplace(page, blkno);
+ }
+
+ smgrwrite(RelationGetSmgr(rel),
+ MAIN_FORKNUM, blkno, page, true);
+
+ relation_close(rel, NoLock);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Ensures a buffer for rel & blkno is in shared buffers, without actually
+ * caring about the buffer contents. Used to set up test scenarios.
+ */
+static Buffer
+create_toy_buffer(Relation rel, BlockNumber blkno)
+{
+ Buffer buf;
+ BufferDesc *buf_hdr;
+ uint32 buf_state;
+ bool was_pinned = false;
+
+ /* place buffer in shared buffers without erroring out */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (RelationUsesLocalBuffers(rel))
+ {
+ buf_hdr = GetLocalBufferDescriptor(-buf - 1);
+ buf_state = pg_atomic_read_u32(&buf_hdr->state);
+ }
+ else
+ {
+ buf_hdr = GetBufferDescriptor(buf - 1);
+ buf_state = LockBufHdr(buf_hdr);
+ }
+
+ /*
+ * We should be the only backend accessing this buffer. This is just a
+ * small bit of belt-and-suspenders defense, none of this code should ever
+ * run in a cluster with real data.
+ */
+ if (BUF_STATE_GET_REFCOUNT(buf_state) > 1)
+ was_pinned = true;
+ else
+ buf_state &= ~(BM_VALID | BM_DIRTY);
+
+ if (RelationUsesLocalBuffers(rel))
+ pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+ else
+ UnlockBufHdr(buf_hdr, buf_state);
+
+ if (was_pinned)
+ elog(ERROR, "toy buffer %d was already pinned",
+ buf);
+
+ return buf;
+}
+
+/*
+ * A "low level" read. This does similar things to what
+ * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and
+ * less sanity).
+ */
+PG_FUNCTION_INFO_V1(read_rel_block_ll);
+Datum
+read_rel_block_ll(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ BlockNumber blkno = PG_GETARG_UINT32(1);
+ int nblocks = PG_GETARG_INT32(2);
+ bool wait_complete = PG_GETARG_BOOL(3);
+ bool batchmode_enter = PG_GETARG_BOOL(4);
+ bool call_smgrreleaseall = PG_GETARG_BOOL(5);
+ bool batchmode_exit = PG_GETARG_BOOL(6);
+ bool zero_on_error = PG_GETARG_BOOL(7);
+ Relation rel;
+ Buffer bufs[PG_IOV_MAX];
+ BufferDesc *buf_hdrs[PG_IOV_MAX];
+ Page pages[PG_IOV_MAX];
+ uint8 srb_flags = 0;
+ PgAioReturn ior;
+ PgAioHandle *ioh;
+ PgAioWaitRef iow;
+ SMgrRelation smgr;
+
+ if (nblocks <= 0 || nblocks > PG_IOV_MAX)
+ elog(ERROR, "nblocks is out of range");
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ for (int i = 0; i < nblocks; i++)
+ {
+ bufs[i] = create_toy_buffer(rel, blkno + i);
+ pages[i] = BufferGetBlock(bufs[i]);
+ buf_hdrs[i] = BufferIsLocal(bufs[i]) ?
+ GetLocalBufferDescriptor(-bufs[i] - 1) :
+ GetBufferDescriptor(bufs[i] - 1);
+ }
+
+ smgr = RelationGetSmgr(rel);
+
+ pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid);
+
+ ioh = pgaio_io_acquire(CurrentResourceOwner, &ior);
+ pgaio_io_get_wref(ioh, &iow);
+
+ if (RelationUsesLocalBuffers(rel))
+ {
+ for (int i = 0; i < nblocks; i++)
+ StartLocalBufferIO(buf_hdrs[i], true, false);
+ pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL);
+ }
+ else
+ {
+ for (int i = 0; i < nblocks; i++)
+ StartBufferIO(buf_hdrs[i], true, false);
+ }
+
+ pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks);
+
+ if (zero_on_error | zero_damaged_pages)
+ srb_flags |= READ_BUFFERS_ZERO_ON_ERROR;
+ if (ignore_checksum_failure)
+ srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
+
+ pgaio_io_register_callbacks(ioh,
+ RelationUsesLocalBuffers(rel) ?
+ PGAIO_HCB_LOCAL_BUFFER_READV :
+ PGAIO_HCB_SHARED_BUFFER_READV,
+ srb_flags);
+
+ if (batchmode_enter)
+ pgaio_enter_batchmode();
+
+ smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno,
+ (void *) pages, nblocks);
+
+ if (call_smgrreleaseall)
+ smgrreleaseall();
+
+ if (batchmode_exit)
+ pgaio_exit_batchmode();
+
+ for (int i = 0; i < nblocks; i++)
+ ReleaseBuffer(bufs[i]);
+
+ if (wait_complete)
+ {
+ pgaio_wref_wait(&iow);
+
+ if (ior.result.status != PGAIO_RS_OK)
+ pgaio_result_report(ior.result,
+ &ior.target_data,
+ ior.result.status == PGAIO_RS_ERROR ?
+ ERROR : WARNING);
+ }
+
+ relation_close(rel, NoLock);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(invalidate_rel_block);
+Datum
+invalidate_rel_block(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ BlockNumber blkno = PG_GETARG_UINT32(1);
+ Relation rel;
+ PrefetchBufferResult pr;
+ Buffer buf;
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ /*
+ * This is a gross hack, but there's no other API exposed that allows to
+ * get a buffer ID without actually reading the block in.
+ */
+ pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
+ buf = pr.recent_buffer;
+
+ if (BufferIsValid(buf))
+ {
+ /* if the buffer contents aren't valid, this'll return false */
+ if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf))
+ {
+ BufferDesc *buf_hdr = BufferIsLocal(buf) ?
+ GetLocalBufferDescriptor(-buf - 1)
+ : GetBufferDescriptor(buf - 1);
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY)
+ {
+ if (BufferIsLocal(buf))
+ FlushLocalBuffer(buf_hdr, NULL);
+ else
+ FlushOneBuffer(buf);
+ }
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buf);
+
+ if (BufferIsLocal(buf))
+ InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true);
+ else if (!EvictUnpinnedBuffer(buf))
+ elog(ERROR, "couldn't evict");
+ }
+ }
+
+ relation_close(rel, AccessExclusiveLock);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(buffer_create_toy);
+Datum
+buffer_create_toy(PG_FUNCTION_ARGS)
+{
+ Oid relid = PG_GETARG_OID(0);
+ BlockNumber blkno = PG_GETARG_UINT32(1);
+ Relation rel;
+ Buffer buf;
+
+ rel = relation_open(relid, AccessExclusiveLock);
+
+ buf = create_toy_buffer(rel, blkno);
+ ReleaseBuffer(buf);
+
+ relation_close(rel, NoLock);
+
+ PG_RETURN_INT32(buf);
+}
+
+PG_FUNCTION_INFO_V1(buffer_call_start_io);
+Datum
+buffer_call_start_io(PG_FUNCTION_ARGS)
+{
+ Buffer buf = PG_GETARG_INT32(0);
+ bool for_input = PG_GETARG_BOOL(1);
+ bool nowait = PG_GETARG_BOOL(2);
+ bool can_start;
+
+ if (BufferIsLocal(buf))
+ can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1),
+ for_input, nowait);
+ else
+ can_start = StartBufferIO(GetBufferDescriptor(buf - 1),
+ for_input, nowait);
+
+ /*
+ * For tests we don't want the resowner release preventing us from
+ * orchestrating odd scenarios.
+ */
+ if (can_start && !BufferIsLocal(buf))
+ ResourceOwnerForgetBufferIO(CurrentResourceOwner,
+ buf);
+
+ ereport(LOG,
+ errmsg("buffer %d after StartBufferIO: %s",
+ buf, DebugPrintBufferRefcount(buf)),
+ errhidestmt(true), errhidecontext(true));
+
+ PG_RETURN_BOOL(can_start);
+}
+
+PG_FUNCTION_INFO_V1(buffer_call_terminate_io);
+Datum
+buffer_call_terminate_io(PG_FUNCTION_ARGS)
+{
+ Buffer buf = PG_GETARG_INT32(0);
+ bool for_input = PG_GETARG_BOOL(1);
+ bool succeed = PG_GETARG_BOOL(2);
+ bool io_error = PG_GETARG_BOOL(3);
+ bool release_aio = PG_GETARG_BOOL(4);
+ bool clear_dirty = false;
+ uint32 set_flag_bits = 0;
+
+ if (io_error)
+ set_flag_bits |= BM_IO_ERROR;
+
+ if (for_input)
+ {
+ clear_dirty = false;
+
+ if (succeed)
+ set_flag_bits |= BM_VALID;
+ }
+ else
+ {
+ if (succeed)
+ clear_dirty = true;
+ }
+
+ ereport(LOG,
+ errmsg("buffer %d before Terminate[Local]BufferIO: %s",
+ buf, DebugPrintBufferRefcount(buf)),
+ errhidestmt(true), errhidecontext(true));
+
+ if (BufferIsLocal(buf))
+ TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1),
+ clear_dirty, set_flag_bits, release_aio);
+ else
+ TerminateBufferIO(GetBufferDescriptor(buf - 1),
+ clear_dirty, set_flag_bits, false, release_aio);
+
+ ereport(LOG,
+ errmsg("buffer %d after Terminate[Local]BufferIO: %s",
+ buf, DebugPrintBufferRefcount(buf)),
+ errhidestmt(true), errhidecontext(true));
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get);
+Datum
+handle_get(PG_FUNCTION_ARGS)
+{
+ last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_release_last);
+Datum
+handle_release_last(PG_FUNCTION_ARGS)
+{
+ if (!last_handle)
+ elog(ERROR, "no handle");
+
+ pgaio_io_release(last_handle);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_and_error);
+Datum
+handle_get_and_error(PG_FUNCTION_ARGS)
+{
+ pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+ elog(ERROR, "as you command");
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_twice);
+Datum
+handle_get_twice(PG_FUNCTION_ARGS)
+{
+ pgaio_io_acquire(CurrentResourceOwner, NULL);
+ pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_release);
+Datum
+handle_get_release(PG_FUNCTION_ARGS)
+{
+ PgAioHandle *handle;
+
+ handle = pgaio_io_acquire(CurrentResourceOwner, NULL);
+ pgaio_io_release(handle);
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_start);
+Datum
+batch_start(PG_FUNCTION_ARGS)
+{
+ pgaio_enter_batchmode();
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_end);
+Datum
+batch_end(PG_FUNCTION_ARGS)
+{
+ pgaio_exit_batchmode();
+ PG_RETURN_VOID();
+}
+
+#ifdef USE_INJECTION_POINTS
+extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data);
+extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data);
+
+void
+inj_io_short_read(const char *name, const void *private_data)
+{
+ PgAioHandle *ioh;
+
+ ereport(LOG,
+ errmsg("short read injection point called, is enabled: %d",
+ inj_io_error_state->enabled_reopen),
+ errhidestmt(true), errhidecontext(true));
+
+ if (inj_io_error_state->enabled_short_read)
+ {
+ ioh = pgaio_inj_io_get();
+
+ /*
+ * Only shorten reads that are actually longer than the target size,
+ * otherwise we can trigger over-reads.
+ */
+ if (inj_io_error_state->short_read_result_set
+ && ioh->op == PGAIO_OP_READV
+ && inj_io_error_state->short_read_result <= ioh->result)
+ {
+ struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off];
+ int32 old_result = ioh->result;
+ int32 new_result = inj_io_error_state->short_read_result;
+ int32 processed = 0;
+
+ ereport(LOG,
+ errmsg("short read inject point, changing result from %d to %d",
+ old_result, new_result),
+ errhidestmt(true), errhidecontext(true));
+
+ /*
+ * The underlying IO actually completed OK, and thus the "invalid"
+ * portion of the IOV actually contains valid data. That can hide
+ * a lot of problems, e.g. if we were to wrongly mark a buffer,
+ * that wasn't read according to the shortened-read, IO as valid,
+ * the contents would look valid and we might miss a bug.
+ *
+ * To avoid that, iterate through the IOV and zero out the
+ * "failed" portion of the IO.
+ */
+ for (int i = 0; i < ioh->op_data.read.iov_length; i++)
+ {
+ if (processed + iov[i].iov_len <= new_result)
+ processed += iov[i].iov_len;
+ else if (processed <= new_result)
+ {
+ uint32 ok_part = new_result - processed;
+
+ memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part);
+ processed += iov[i].iov_len;
+ }
+ else
+ {
+ memset((char *) iov[i].iov_base, 0, iov[i].iov_len);
+ }
+ }
+
+ ioh->result = new_result;
+ }
+ }
+}
+
+void
+inj_io_reopen(const char *name, const void *private_data)
+{
+ ereport(LOG,
+ errmsg("reopen injection point called, is enabled: %d",
+ inj_io_error_state->enabled_reopen),
+ errhidestmt(true), errhidecontext(true));
+
+ if (inj_io_error_state->enabled_reopen)
+ elog(ERROR, "injection point triggering failure to reopen ");
+}
+#endif
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_attach);
+Datum
+inj_io_short_read_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_short_read = true;
+ inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0);
+ if (inj_io_error_state->short_read_result_set)
+ inj_io_error_state->short_read_result = PG_GETARG_INT32(0);
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_detach);
+Datum
+inj_io_short_read_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_short_read = false;
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_attach);
+Datum
+inj_io_reopen_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_reopen = true;
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+
+ PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_detach);
+Datum
+inj_io_reopen_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ inj_io_error_state->enabled_reopen = false;
+#else
+ elog(ERROR, "injection points not supported");
+#endif
+ PG_RETURN_VOID();
+}