diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml index 091982f62ad5..55bc46849db5 100644 --- a/doc/src/sgml/ref/pg_combinebackup.sgml +++ b/doc/src/sgml/ref/pg_combinebackup.sgml @@ -137,6 +137,35 @@ PostgreSQL documentation + <varlistentry> + <term><option>-k</option></term> + <term><option>--link</option></term> + <listitem> + <para> + Use hard links instead of copying files to the synthetic backup. + Reconstruction of the synthetic backup might be faster (no file copying) + and use less disk space, but care must be taken when using the output + directory, because any modifications to that directory (for example, + starting the server) can also affect the input directories. Likewise, + changes to the input directories (for example, starting the server on + the full backup) could affect the output directory. Thus, this option + is best used when the input directories are only copies that will be + removed after <application>pg_combinebackup</application> has completed. + </para> + + <para> + Requires that the input backups and the output directory are in the + same file system. + </para> + + <para> + If a backup manifest is not available or does not contain a checksum of + the right type, hard links will still be created, but the file will + also be read block-by-block for the checksum calculation. + </para> + </listitem> + </varlistentry> + @@ -167,7 +196,8 @@ PostgreSQL documentation Perform regular file copy. This is the default. (See also - <option>--copy-file-range</option> and <option>--clone</option>.) + <option>--copy-file-range</option>, <option>--clone</option>, and + <option>-k</option>/<option>--link</option>.) diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c index 4e27814839c4..97ecda5a66dd 100644 --- a/src/bin/pg_combinebackup/copy_file.c +++ b/src/bin/pg_combinebackup/copy_file.c @@ -40,6 +40,9 @@ static void copy_file_copyfile(const char *src, const char *dst, pg_checksum_context *checksum_ctx); #endif +static void copy_file_link(const char *src, const char *dest, + pg_checksum_context *checksum_ctx); + /* * Copy a regular file, optionally computing a checksum, and emitting * appropriate debug messages. 
But if we're in dry-run mode, then just emit @@ -69,7 +72,13 @@ copy_file(const char *src, const char *dst, } #ifdef WIN32 - copy_method = COPY_METHOD_COPYFILE; + /* + * We have no specific switch to enable CopyFile on Windows, because + * it's supported (as far as we know) on all Windows machines. So, + * automatically enable it unless some other strategy was selected. + */ + if (copy_method == COPY_METHOD_COPY) + copy_method = COPY_METHOD_COPYFILE; #endif /* Determine the name of the copy strategy for use in log messages. */ @@ -93,6 +102,10 @@ copy_file(const char *src, const char *dst, strategy_implementation = copy_file_copyfile; break; #endif + case COPY_METHOD_LINK: + strategy_name = "link"; + strategy_implementation = copy_file_link; + break; } if (dry_run) @@ -304,3 +317,21 @@ copy_file_copyfile(const char *src, const char *dst, checksum_file(src, checksum_ctx); } #endif /* WIN32 */ + +/* + * copy_file_link + * Creates dest as a hard link to src; exits via pg_fatal() if link() fails. + * + * If needed, also reads src (not dest) to calculate the checksum. 
+ */ +static void +copy_file_link(const char *src, const char *dest, + pg_checksum_context *checksum_ctx) +{ + if (link(src, dest) < 0) + pg_fatal("error while linking file from \"%s\" to \"%s\": %m", + src, dest); + + /* if needed, calculate checksum of the file */ + checksum_file(src, checksum_ctx); +} diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h index 92f104115bbc..5a8517629c72 100644 --- a/src/bin/pg_combinebackup/copy_file.h +++ b/src/bin/pg_combinebackup/copy_file.h @@ -25,6 +25,7 @@ typedef enum CopyMethod #ifdef WIN32 COPY_METHOD_COPYFILE, #endif + COPY_METHOD_LINK, } CopyMethod; extern void copy_file(const char *src, const char *dst, diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build index 0c4fd9e62702..e80a4756a7f4 100644 --- a/src/bin/pg_combinebackup/meson.build +++ b/src/bin/pg_combinebackup/meson.build @@ -37,6 +37,7 @@ tests += { 't/007_wal_level_minimal.pl', 't/008_promote.pl', 't/009_no_full_file.pl', + 't/010_hardlink.pl', ], } } diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 5864ec574fb6..d480dc74436e 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -135,6 +135,7 @@ main(int argc, char *argv[]) {"no-sync", no_argument, NULL, 'N'}, {"output", required_argument, NULL, 'o'}, {"tablespace-mapping", required_argument, NULL, 'T'}, + {"link", no_argument, NULL, 'k'}, {"manifest-checksums", required_argument, NULL, 1}, {"no-manifest", no_argument, NULL, 2}, {"sync-method", required_argument, NULL, 3}, @@ -172,7 +173,7 @@ main(int argc, char *argv[]) opt.copy_method = COPY_METHOD_COPY; /* process command-line options */ - while ((c = getopt_long(argc, argv, "dnNo:T:", + while ((c = getopt_long(argc, argv, "dknNo:T:", long_options, &optindex)) != -1) { switch (c) @@ -181,6 +182,9 @@ main(int argc, char *argv[]) opt.debug = true; 
pg_logging_increase_verbosity(); break; + case 'k': + opt.copy_method = COPY_METHOD_LINK; + break; case 'n': opt.dry_run = true; break; @@ -424,6 +428,11 @@ main(int argc, char *argv[]) } } + /* Warn that link mode ties output to input; a dry run has linked nothing */ + if (opt.copy_method == COPY_METHOD_LINK && !opt.dry_run) + pg_log_warning("--link mode was used; any modifications to the output " + "directory may destructively modify input directories"); + /* It's a success, so don't remove the output directories. */ reset_directory_cleanup_list(); exit(0); @@ -761,6 +770,7 @@ help(const char *progname) printf(_(" %s [OPTION]... DIRECTORY...\n"), progname); printf(_("\nOptions:\n")); printf(_(" -d, --debug generate lots of debugging output\n")); + printf(_(" -k, --link link files instead of copying\n")); printf(_(" -n, --dry-run do not actually do anything\n")); printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); printf(_(" -o, --output=DIRECTORY output directory\n")); diff --git a/src/bin/pg_combinebackup/t/010_hardlink.pl b/src/bin/pg_combinebackup/t/010_hardlink.pl new file mode 100644 index 000000000000..a0ee419090cf --- /dev/null +++ b/src/bin/pg_combinebackup/t/010_hardlink.pl @@ -0,0 +1,169 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# This test aims to validate that hard links are created as expected in the +# output directory, when running pg_combinebackup with --link mode. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Set up a new database instance. +my $primary = PostgreSQL::Test::Cluster->new('primary'); +$primary->init(has_archiving => 1, allows_streaming => 1); +$primary->append_conf('postgresql.conf', 'summarize_wal = on'); +# We disable autovacuum to prevent "something else" from modifying our test tables. +$primary->append_conf('postgresql.conf', 'autovacuum = off'); +$primary->start; + +# Create a couple of tables (~264KB each). 
+# Note: Cirrus CI runs some tests with a very small segment size, so, in that +# environment, a single table of 264KB would have both a segment with a link +# count of 1 and also one with a link count of 2. But in a normal installation, +# segment size is 1GB. Therefore, we use 2 different tables here: for test_1, +# all segments (or the only one) will have two hard links; for test_2, the +# last segment (or the only one) will have 1 hard link, and any others will +# have 2. +my $query = <<'EOM'; +CREATE TABLE test_%s AS + SELECT x.id::bigint, + repeat('a', 1600) AS value + FROM generate_series(1, 100) AS x(id); +EOM + +$primary->safe_psql('postgres', sprintf($query, '1')); +$primary->safe_psql('postgres', sprintf($query, '2')); + +# Fetch information about the data files. +$query = <<'EOM'; +SELECT pg_relation_filepath(oid) +FROM pg_class +WHERE relname = 'test_%s'; +EOM + +my $test_1_path = $primary->safe_psql('postgres', sprintf($query, '1')); +note "test_1 path is $test_1_path"; + +my $test_2_path = $primary->safe_psql('postgres', sprintf($query, '2')); +note "test_2 path is $test_2_path"; + +# Take a full backup. +my $backup1path = $primary->backup_dir . '/backup1'; +$primary->command_ok( + [ + 'pg_basebackup', + '--pgdata' => $backup1path, + '--no-sync', + '--checkpoint' => 'fast', + '--wal-method' => 'none' + ], + "full backup"); + +# Perform an insert that touches a page of the last segment of the data file of +# table test_2. +$primary->safe_psql('postgres', <<EOM); +INSERT INTO test_2 (id, value) VALUES (101, repeat('a', 1600)); +EOM + +# Take an incremental backup. +my $backup2path = $primary->backup_dir . '/backup2'; +$primary->command_ok( + [ + 'pg_basebackup', + '--pgdata' => $backup2path, + '--no-sync', + '--checkpoint' => 'fast', + '--wal-method' => 'none', + '--incremental' => $backup1path . '/backup_manifest' + ], + "incremental backup"); + +# Restore the incremental backup and use it to create a new node. 
+my $restore = PostgreSQL::Test::Cluster->new('restore'); +$restore->init_from_backup( + $primary, 'backup2', + combine_with_prior => ['backup1'], + combine_mode => '--link'); + +# Ensure files have the expected count of hard links. We expect all data files +# from test_1 to contain 2 hard links, because they were not touched between the +# full and incremental backups, and the last data file of table test_2 to +# contain a single hard link because of changes in its last page. +my $test_1_full_path = join('/', $restore->data_dir, $test_1_path); +check_data_file($test_1_full_path, 2); + +my $test_2_full_path = join('/', $restore->data_dir, $test_2_path); +check_data_file($test_2_full_path, 1); + +# OK, that's all. +done_testing(); + + +# Given the path to the first segment of a data file, probe for its numbered +# sibling segments (<path>.1, <path>.2, ...) and make sure every segment except +# the last has 2 hard links. The last one must have the given number of hard +# links. +# +# Parameters: +# * data_file: path to the first segment of a data file, as per the output of +# pg_relation_filepath. +# * last_segment_nlinks: the number of hard links expected in the last segment +# of the given data file. +sub check_data_file +{ + my ($data_file, $last_segment_nlinks) = @_; + + my @data_file_segments = ($data_file); + + # Probe for additional segments, starting at suffix .1 + my $segment_number = 1; + + while (1) + { + my $next_segment = $data_file . '.' . $segment_number; + + # If the file exists and is a regular file, add it to the list + if (-f $next_segment) + { + push @data_file_segments, $next_segment; + $segment_number++; + } + # Stop at the first suffix for which no regular file exists + else + { + last; + } + } + + # All segments of the given data file should contain 2 hard links, except + # for the last one, which should match the given number of links. 
+ my $last_segment = pop @data_file_segments; + + for my $segment (@data_file_segments) + { + # Every segment before the last one must have exactly 2 hard links + my $nlink_count = get_hard_link_count($segment); + ok($nlink_count == 2, "File '$segment' has 2 hard links"); + } + + # The last segment must have the caller-supplied number of hard links + my $nlink_count = get_hard_link_count($last_segment); + ok($nlink_count == $last_segment_nlinks, + "File '$last_segment' has $last_segment_nlinks hard link(s)"); +} + + +# Subroutine to get the hard link count of a given file. +# Receives the path to a file, and returns the nlink field (element 3 of +# the list returned by stat) of that file. +sub get_hard_link_count +{ + my ($file) = @_; + + # Get file stats + my @stats = stat($file); + my $nlink = $stats[3]; # Number of hard links + + return $nlink; +}