From: glyn Date: Thu, 10 Dec 2015 14:02:12 +0000 (+0000) Subject: Add check_replication_slots check to check the delay on any replication slots. X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=a3ea5253a6e657e8500d758b95e9cc181509a8fd;p=check_postgres.git Add check_replication_slots check to check the delay on any replication slots. "Delay" is measured as size of transaction logs retained for the slot e.g: check_postgres_replication_slots -db=TEST -H=192.168.0.106 -warning=32M -critical=64M --- diff --git a/check_postgres.pl b/check_postgres.pl index 61a1d883d..7bb2835a8 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -187,6 +187,8 @@ our %msg = ( 'no-match-set' => q{No matching settings found due to exclusion/inclusion options}, 'no-match-table' => q{No matching tables found due to exclusion/inclusion options}, 'no-match-user' => q{No matching entries found due to user exclusion/inclusion options}, + 'no-match-slot' => q{No matching replication slots found due to exclusion/inclusion options}, + 'no-match-slotok' => q{No replication slots found}, 'no-parse-psql' => q{Could not parse psql output!}, 'no-time-hires' => q{Cannot find Time::HiRes, needed if 'showtime' is true}, 'opt-output-invalid' => q{Invalid output: must be 'nagios' or 'mrtg' or 'simple' or 'cacti'}, @@ -259,6 +261,7 @@ our %msg = ( 'rep-timeout' => q{Row was not replicated. 
Timeout: $1}, 'rep-unknown' => q{Replication check failed}, 'rep-wrongvals' => q{Cannot test replication: values are not the right ones ('$1' not '$2' nor '$3')}, + 'repslot-version' => q{Database must be version 9.4 or higher to check replication slots}, 'runcommand-err' => q{Unknown error inside of the "run_command" function}, 'runcommand-nodb' => q{No target databases could be found}, 'runcommand-nodupe' => q{Could not dupe STDERR}, @@ -444,6 +447,8 @@ our %msg = ( 'no-match-set' => q{No se encuentran opciones de configuración coincidentes debido a las opciones de exclusión/inclusión}, 'no-match-table' => q{No se encuentran tablas coincidentes debido a las opciones de exclusión/inclusión}, 'no-match-user' => q{No se encuentran entradas coincidentes debido a las opciones de exclusión/inclusión}, + 'no-match-slot' => q{No se encuentran ranuras de replicación coincidentes debido a las opciones de exclusión/inclusión}, + 'no-match-slotok' => q{No se encuentran ranuras de replicación}, 'no-parse-psql' => q{No se pudo interpretar la salida de psql!}, 'no-time-hires' => q{No se encontró Time::HiRes, necesario si 'showtime' es verdadero}, 'opt-output-invalid' => q{Formato de salida inválido: debe ser 'nagios' o 'mrtg' o 'simple' o 'cacti'}, @@ -515,6 +520,7 @@ our %msg = ( 'rep-timeout' => q{La fila no fue replicada. 
Timeout: $1}, 'rep-unknown' => q{Chequeo de replicación fallido}, 'rep-wrongvals' => q{No puedo verificar la replicación: los valores no son correctos ('$1' no '$2' ni '$3')}, + 'repslot-version' => q{La base de datos debe ser version 9.4 o superior para ver las ranuras de replicación}, 'runcommand-err' => q{Error desconocido en la función "run_command"}, 'runcommand-nodb' => q{No se encontró ninguna base de datos buscada}, 'runcommand-nodupe' => q{No fue posible duplicar STDERR}, @@ -700,6 +706,8 @@ our %msg = ( 'no-match-set' => q{Aucun paramètre trouvé à cause des options d'exclusion/inclusion}, 'no-match-table' => q{Aucune table trouvée à cause des options d'exclusion/inclusion}, 'no-match-user' => q{Aucune entrée trouvée à cause options d'exclusion/inclusion}, + 'no-match-slot' => q{Aucune fentes de réplication trouvée à cause options d'exclusion/inclusion}, + 'no-match-slotok' => q{Pas de fentes de réplication trouvé}, 'no-parse-psql' => q{N'a pas pu analyser la sortie de psql !}, 'no-time-hires' => q{N'a pas trouvé le module Time::HiRes, nécessaire quand « showtime » est activé}, 'opt-output-invalid' => q{Sortie invalide : doit être 'nagios' ou 'mrtg' ou 'simple' ou 'cacti'}, @@ -771,6 +779,7 @@ our %msg = ( 'rep-timeout' => q{La ligne n'a pas été répliquée. 
Délai dépassé : $1}, 'rep-unknown' => q{Échec du test de la réplication}, 'rep-wrongvals' => q{Ne peut pas tester la réplication : les valeurs ne sont pas les bonnes (ni '$1' ni '$2' ni '$3')}, + 'repslot-version' => q{Base de données doit être la version 9.4 ou ultérieure pour vérifier fentes de réplication}, 'runcommand-err' => q{Erreur inconnue de la fonction « run_command »}, 'runcommand-nodb' => q{Aucune base de données cible trouvée}, 'runcommand-nodupe' => q{N'a pas pu dupliqué STDERR}, @@ -1438,6 +1447,7 @@ our $action_info = { query_runtime => [0, 'Check how long a specific query takes to run.'], query_time => [1, 'Checks the maximum running time of current queries.'], replicate_row => [0, 'Verify a simple update gets replicated to another server.'], + replication_slots => [1, 'Check the replication delay for replication slots'], same_schema => [0, 'Verify that two databases have the exact same tables, columns, etc.'], sequence => [0, 'Checks remaining calls left in sequences.'], settings_checksum => [0, 'Check that no settings have changed since the last check.'], @@ -2013,6 +2023,7 @@ our %testaction = ( fsm_pages => 'VERSION: 8.2 MAX: 8.3', fsm_relations => 'VERSION: 8.2 MAX: 8.3', hot_standby_delay => 'VERSION: 9.0', + replication_slots => 'VERSION: 9.4', listener => 'MAX: 8.4', ); if ($opt{test}) { @@ -2208,6 +2219,9 @@ check_archive_ready() if $action eq 'archive_ready'; ## Check the replication delay in hot standby setup check_hot_standby_delay() if $action eq 'hot_standby_delay'; +## Check the delay on replication slots. 
warning and critical are sizes
+check_replication_slots() if $action eq 'replication_slots';
+
 ## Check the maximum transaction age of all connections
 check_txn_time() if $action eq 'txn_time';
 
@@ -5157,6 +5171,104 @@ sub check_hot_standby_delay {
 
 } ## end of check_hot_standby_delay
 
+sub check_replication_slots {
+
+    ## Check the delay on one or more replication slots
+    ## "Delay" is the number of bytes of WAL retained for the slot:
+    ## the distance from the slot's restart_lsn to the current WAL position
+    ## Supports: Nagios, MRTG
+    ## mrtg reports the single largest delay
+    ## By default, checks all replication slots
+    ## Can check specific one(s) with include
+    ## Can ignore some with exclude
+    ## Warning and critical are bytes
+    ## Valid units: b, k, m, g, t, e
+    ## All above may be written as plural or with a trailing 'b'
+    ## NOTE(review): pg_xlog_location_diff() and pg_current_xlog_location() were
+    ## renamed in PostgreSQL 10 - confirm the versions this must support
+
+    my ($warning, $critical) = validate_range({type => 'size'});
+
+    ## No trailing semicolon here: an ORDER BY / LIMIT clause may be appended below
+    $SQL = qq{
+        WITH slots AS (SELECT slot_name,
+            slot_type,
+            coalesce(restart_lsn, '0/0'::pg_lsn) AS slot_lsn,
+            coalesce(pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn),0) AS delta,
+            active
+        FROM pg_replication_slots)
+        SELECT *, pg_size_pretty(delta) AS delta_pretty FROM slots
+    };
+
+    ## Limit the output to the slots retaining the most WAL
+    if ($opt{perflimit}) {
+        $SQL .= " ORDER BY delta DESC LIMIT $opt{perflimit}";
+    }
+
+    my $info = run_command($SQL, { regex => qr{\d+}, emptyok => 1, } );
+    my $found = 0;
+
+    for $db (@{$info->{db}}) {
+        ## $max is the largest delay seen, or a sentinel when no slot was checked:
+        ## -1 means no slots exist, -2 means every slot was filtered out
+        my $max = -1;
+        my $maxslot = '';
+        $found = 1;
+        my %s;
+
+        for my $r (@{$db->{slurp}}) {
+            if (skip_item($r->{slot_name})) {
+                $max = -2 if ($max == -1 );
+                next;
+            }
+            if ($r->{delta} >= $max) {
+                $max = $r->{delta};
+                $maxslot = $r->{slot_name};
+            }
+            $s{$r->{slot_name}} = [$r->{delta},$r->{delta_pretty},$r->{slot_type},$r->{slot_lsn},$r->{active}];
+        }
+        if ($MRTG) {
+            ## Report the slot with the largest delay; never leak the negative sentinels
+            do_mrtg({one => ($max < 0 ? 0 : $max), msg => "SLOT: $maxslot"});
+        }
+        if ($max < 0) {
+            $stats{$db->{dbname}} = 0;
+            add_ok msg('no-match-slotok') if ($max == -1);
+            add_unknown msg('no-match-slot') if ($max == -2);
+            next;
+        }
+
+        ## List the slots largest delay first, ties broken by name
+        my $msg = '';
+        for my $slot (sort {$s{$b}[0] <=> $s{$a}[0] or $a cmp $b } keys %s) {
+            my ($delta, $pretty, $type, $lsn, $active) = @{$s{$slot}};
+            $msg .= "$slot: $pretty ($type $lsn " . ($active eq 't'?'active':'inactive') .") ";
+            $db->{perf} .= sprintf ' %s=%s;%s;%s',
+                perfname($slot), $delta, $warning, $critical;
+        }
+        if (length $critical and $max >= $critical) {
+            add_critical $msg;
+        }
+        elsif (length $warning and $max >= $warning) {
+            add_warning $msg;
+        }
+        else {
+            add_ok $msg;
+        }
+    }
+
+    ## If no results, probably a version problem
+    if (!$found and keys %unknown) {
+        (my $first) = values %unknown;
+        if ($first->[0][0] =~ /pg_replication_slots/) {
+            ndie msg('repslot-version');
+        }
+    }
+
+    return;
+
+} ## end of check_replication_slots
+
 sub check_last_analyze {
 
     my $auto = shift || '';
@@ -9642,6 +9754,18 @@ For MRTG output, returns on the first line the time in seconds the replication t
 The maximum time is set to 4 minutes 30 seconds: if no replication has taken place in that long a time, an error
 is thrown.
 
+=head2 B<replication_slots>
+
+(C<symlink: check_postgres_replication_slots>) Check the quantity of WAL retained for any replication
+slots in the target database cluster. This is handy for monitoring environments where all WAL archiving
+and replication is taking place over replication slots.
+
+Warning and critical are total bytes retained for the slot. 
E.g:
+
+  check_postgres_replication_slots --port=5432 --host=yellow --warning=32M --critical=64M
+
+Specific named slots can be monitored using --include/--exclude
+
 =head2 B<same_schema>
 
 (C<symlink: check_postgres_same_schema>) Verifies that two or more databases are identical as far as their
diff --git a/t/02_replication_slots.t b/t/02_replication_slots.t
new file mode 100644
index 000000000..1158a127f
--- /dev/null
+++ b/t/02_replication_slots.t
@@ -0,0 +1,129 @@
+#!perl
+
+## Test the "replication_slots" action
+
+use 5.006;
+use strict;
+use warnings;
+use Data::Dumper;
+use Test::More tests => 20;
+use lib 't','.';
+use CP_Testing;
+
+use vars qw/$dbh $result $t $port $host $dbname/;
+
+my $cp = CP_Testing->new( {default_action => 'replication_slots'} );
+
+$dbh = $cp->test_database_handle();
+$dbh->{AutoCommit} = 1;
+$port = $cp->get_port();
+$host = $cp->get_host();
+$dbname = $cp->get_dbname;
+
+diag "Connected as $port:$host:$dbname\n";
+
+## Replication slots only exist in PostgreSQL 9.4 and later
+if ($dbh->{pg_server_version} < 90400) {
+    SKIP: {
+        skip 'Replication slots require PostgreSQL 9.4 or later', 20;
+    }
+    exit;
+}
+
+my $S = q{Action 'replication_slots'};
+my $label = 'POSTGRES_REPLICATION_SLOTS';
+
+$t = qq{$S self-identifies correctly};
+$result = $cp->run(q{-w 0});
+like ($result, qr{^$label}, $t);
+
+$t = qq{$S identifies host};
+like ($result, qr{host:$host}, $t);
+
+$t = qq{$S reports no replication slots};
+like ($result, qr{No replication slots found}, $t);
+
+$t = qq{$S accepts valid -w input};
+for my $arg (
+    '1 MB',
+    '1 GB',
+    ) {
+    like ($cp->run(qq{-w "$arg"}), qr{^$label}, "$t ($arg)");
+}
+
+$t = qq{$S rejects invalid -w input};
+for my $arg (
+    '-1 MB',
+    'abc'
+    ) {
+    like ($cp->run(qq{-w "$arg"}), qr{^ERROR: Invalid size}, "$t ($arg)");
+}
+
+## A newly created physical slot has reserved no WAL yet, so it can only report OK
+$dbh->do ("SELECT * FROM pg_create_physical_replication_slot('cp_testing_slot')");
+
+$t = qq{$S reports physical replication slots};
+$result = $cp->run(q{-w 0});
+like ($result, qr{cp_testing_slot.*physical}, $t);
+
+$t=qq{$S reports ok on physical replication slots when warning level is specified and not exceeded};
+$result = $cp->run(q{-w 1MB});
+like ($result, qr{^$label OK:}, $t);
+
+$t=qq{$S reports ok on physical replication slots when critical level is specified and not exceeded};
+$result = $cp->run(q{-c 1MB});
+like ($result, qr{^$label OK:}, $t);
+
+$dbh->do ("SELECT pg_drop_replication_slot('cp_testing_slot')");
+
+# To do more tests on physical slots we'd actually have to kick off some activity by performing a connection to them (.. use pg_receivexlog or similar??)
+
+$dbh->do ("SELECT * FROM pg_create_logical_replication_slot('cp_testing_slot', 'test_decoding')");
+
+$t = qq{$S reports logical replication slots};
+$result = $cp->run(q{-w 0});
+like ($result, qr{cp_testing_slot.*logical}, $t);
+
+$t=qq{$S reports ok on logical replication slots when warning level is specified and not exceeded};
+$result = $cp->run(q{-w 1MB});
+like ($result, qr{^$label OK:}, $t);
+
+$t=qq{$S reports ok on logical replication slots when critical level is specified and not exceeded};
+$result = $cp->run(q{-c 1MB});
+like ($result, qr{^$label OK:}, $t);
+
+## Write 1024 rows of roughly 1kB each so the logical slot retains more than 1MB of WAL
+$dbh->do ("CREATE TABLE cp_testing_table (a text); INSERT INTO cp_testing_table SELECT a || repeat('A',1024) FROM generate_series(1,1024) a; DROP TABLE cp_testing_table;");
+
+
+$t=qq{$S reports warning on logical replication slots when warning level is specified and is exceeded};
+$result = $cp->run(q{-w 1MB});
+like ($result, qr{^$label WARNING:}, $t);
+
+$t=qq{$S reports critical on logical replication slots when critical level is specified and is exceeded};
+$result = $cp->run(q{-c 1MB});
+like ($result, qr{^$label CRITICAL:}, $t);
+
+$t=qq{$S works when include has valid replication slot};
+$result = $cp->run(q{-w 1MB --include=cp_testing_slot});
+like ($result, qr{^$label WARNING:.*cp_testing_slot}, $t);
+
+$t=qq{$S works when include matches no replication slots};
+$result = $cp->run(q{-w 1MB --include=foobar});
+like ($result, qr{^$label UNKNOWN:.*No matching replication slots}, $t);
+
+$t=qq{$S returns correct performance data with include};
+$result = $cp->run(q{-w 1MB --include=cp_testing_slot});
+like ($result, qr{ \| time=\d\.\d\ds cp_testing_slot=\d+}, $t);
+
+$t=qq{$S works when exclude excludes no replication slots};
+$result = $cp->run(q{-w 10MB --exclude=foobar});
+like ($result, qr{^$label OK:.*cp_testing_slot}, $t);
+
+$t=qq{$S works when exclude excludes all replication slots};
+$result = $cp->run(q{-w 10MB --exclude=cp_testing_slot});
+like ($result, qr{^$label UNKNOWN:.*No matching replication slots}, $t);
+
+$dbh->do ("SELECT pg_drop_replication_slot('cp_testing_slot')");
+
+exit;
diff --git a/t/CP_Testing.pm b/t/CP_Testing.pm
index 95ed1e0fd..7d4b6e9a3 100644
--- a/t/CP_Testing.pm
+++ b/t/CP_Testing.pm
@@ -143,6 +143,13 @@ sub test_database_handle {
             print $cfh qq{max_fsm_pages = 99999\n};
         }
 
+        ## >= 9.4
+        if ($imaj > 9 or ($imaj==9 and $imin >= 4)) {
+            print $cfh qq{max_replication_slots = 2\n};
+            print $cfh qq{wal_level = logical\n};
+            print $cfh qq{max_wal_senders = 2\n};
+        }
+
         print $cfh "\n";
         close $cfh or die qq{Could not close "$cfile": $!\n};
 