Kill dead-end children when there's nothing else left
Previously, the postmaster would never try to kill dead-end child processes, even if there were no other processes left. A dead-end backend will eventually exit, when authentication_timeout expires, but if a dead-end backend is the only thing that's preventing the server from shutting down, it seems better to kill it immediately. It's particularly important, if there was a bug in the early startup code that prevented a dead-end child from timing out and exiting normally. Includes a test for that case where a dead-end backend previously prevented the server from shutting down. Reviewed-by: Andres Freund <andres@anarazel.de> Discussion: https://www.postgresql.org/message-id/a102f15f-eac4-4ff2-af02-f9ff209ec66f@iki.fi
This commit is contained in:
parent
18d67a8d7d
commit
bb861414fe
@ -2985,10 +2985,11 @@ PostmasterStateMachine(void)
|
|||||||
if (Shutdown >= ImmediateShutdown || FatalError)
|
if (Shutdown >= ImmediateShutdown || FatalError)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Start waiting for dead_end children to die. This state
|
* Stop any dead_end children and stop creating new ones.
|
||||||
* change causes ServerLoop to stop creating new ones.
|
|
||||||
*/
|
*/
|
||||||
pmState = PM_WAIT_DEAD_END;
|
pmState = PM_WAIT_DEAD_END;
|
||||||
|
ConfigurePostmasterWaitSet(false);
|
||||||
|
SignalChildren(SIGQUIT, btmask(B_DEAD_END_BACKEND));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We already SIGQUIT'd the archiver and stats processes, if
|
* We already SIGQUIT'd the archiver and stats processes, if
|
||||||
@ -3027,9 +3028,10 @@ PostmasterStateMachine(void)
|
|||||||
*/
|
*/
|
||||||
FatalError = true;
|
FatalError = true;
|
||||||
pmState = PM_WAIT_DEAD_END;
|
pmState = PM_WAIT_DEAD_END;
|
||||||
|
ConfigurePostmasterWaitSet(false);
|
||||||
|
|
||||||
/* Kill the walsenders and archiver too */
|
/* Kill the walsenders and archiver too */
|
||||||
SignalChildren(SIGQUIT, btmask_all_except(B_DEAD_END_BACKEND));
|
SignalChildren(SIGQUIT, BTYPE_MASK_ALL);
|
||||||
if (PgArchPID != 0)
|
if (PgArchPID != 0)
|
||||||
signal_child(PgArchPID, SIGQUIT);
|
signal_child(PgArchPID, SIGQUIT);
|
||||||
}
|
}
|
||||||
@ -3048,14 +3050,13 @@ PostmasterStateMachine(void)
|
|||||||
if (PgArchPID == 0 && CountChildren(btmask_all_except(B_DEAD_END_BACKEND)) == 0)
|
if (PgArchPID == 0 && CountChildren(btmask_all_except(B_DEAD_END_BACKEND)) == 0)
|
||||||
{
|
{
|
||||||
pmState = PM_WAIT_DEAD_END;
|
pmState = PM_WAIT_DEAD_END;
|
||||||
|
ConfigurePostmasterWaitSet(false);
|
||||||
|
SignalChildren(SIGTERM, BTYPE_MASK_ALL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pmState == PM_WAIT_DEAD_END)
|
if (pmState == PM_WAIT_DEAD_END)
|
||||||
{
|
{
|
||||||
/* Don't allow any new socket connection events. */
|
|
||||||
ConfigurePostmasterWaitSet(false);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
|
* PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
|
||||||
* (ie, no dead_end children remain), and the archiver is gone too.
|
* (ie, no dead_end children remain), and the archiver is gone too.
|
||||||
@ -3381,12 +3382,12 @@ SignalChildren(int signal, BackendTypeMask targetMask)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Send a termination signal to children. This considers all of our children
|
* Send a termination signal to children. This considers all of our children
|
||||||
* processes, except syslogger and dead_end backends.
|
* processes, except syslogger.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
TerminateChildren(int signal)
|
TerminateChildren(int signal)
|
||||||
{
|
{
|
||||||
SignalChildren(signal, btmask_all_except(B_DEAD_END_BACKEND));
|
SignalChildren(signal, BTYPE_MASK_ALL);
|
||||||
if (StartupPID != 0)
|
if (StartupPID != 0)
|
||||||
{
|
{
|
||||||
signal_child(StartupPID, signal);
|
signal_child(StartupPID, signal);
|
||||||
|
@ -1194,6 +1194,9 @@ this to fail. Otherwise, tests might fail to detect server crashes.
|
|||||||
With optional extra param fail_ok => 1, returns 0 for failure
|
With optional extra param fail_ok => 1, returns 0 for failure
|
||||||
instead of bailing out.
|
instead of bailing out.
|
||||||
|
|
||||||
|
The optional extra param timeout can be used to pass the pg_ctl
|
||||||
|
--timeout option.
|
||||||
|
|
||||||
=cut
|
=cut
|
||||||
|
|
||||||
sub stop
|
sub stop
|
||||||
@ -1209,8 +1212,11 @@ sub stop
|
|||||||
return 1 unless defined $self->{_pid};
|
return 1 unless defined $self->{_pid};
|
||||||
|
|
||||||
print "### Stopping node \"$name\" using mode $mode\n";
|
print "### Stopping node \"$name\" using mode $mode\n";
|
||||||
$ret = PostgreSQL::Test::Utils::system_log('pg_ctl', '-D', $pgdata,
|
my @cmd = ('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
|
||||||
'-m', $mode, 'stop');
|
if ($params{timeout}) {
|
||||||
|
push(@cmd, ('--timeout', $params{timeout}));
|
||||||
|
}
|
||||||
|
$ret = PostgreSQL::Test::Utils::system_log(@cmd);
|
||||||
|
|
||||||
if ($ret != 0)
|
if ($ret != 0)
|
||||||
{
|
{
|
||||||
|
@ -7,6 +7,7 @@ tests += {
|
|||||||
'tap': {
|
'tap': {
|
||||||
'tests': [
|
'tests': [
|
||||||
't/001_connection_limits.pl',
|
't/001_connection_limits.pl',
|
||||||
|
't/002_start_stop.pl',
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
98
src/test/postmaster/t/002_start_stop.pl
Normal file
98
src/test/postmaster/t/002_start_stop.pl
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
|
||||||
|
# Copyright (c) 2021-2024, PostgreSQL Global Development Group
|
||||||
|
|
||||||
|
# Test postmaster start and stop state machine.
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings FATAL => 'all';
|
||||||
|
use PostgreSQL::Test::Cluster;
|
||||||
|
use PostgreSQL::Test::Utils;
|
||||||
|
use Test::More;
|
||||||
|
|
||||||
|
#
|
||||||
|
# Test that dead-end backends don't prevent the server from shutting
|
||||||
|
# down.
|
||||||
|
#
|
||||||
|
# Dead-end backends can linger until they reach
|
||||||
|
# authentication_timeout. We use a long authentication_timeout and a
|
||||||
|
# much shorter timeout for the "pg_ctl stop" operation, to test that
|
||||||
|
# if dead-end backends are killed at fast shut down. If they're not,
|
||||||
|
# "pg_ctl stop" will error out before the authentication timeout kicks
|
||||||
|
# in and cleans up the dead-end backends.
|
||||||
|
my $authentication_timeout = $PostgreSQL::Test::Utils::timeout_default;
|
||||||
|
my $stop_timeout = $authentication_timeout / 2;
|
||||||
|
|
||||||
|
# Initialize the server with low connection limits, to test dead-end backends
|
||||||
|
my $node = PostgreSQL::Test::Cluster->new('main');
|
||||||
|
$node->init;
|
||||||
|
$node->append_conf('postgresql.conf', "max_connections = 5");
|
||||||
|
$node->append_conf('postgresql.conf', "max_wal_senders = 0");
|
||||||
|
$node->append_conf('postgresql.conf', "autovacuum_max_workers = 1");
|
||||||
|
$node->append_conf('postgresql.conf', "max_worker_processes = 1");
|
||||||
|
$node->append_conf('postgresql.conf', "log_connections = on");
|
||||||
|
$node->append_conf('postgresql.conf', "log_min_messages = debug2");
|
||||||
|
$node->append_conf('postgresql.conf',
|
||||||
|
"authentication_timeout = '$authentication_timeout s'");
|
||||||
|
$node->append_conf('postgresql.conf', 'trace_connection_negotiation=on');
|
||||||
|
$node->start;
|
||||||
|
|
||||||
|
if (!$node->raw_connect_works())
|
||||||
|
{
|
||||||
|
plan skip_all => "this test requires working raw_connect()";
|
||||||
|
}
|
||||||
|
|
||||||
|
my @raw_connections = ();
|
||||||
|
|
||||||
|
# Open a lot of TCP (or Unix domain socket) connections to use up all
|
||||||
|
# the connection slots. Beyond a certain number (roughly 2x
|
||||||
|
# max_connections), they will be "dead-end backends".
|
||||||
|
for (my $i = 0; $i <= 20; $i++)
|
||||||
|
{
|
||||||
|
my $sock = $node->raw_connect();
|
||||||
|
|
||||||
|
# On a busy system, the server might reject connections if
|
||||||
|
# postmaster cannot accept() them fast enough. The exact limit
|
||||||
|
# and behavior depends on the platform. To make this reliable,
|
||||||
|
# we attempt SSL negotiation on each connection before opening
|
||||||
|
# next one. The server will reject the SSL negotations, but
|
||||||
|
# when it does so, we know that the backend has been launched
|
||||||
|
# and we should be able to open another connection.
|
||||||
|
|
||||||
|
# SSLRequest packet consists of packet length followed by
|
||||||
|
# NEGOTIATE_SSL_CODE.
|
||||||
|
my $negotiate_ssl_code = pack("Nnn", 8, 1234, 5679);
|
||||||
|
my $sent = $sock->send($negotiate_ssl_code);
|
||||||
|
|
||||||
|
# Read reply. We expect the server to reject it with 'N'
|
||||||
|
my $reply = "";
|
||||||
|
$sock->recv($reply, 1);
|
||||||
|
is($reply, "N", "dead-end connection $i");
|
||||||
|
|
||||||
|
push(@raw_connections, $sock);
|
||||||
|
}
|
||||||
|
|
||||||
|
# When all the connection slots are in use, new connections will fail
|
||||||
|
# before even looking up the user. Hence you now get "sorry, too many
|
||||||
|
# clients already" instead of "role does not exist" error. Test that
|
||||||
|
# to ensure that we have used up all the slots.
|
||||||
|
$node->connect_fails("dbname=postgres user=invalid_user",
|
||||||
|
"connect ",
|
||||||
|
expected_stderr => qr/FATAL: sorry, too many clients already/);
|
||||||
|
|
||||||
|
# Open one more connection, to really ensure that we have at least one
|
||||||
|
# dead-end backend.
|
||||||
|
my $sock = $node->raw_connect();
|
||||||
|
|
||||||
|
# Test that the dead-end backends don't prevent the server from stopping.
|
||||||
|
$node->stop('fast', timeout => $stop_timeout);
|
||||||
|
|
||||||
|
$node->start();
|
||||||
|
$node->connect_ok("dbname=postgres", "works after restart");
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
foreach my $socket (@raw_connections)
|
||||||
|
{
|
||||||
|
$socket->close();
|
||||||
|
}
|
||||||
|
|
||||||
|
done_testing();
|
Loading…
x
Reference in New Issue
Block a user