Kill dead-end children when there's nothing else left

Previously, the postmaster would never try to kill dead-end child processes, even if there were no other processes left. A dead-end backend will eventually exit, when authentication_timeout expires, but if a dead-end backend is the only thing that's preventing the server from shutting down, it seems better to kill it immediately. It's particularly important, if there was a bug in the early startup code that prevented a dead-end child from timing out and exiting normally. Includes a test for that case where a dead-end backend previously prevented the server from shutting down. Reviewed-by: Andres Freund <andres@anarazel.de> Discussion: https://www.postgresql.org/message-id/a102f15f-eac4-4ff2-af02-f9ff209ec66f@iki.fi
2024-11-14 16:12:04 +02:00 · 2024-11-14 16:12:04 +02:00 · bb861414fe
commit bb861414fe
parent 18d67a8d7d
4 changed files with 116 additions and 10 deletions
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@ -2985,10 +2985,11 @@ PostmasterStateMachine(void)
 			if (Shutdown >= ImmediateShutdown || FatalError)
 			{
 				/*
-				 * Start waiting for dead_end children to die.  This state
+				 * Stop any dead_end children and stop creating new ones.
 				 * change causes ServerLoop to stop creating new ones.
 				 */
 				pmState = PM_WAIT_DEAD_END;
 				ConfigurePostmasterWaitSet(false);
 				SignalChildren(SIGQUIT, btmask(B_DEAD_END_BACKEND));
 				/*
 				 * We already SIGQUIT'd the archiver and stats processes, if
@ -3027,9 +3028,10 @@ PostmasterStateMachine(void)
 					 */
 					FatalError = true;
 					pmState = PM_WAIT_DEAD_END;
 					ConfigurePostmasterWaitSet(false);
 					/* Kill the walsenders and archiver too */
-					SignalChildren(SIGQUIT, btmask_all_except(B_DEAD_END_BACKEND));
+					SignalChildren(SIGQUIT, BTYPE_MASK_ALL);
 					if (PgArchPID != 0)
 						signal_child(PgArchPID, SIGQUIT);
 				}
@ -3048,14 +3050,13 @@ PostmasterStateMachine(void)
 		if (PgArchPID == 0 && CountChildren(btmask_all_except(B_DEAD_END_BACKEND)) == 0)
 		{
 			pmState = PM_WAIT_DEAD_END;
 			ConfigurePostmasterWaitSet(false);
 			SignalChildren(SIGTERM, BTYPE_MASK_ALL);
 		}
 	}
 	if (pmState == PM_WAIT_DEAD_END)
 	{
 		/* Don't allow any new socket connection events. */
 		ConfigurePostmasterWaitSet(false);
 		/*
 		 * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
 		 * (ie, no dead_end children remain), and the archiver is gone too.
@ -3381,12 +3382,12 @@ SignalChildren(int signal, BackendTypeMask targetMask)
 /*
 * Send a termination signal to children.  This considers all of our children
- * processes, except syslogger and dead_end backends.
+ * processes, except syslogger.
 */
 static void
 TerminateChildren(int signal)
 {
-	SignalChildren(signal, btmask_all_except(B_DEAD_END_BACKEND));
+	SignalChildren(signal, BTYPE_MASK_ALL);
 	if (StartupPID != 0)
 	{
 		signal_child(StartupPID, signal);
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@ -1194,6 +1194,9 @@ this to fail.  Otherwise, tests might fail to detect server crashes.
 With optional extra param fail_ok => 1, returns 0 for failure
 instead of bailing out.
 The optional extra param timeout can be used to pass the pg_ctl
 --timeout option.
 =cut
 sub stop
@ -1209,8 +1212,11 @@ sub stop
 	return 1 unless defined $self->{_pid};
 	print "### Stopping node \"$name\" using mode $mode\n";
-	$ret = PostgreSQL::Test::Utils::system_log('pg_ctl', '-D', $pgdata,
+	my @cmd = ('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
-		'-m', $mode, 'stop');
+	if ($params{timeout}) {
 		push(@cmd, ('--timeout', $params{timeout}));
 	}
 	$ret = PostgreSQL::Test::Utils::system_log(@cmd);
 	if ($ret != 0)
 	{
--- a/src/test/postmaster/meson.build
+++ b/src/test/postmaster/meson.build
@ -7,6 +7,7 @@ tests += {
  'tap': {
    'tests': [
      't/001_connection_limits.pl',
      't/002_start_stop.pl',
    ],
  },
 }
--- a/src/test/postmaster/t/002_start_stop.pl
+++ b/src/test/postmaster/t/002_start_stop.pl
@ -0,0 +1,98 @@
 # Copyright (c) 2021-2024, PostgreSQL Global Development Group
 # Test postmaster start and stop state machine.
 use strict;
 use warnings FATAL => 'all';
 use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
 use Test::More;
 #
 # Test that dead-end backends don't prevent the server from shutting
 # down.
 #
 # Dead-end backends can linger until they reach
 # authentication_timeout. We use a long authentication_timeout and a
 # much shorter timeout for the "pg_ctl stop" operation, to test that
 # if dead-end backends are killed at fast shut down. If they're not,
 # "pg_ctl stop" will error out before the authentication timeout kicks
 # in and cleans up the dead-end backends.
 my $authentication_timeout = $PostgreSQL::Test::Utils::timeout_default;
 my $stop_timeout = $authentication_timeout / 2;
 # Initialize the server with low connection limits, to test dead-end backends
 my $node = PostgreSQL::Test::Cluster->new('main');
 $node->init;
 $node->append_conf('postgresql.conf', "max_connections = 5");
 $node->append_conf('postgresql.conf', "max_wal_senders = 0");
 $node->append_conf('postgresql.conf', "autovacuum_max_workers = 1");
 $node->append_conf('postgresql.conf', "max_worker_processes = 1");
 $node->append_conf('postgresql.conf', "log_connections = on");
 $node->append_conf('postgresql.conf', "log_min_messages = debug2");
 $node->append_conf('postgresql.conf',
 	"authentication_timeout = '$authentication_timeout s'");
 $node->append_conf('postgresql.conf', 'trace_connection_negotiation=on');
 $node->start;
 if (!$node->raw_connect_works())
 {
 	plan skip_all => "this test requires working raw_connect()";
 }
 my @raw_connections = ();
 # Open a lot of TCP (or Unix domain socket) connections to use up all
 # the connection slots. Beyond a certain number (roughly 2x
 # max_connections), they will be "dead-end backends".
 for (my $i = 0; $i <= 20; $i++)
 {
 	my $sock = $node->raw_connect();
 	# On a busy system, the server might reject connections if
 	# postmaster cannot accept() them fast enough. The exact limit
 	# and behavior depends on the platform. To make this reliable,
 	# we attempt SSL negotiation on each connection before opening
 	# next one. The server will reject the SSL negotations, but
 	# when it does so, we know that the backend has been launched
 	# and we should be able to open another connection.
 	# SSLRequest packet consists of packet length followed by
 	# NEGOTIATE_SSL_CODE.
 	my $negotiate_ssl_code = pack("Nnn", 8, 1234, 5679);
 	my $sent = $sock->send($negotiate_ssl_code);
 	# Read reply. We expect the server to reject it with 'N'
 	my $reply = "";
 	$sock->recv($reply, 1);
 	is($reply, "N", "dead-end connection $i");
 	push(@raw_connections, $sock);
 }
 # When all the connection slots are in use, new connections will fail
 # before even looking up the user. Hence you now get "sorry, too many
 # clients already" instead of "role does not exist" error. Test that
 # to ensure that we have used up all the slots.
 $node->connect_fails("dbname=postgres user=invalid_user",
 	"connect ",
 	expected_stderr => qr/FATAL:  sorry, too many clients already/);
 # Open one more connection, to really ensure that we have at least one
 # dead-end backend.
 my $sock = $node->raw_connect();
 # Test that the dead-end backends don't prevent the server from stopping.
 $node->stop('fast', timeout => $stop_timeout);
 $node->start();
 $node->connect_ok("dbname=postgres", "works after restart");
 # Clean up
 foreach my $socket (@raw_connections)
 {
 	$socket->close();
 }
 done_testing();