#!/bin/bash
#
# Healthcheck script for MariaDB
#
# Runs various tests on the MariaDB server to check its health. Pass the tests
# to run as arguments. If all tests succeed, the server is considered healthy,
# otherwise it's not.
#
# Arguments are processed in strict order. Set replication_* options before
# the --replication option. This allows a different set of replication checks
# on different connections.
#
# --su{=|-mysql} is an option to run the healthcheck as a different unix user.
# Useful if the mysql@localhost user exists with unix socket authentication.
# Using this option disregards previous options set, so it should usually be the
# first option.
#
# Some tests require SQL privileges.
#
# TEST                      MINIMUM GRANTS REQUIRED
# connect                   none*
# innodb_initialized        USAGE
# innodb_buffer_pool_loaded USAGE
# galera_online             USAGE
# galera_ready              USAGE
# replication               REPLICATION CLIENT (<10.5) or REPLICA MONITOR (10.5+)
# mariadbupgrade            none, however unix user permissions on datadir
#
# The SQL user used is the default for the mariadb client. This can be the unix user
# if no user (or password) is set in the [mariadb-client] section of a configuration
# file. --defaults-{file,extra-file,group-suffix} can specify a file/configuration
# different from elsewhere.
#
# Note * though a denied error message will result in the error log without
# any permissions. USAGE is recommended to avoid this.
# Abort on any unhandled command failure (-e) and make a pipeline fail when
# any of its stages fails (pipefail).
set -eo pipefail
# Runs the mariadb client in batch mode over the unix socket.
# Globals:   nodefaults (read), def (read; keys file, extra_file, group_suffix)
# Arguments: passed through verbatim to the mariadb client
# Outputs:   whatever the client prints
# Returns:   the exit status of the client
_process_sql()
{
	# Collect the defaults options in an array so that unset settings vanish
	# and paths containing whitespace stay a single argument.
	local -a opts=()
	if [ -n "$nodefaults" ]; then
		opts+=(--no-defaults)
	fi
	if [ -n "${def[file]:-}" ]; then
		opts+=("--defaults-file=${def[file]}")
	fi
	if [ -n "${def[extra_file]:-}" ]; then
		opts+=("--defaults-extra-file=${def[extra_file]}")
	fi
	if [ -n "${def[group_suffix]:-}" ]; then
		opts+=("--defaults-group-suffix=${def[group_suffix]}")
	fi
	# The defaults options have to precede every other client option.
	mariadb "${opts[@]}" \
		--skip-ssl --skip-ssl-verify-server-cert \
		--protocol socket \
		-B "$@"
}
# TESTS
# CONNECT
#
# Tests that a connection can be made over TCP, i.e. that the entrypoint
# has reached its final state and the server is listening. The
# authentication used isn't tested.
# Globals:  nodefaults, def (read); connect_s (written: 0 connected, 1 not)
# Returns:  the value stored in connect_s
connect()
{
	local s
	local -a opts=()
	# short cut mechanism, to work with --require-secure-transport
	# "|| :" - a failed socket attempt must fall through to the TCP attempt
	# instead of aborting the script when running under set -e.
	s=$(_process_sql --skip-column-names -e 'select @@skip_networking') || :
	case "$s" in
		0|1)
			connect_s=$s
			return "$s"
			;;
	esac
	# falling back to tcp if there wasn't a connection answer.
	if [ -n "$nodefaults" ]; then
		opts+=(--no-defaults)
	fi
	if [ -n "${def[file]:-}" ]; then
		opts+=("--defaults-file=${def[file]}")
	fi
	if [ -n "${def[extra_file]:-}" ]; then
		opts+=("--defaults-extra-file=${def[extra_file]}")
	fi
	if [ -n "${def[group_suffix]:-}" ]; then
		opts+=("--defaults-group-suffix=${def[group_suffix]}")
	fi
	# The client exits non-zero for the "connected but refused" errors that
	# are classified below, so its exit status is deliberately ignored and
	# only the captured message is inspected.
	s=$(mariadb "${opts[@]}" \
		--skip-ssl --skip-ssl-verify-server-cert \
		-h localhost --protocol tcp \
		--skip-column-names --batch --skip-print-query-on-error \
		-e 'select @@skip_networking' 2>&1) || :
	case "$s" in
		1) # skip-networking=1 (no network)
			;&
		ERROR\ 2002\ \(HY000\):*)
			# cannot connect
			connect_s=1
			;;
		0) # skip-networking=0
			;&
		ERROR\ 1820\ \(HY000\)*) # password expire
			;&
		ERROR\ 4151\ \(HY000\):*) # account locked
			;&
		ERROR\ 1226\ \(42000\)*) # resource limit exceeded
			;&
		ERROR\ 1[0-9][0-9][0-9]\ \(28000\):*)
			# access denied and other 28000 client errors - we did connect
			connect_s=0
			;;
		*)
			>&2 echo "Unknown error $s"
			connect_s=1
			;;
	esac
	return "$connect_s"
}
# INNODB_INITIALIZED
#
# This tests that the crash recovery of InnoDB has completed
# along with all the other things required to make it to a healthy
# operational state. Note this may return true in the early
# states of initialization. Use with a connect test to avoid
# these false positives.
# Returns: 0 when information_schema.ENGINES reports InnoDB as
#          YES, DEFAULT or ENABLED, non-zero otherwise.
innodb_initialized()
{
	local s
	s=$(_process_sql --skip-column-names -e "select 1 from information_schema.ENGINES WHERE engine='innodb' AND support in ('YES', 'DEFAULT', 'ENABLED')")
	# The query yields exactly "1" when a matching engine row exists.
	[ "$s" == 1 ]
}
# INNODB_BUFFER_POOL_LOADED
#
# Tests that the load of the innodb buffer pool has been completed.
# Implies innodb_buffer_pool_load_at_startup=1 (default), or a
# manual SET innodb_buffer_pool_load_now=1
# Returns: 0 when the Innodb_buffer_pool_load_status text contains
#          "load completed", 1 otherwise.
innodb_buffer_pool_loaded()
{
	local s
	s=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='Innodb_buffer_pool_load_status'")
	# Quoted right-hand side: matched as a literal substring, not a regex.
	if [[ $s =~ 'load completed' ]]; then
		return 0
	fi
	return 1
}
# GALERA_ONLINE
#
# Tests that the galera node is in the SYNCed state
# Returns: 0 when WSREP_LOCAL_STATE is 4, 1 otherwise.
galera_online()
{
	local s
	s=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='WSREP_LOCAL_STATE'")
	# 4 from https://galeracluster.com/library/documentation/node-states.html#node-state-changes
	# not https://xkcd.com/221/
	# Compared as a string so that the server's answer is never evaluated
	# as an arithmetic expression.
	if [ "$s" = 4 ]; then
		return 0
	fi
	return 1
}
# GALERA_READY
#
# Tests that the Galera provider is ready.
# Returns: 0 when WSREP_READY is ON, 1 otherwise.
galera_ready()
{
	local s
	s=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='WSREP_READY'")
	if [ "$s" = "ON" ]; then
		return 0
	fi
	return 1
}
# REPLICATION
#
# Tests the replication has the required set of functions:
# --replication_all -> Checks all replication sources
# --replication_name=n -> sets the multisource connection name tested
# --replication_io -> IO thread is running
# --replication_sql -> SQL thread is running
# --replication_seconds_behind_master=n -> less than or equal this seconds of delay
# --replication_sql_remaining_delay=n -> less than or equal this seconds of remaining delay
# (ref: https://mariadb.com/kb/en/delayed-replication/)
# Globals:  repl (read; keys all, name, io, sql, seconds_behind_master,
#           sql_remaining_delay)
# Returns:  0 when every requested check passes, non-zero when a check
#           fails, the status output stalls, or the query pipeline fails.
replication()
{
	local all=${repl[all]:-}
	local name=${repl[name]:-}
	local want_io=${repl[io]:-}
	local want_sql=${repl[sql]:-}
	local max_behind=${repl[seconds_behind_master]:-}
	local max_delay=${repl[sql_remaining_delay]:-}

	# SHOW REPLICA available 10.5+
	# The single quotes are literal inside the double-quoted string: they
	# quote the connection name in the SQL statement.
	_process_sql -e "SHOW ${all:+all} REPLICA${all:+S} ${name:+'${name}'} STATUS\G" |
		{
			# This group is a pipeline stage and so runs in a subshell:
			# "return" ends the subshell and its status becomes the
			# status of the pipeline.
			local n v rc=0
			# Row header
			read -t 5 -r || rc=$?
			# a status above 128 is a read timeout
			if (( rc > 128 )); then
				return 1
			fi
			rc=0
			while :; do
				# Keep the status of read itself; the status of a while
				# loop is that of its body, which would hide a timeout.
				IFS=":" read -t 1 -r n v || { rc=$?; break; }
				# Trim leading space
				n=${n#"${n%%[![:space:]]*}"}
				# Leading space on all values by the \G format needs to be trimmed.
				v=${v:1}
				case "$n" in
					Slave_IO_Running)
						if [ -n "$want_io" ] && [ "$v" = 'No' ]; then
							return 1
						fi
						;;
					Slave_SQL_Running)
						if [ -n "$want_sql" ] && [ "$v" = 'No' ]; then
							return 1
						fi
						;;
					Seconds_Behind_Master)
						# A NULL value is the IO thread not running:
						if [ -n "$max_behind" ] &&
							{ [ "$v" = NULL ] || (( max_behind < v )); }; then
							return 1
						fi
						;;
					SQL_Remaining_Delay)
						# Unlike Seconds_Behind_Master, sql_remaining_delay will hit NULL
						# once replication is caught up - https://mariadb.com/kb/en/delayed-replication/
						if [ -n "$max_delay" ] &&
							[ "$v" != NULL ] &&
							(( max_delay < v )); then
							return 1
						fi
						;;
				esac
			done
			# read timeout (a plain end of input gives status 1)
			if (( rc > 128 )); then
				return 1
			fi
			return 0
		}
}
# mariadbupgrade
#
# Test the lock on the file $datadir/mariadb_upgrade_info
# https://jira.mariadb.org/browse/MDEV-27068
# Globals:  datadir (read)
# Returns:  0 when $datadir/mariadb_upgrade_info is not readable or its
#           lock can be taken; otherwise the exit status of flock.
mariadbupgrade()
{
	local f="$datadir/mariadb_upgrade_info"
	if [ -r "$f" ]; then
		# File descriptor 9 is open only for the duration of the flock
		# command, so the lock is just probed, not kept.
		flock --exclusive --nonblock 9 9<"$f"
		return $?
	fi
	return 0
}
# MAIN
# Running the healthcheck without naming any test is a usage error.
if (( $# < 1 )); then
	>&2 echo "At least one argument required"
	exit 1
fi
#ENDOFSUBSTITUTIONS
# Marks the end of mysql -> mariadb name changes in 10.6+
# Global variables used by tests
# Replication check settings, filled in by the --replication_* options.
declare -A repl
# Client defaults settings (file, extra_file, group_suffix), filled in by
# the --defaults-* options.
declare -A def
# Non-empty when the client is to be run with --no-defaults.
nodefaults=
# Result of the connect test: empty until connect ran (or --no-connect
# was given), then 0 for success and 1 for failure.
connect_s=
datadir=/var/lib/mysql
# Allow healthcheck@{127.0.0.1,::1,localhost} to exist to facilitate healthcheck --connect
#
# healthcheck@{127.0.0.1,::1,localhost} users are granted USAGE by default, which
# is enough for the non-replication healthchecks in healthcheck.sh.
#
# The env variable MARIADB_HEALTHCHECK_GRANTS can replace USAGE with any
# comma separated set of grants.
#
# On initialization a generated password is created and saved in
# $DATADIR/.my-healthcheck.cnf along with the server port and socket. If the
# command args or default configuration file changes this may become out
# of date. Because the password is generated in a configuration file the
# '#', comment, and '=' characters cannot be part of this password.
#
# The healthcheck.cnf configuration file also sets protocol=tcp to
# enforce indirectly that --connect is a standard part of the test. This is
# required as starts of the service under --skip-networking should
# never be considered healthy.
#
# The healthcheck script also has the --defaults-extra-file set to this
# .my-healthcheck.cnf file, if it exists (backwards compatible on
# previously created datadirs), so that all new healthcheck invocations
# use the authentication here by default.
#
# The compatibility with old instances, without the .my-healthcheck.cnf, is
# preserved by not setting --defaults-extra-file.
#
# The healthcheck --connect will increment the server status variable Aborted_connects
# for each check, however now connection_error* counts are changed.
# This also prevents any invalid password errors showing up in the
# container log.
#
# Closes #430
# Use the healthcheck client configuration as the extra defaults file when
# it exists in the data directory. This runs before the options are
# parsed, so it looks at the initial value of datadir.
if [ -f "$datadir/.my-healthcheck.cnf" ]; then
	def['extra_file']="$datadir/.my-healthcheck.cnf"
fi
# Keeps the replication settings consistent after one of them was set.
# Globals:   repl (read and written)
# Arguments: $1 - name of the replication setting that was just set
# Outputs:   a notice on stderr whenever another setting gets adjusted
_repl_param_check()
{
	case "$1" in
		seconds_behind_master) ;&
		sql_remaining_delay)
			if [ -z "${repl['io']}" ]; then
				repl['io']=1
				echo "Forcing --replication_io=1, $1 requires IO thread to be running" >&2
			fi
			;;
		all)
			# "all" and "name" exclude each other, the later one wins
			if [ -n "${repl['name']}" ]; then
				unset 'repl[name]'
				echo "Option --replication_all incompatible with specified source --replication_name, clearing replication_name" >&2
			fi
			;;
		name)
			if [ -n "${repl['all']}" ]; then
				unset 'repl[all]'
				echo "Option --replication_name incompatible with --replication_all, clearing replication_all" >&2
			fi
			;;
	esac
}
# Tells whether a shell function with the given name is defined.
# Arguments: $1 - candidate test name
# Returns:   0 when such a function exists, non-zero otherwise
_test_exists() {
	# "--" keeps a name that starts with a dash from being read as an
	# option of declare.
	declare -F -- "$1" >/dev/null
}
# Processes the command line in strict order: options update the global
# settings, every other --name is run as a test straight away.
# Globals:   repl, def, nodefaults, datadir, connect_s (written)
# Arguments: the arguments of the script
# Returns:   0 when all arguments were processed; exits the script with
#            status 1 on an unknown option, an unknown test, a missing
#            option value or a failed test. The --su* options replace the
#            process with gosu and never return.
_process_args()
{
	local n v u test=
	while [ $# -gt 0 ]; do
		case "$1" in
			--su=*)
				u="${1#*=}"
				shift
				exec gosu "${u}" "${BASH_SOURCE[0]}" "$@"
				;;
			--su)
				if [ $# -lt 2 ]; then
					echo "Option --su requires a user name" >&2
					exit 1
				fi
				u=$2
				shift 2
				exec gosu "$u" "${BASH_SOURCE[0]}" "$@"
				;;
			--su-mysql)
				shift
				exec gosu mysql "${BASH_SOURCE[0]}" "$@"
				;;
			--replication_*=*)
				# Change the n to what is between _ and = and make lower case
				n=${1#*_}
				n=${n%%=*}
				n=${n,,}
				# v is after the =
				v=${1#*=}
				repl[$n]=$v
				_repl_param_check "$n"
				;;
			--replication_*)
				# Without =, look for a non --option next as the value,
				# otherwise treat it as an "enable", just equate to 1.
				# Clearing option is possible with "--replication_X="
				n=${1#*_}
				n=${n,,}
				# No following argument at all also means "enable".
				if [ $# -lt 2 ] || [ "${2:0:2}" == '--' ]; then
					repl[$n]=1
				else
					repl[$n]=$2
					shift
				fi
				_repl_param_check "$n"
				;;
			--datadir=*)
				datadir=${1#*=}
				;;
			--datadir)
				if [ $# -lt 2 ]; then
					echo "Option --datadir requires a path" >&2
					exit 1
				fi
				shift
				datadir=${1}
				;;
			--no-defaults)
				def=()
				nodefaults=1
				;;
			--defaults-file=*|--defaults-extra-file=*|--defaults-group-suffix=*)
				n=${1:11} # length --defaults-
				n=${n%%=*}
				n=${n//-/_}
				# v is after the =
				v=${1#*=}
				def[$n]=$v
				nodefaults=
				;;
			--defaults-file|--defaults-extra-file|--defaults-group-suffix)
				n=${1:11} # length --defaults-
				n=${n//-/_}
				# Only consume the next argument when there is one and it
				# is not an option itself.
				if [ $# -lt 2 ] || [ "${2:0:2}" == '--' ]; then
					def[$n]=""
				else
					def[$n]=$2
					shift
				fi
				nodefaults=
				;;
			--no-connect)
				# used for /docker-entrypoint-initdb.d scripts
				# where you definitely don't want a connection test
				connect_s=0
				;;
			--*)
				test=${1#--}
				;;
			*)
				echo "Unknown healthcheck option $1" >&2
				exit 1
				;;
		esac
		if [ -n "$test" ]; then
			if ! _test_exists "$test"; then
				echo "healthcheck unknown option or test '$test'" >&2
				exit 1
			elif ! "$test"; then
				echo "healthcheck $test failed" >&2
				exit 1
			fi
			test=
		fi
		shift
	done
}

_process_args "$@"
if [ "$connect_s" != "0" ]; then
	# we didn't pass a connect test, so the current success status is suspicious
	# return what connect thinks.
	# Calling connect on the left of || keeps set -e from cutting it short
	# on its first failing command, so it can reach its own verdict.
	connect || exit $?
	exit 0
fi