MINOR: promex: Add agent check status/code/duration metrics

In the Prometheus exporter, the last health check status is already exposed,
with its code and duration in seconds. The server status is also exposed.
But information about the agent check is not available. This is
inconvenient because when a server's status changes because of the agent,
it is not obvious from looking at the Prometheus metrics. Indeed, the server
may be reported as DOWN, for instance, while the health check status still
reports a success. Being able to get the agent status in that case could be
valuable.

So now, the last agent check status is exposed, with its code and duration
in seconds. The following metrics can now be retrieved:

  * haproxy_server_agent_status
  * haproxy_server_agent_code
  * haproxy_server_agent_duration_seconds

Note that unlike the other metrics, no per-backend aggregated metric is
exposed.

This patch is related to issue #2983.
This commit is contained in:
Christopher Faulet 2025-05-22 09:37:09 +02:00
parent 0ac41ff97e
commit 7244f16ac4
3 changed files with 39 additions and 3 deletions

View File

@ -389,6 +389,9 @@ listed below. Metrics from extra counters are not listed.
| haproxy_server_max_connect_time_seconds |
| haproxy_server_max_response_time_seconds |
| haproxy_server_max_total_time_seconds |
| haproxy_server_agent_status |
| haproxy_server_agent_code |
| haproxy_server_agent_duration_seconds |
| haproxy_server_internal_errors_total |
| haproxy_server_unsafe_idle_connections_current |
| haproxy_server_safe_idle_connections_current |

View File

@ -173,6 +173,8 @@ const struct ist promex_st_metric_desc[ST_I_PX_MAX] = {
[ST_I_PX_CTIME] = IST("Avg. connect time for last 1024 successful connections."),
[ST_I_PX_RTIME] = IST("Avg. response time for last 1024 successful connections."),
[ST_I_PX_TTIME] = IST("Avg. total time for last 1024 successful connections."),
[ST_I_PX_AGENT_STATUS] = IST("Status of last agent check, per state label value."),
[ST_I_PX_AGENT_DURATION] = IST("Total duration of the latest server agent check, in seconds."),
[ST_I_PX_QT_MAX] = IST("Maximum observed time spent in the queue"),
[ST_I_PX_CT_MAX] = IST("Maximum observed time spent waiting for a connection to complete"),
[ST_I_PX_RT_MAX] = IST("Maximum observed time spent waiting for a server response"),
@ -1342,6 +1344,7 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
secs = (double)sv->check.duration / 1000.0;
val = mkf_flt(FN_DURATION, secs);
break;
case ST_I_PX_REQ_TOT:
if (px->mode != PR_MODE_HTTP) {
sv = NULL;
@ -1364,6 +1367,36 @@ static int promex_dump_srv_metrics(struct appctx *appctx, struct htx *htx)
labels[lb_idx+1].value = promex_hrsp_code[ctx->field_num - ST_I_PX_HRSP_1XX];
break;
case ST_I_PX_AGENT_STATUS:
if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
goto next_sv;
for (; ctx->obj_state < HCHK_STATUS_SIZE; ctx->obj_state++) {
if (get_check_status_result(ctx->obj_state) < CHK_RES_FAILED)
continue;
val = mkf_u32(FO_STATUS, sv->agent.status == ctx->obj_state);
check_state = get_check_status_info(ctx->obj_state);
labels[lb_idx+1].name = ist("state");
labels[lb_idx+1].value = ist(check_state);
if (!promex_dump_ts(appctx, prefix, name, desc,
type,
&val, labels, &out, max))
goto full;
}
ctx->obj_state = 0;
goto next_sv;
case ST_I_PX_AGENT_CODE:
if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) != CHK_ST_ENABLED)
goto next_sv;
val = mkf_u32(FN_OUTPUT, (sv->agent.status < HCHK_STATUS_L57DATA) ? 0 : sv->agent.code);
break;
case ST_I_PX_AGENT_DURATION:
if (sv->agent.status < HCHK_STATUS_CHECKED)
goto next_sv;
secs = (double)sv->agent.duration / 1000.0;
val = mkf_flt(FN_DURATION, secs);
break;
default:
break;
}

View File

@ -111,9 +111,9 @@ const struct stat_col stat_cols_px[ST_I_PX_MAX] = {
[ST_I_PX_CTIME] = { .name = "ctime", .alt_name = "connect_time_average_seconds", .desc = "Time spent waiting for a connection to complete, in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
[ST_I_PX_RTIME] = { .name = "rtime", .alt_name = "response_time_average_seconds", .desc = "Time spent waiting for a server response, in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
[ST_I_PX_TTIME] = { .name = "ttime", .alt_name = "total_time_average_seconds", .desc = "Total request+response time (request+queue+connect+response+processing), in milliseconds, averaged over the 1024 last requests (backend/server)", .cap = STATS_PX_CAP___BS },
[ST_I_PX_AGENT_STATUS] = { .name = "agent_status", .alt_name = NULL, .desc = "Status report of the server's latest agent check, prefixed with '*' if a check is currently in progress" },
[ST_I_PX_AGENT_CODE] = { .name = "agent_code", .alt_name = NULL, .desc = "Status code reported by the latest server agent check" },
[ST_I_PX_AGENT_DURATION] = { .name = "agent_duration", .alt_name = NULL, .desc = "Total duration of the latest server agent check, in milliseconds" },
[ST_I_PX_AGENT_STATUS] = { .name = "agent_status", .alt_name = "agent_status", .desc = "Status report of the server's latest agent check, prefixed with '*' if a check is currently in progress", .cap = STATS_PX_CAP____S },
[ST_I_PX_AGENT_CODE] = { .name = "agent_code", .alt_name = "agent_code", .desc = "Status code reported by the latest server agent check", .cap = STATS_PX_CAP____S },
[ST_I_PX_AGENT_DURATION] = { .name = "agent_duration", .alt_name = "agent_duration_seconds", .desc = "Total duration of the latest server agent check, in milliseconds", .cap = STATS_PX_CAP____S, },
[ST_I_PX_CHECK_DESC] = { .name = "check_desc", .alt_name = NULL, .desc = "Textual description of the latest health check report for this server" },
[ST_I_PX_AGENT_DESC] = { .name = "agent_desc", .alt_name = NULL, .desc = "Textual description of the latest agent check report for this server" },
[ST_I_PX_CHECK_RISE] = { .name = "check_rise", .alt_name = NULL, .desc = "Number of successful health checks before declaring a server UP (server 'rise' setting)" },