From eb440250ff99da16caa47fcc68bc5c8a5c726240 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 27 Jul 2022 16:58:51 -0700 Subject: [PATCH] pkg/clusterconditions/promql: Cap PromQL queries at 5 minutes In some clusters, these PromQL queries can hang for hours, possibly forever [1]. I think we have a 30s default KeepAlive timeout [2], but apparently there's enough socket traffic to keep that timeout from tripping. This adds a 5m cap to the PromQL calls, although I'm not attached to that particular number. We can always raise it if we start seeing timeouts in Insights for queries where taking that long seems reasonable. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=2109374#c12 [2]: https://pkg.go.dev/github.com/prometheus/client_golang/api#pkg-variables --- pkg/clusterconditions/promql/promql.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/clusterconditions/promql/promql.go b/pkg/clusterconditions/promql/promql.go index b4c949d7f7..5b84069995 100644 --- a/pkg/clusterconditions/promql/promql.go +++ b/pkg/clusterconditions/promql/promql.go @@ -27,6 +27,9 @@ type PromQL struct { // HTTPClientConfig holds the client configuration for connecting to the Prometheus service. HTTPClientConfig config.HTTPClientConfig + + // QueryTimeout limits the amount of time we wait before giving up on the Prometheus query. 
+ QueryTimeout time.Duration } var promql = &cache.Cache{ @@ -41,6 +44,7 @@ var promql = &cache.Cache{ CAFile: "/etc/tls/service-ca/service-ca.crt", }, }, + QueryTimeout: 5 * time.Minute, }, MinBetweenMatches: 10 * time.Minute, MinForCondition: time.Hour, @@ -79,8 +83,16 @@ func (p *PromQL) Match(ctx context.Context, condition *configv1.ClusterCondition } v1api := prometheusv1.NewAPI(client) + + queryContext := ctx + if p.QueryTimeout > 0 { + var cancel context.CancelFunc + queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout) + defer cancel() + } + klog.V(2).Infof("evaluate %s cluster condition: %q", condition.Type, condition.PromQL.PromQL) - result, warnings, err := v1api.Query(ctx, condition.PromQL.PromQL, time.Now()) + result, warnings, err := v1api.Query(queryContext, condition.PromQL.PromQL, time.Now()) if err != nil { return false, fmt.Errorf("executing PromQL query: %w", err) }