From c70f8f963c326fecef238a086ecc7ec9df456c94 Mon Sep 17 00:00:00 2001 From: Yao Wu Date: Thu, 26 Jul 2018 15:34:05 -0700 Subject: [PATCH 1/2] Add activator metrics and dashboard --- cmd/activator/main.go | 67 +- config/activator.yaml | 2 +- .../100-scaling-configmap.yaml | 1973 ++++++++++------- .../300-prometheus/100-scrape-config.yaml | 27 + pkg/activator/activator.go | 7 +- pkg/activator/dedupe.go | 31 +- pkg/activator/dedupe_test.go | 97 +- pkg/activator/revision.go | 15 +- pkg/activator/revision_test.go | 48 +- pkg/activator/stats_reporter.go | 200 ++ pkg/activator/stats_reporter_test.go | 141 ++ pkg/controller/names.go | 4 + .../route/resources/virtual_service.go | 1 + .../route/resources/virtual_service_test.go | 14 +- pkg/controller/route/route_test.go | 12 +- 15 files changed, 1723 insertions(+), 916 deletions(-) create mode 100644 pkg/activator/stats_reporter.go create mode 100644 pkg/activator/stats_reporter_test.go diff --git a/cmd/activator/main.go b/cmd/activator/main.go index 69e80f03682a..a28892f38175 100644 --- a/cmd/activator/main.go +++ b/cmd/activator/main.go @@ -39,6 +39,8 @@ import ( "github.com/knative/serving/pkg/signals" "github.com/knative/serving/pkg/system" "github.com/knative/serving/third_party/h2c" + "go.opencensus.io/exporter/prometheus" + "go.opencensus.io/stats/view" "go.uber.org/zap" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" @@ -52,18 +54,21 @@ const ( ) type activationHandler struct { - act activator.Activator - logger *zap.SugaredLogger + act activator.Activator + logger *zap.SugaredLogger + reporter activator.StatsReporter } // retryRoundTripper retries on 503's for up to 60 seconds. The reason is there is // a small delay for k8s to include the ready IP in service. 
// https://github.com/knative/serving/issues/660#issuecomment-384062553 type retryRoundTripper struct { - logger *zap.SugaredLogger + logger *zap.SugaredLogger + reporter activator.StatsReporter + start time.Time } -func (rrt retryRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) { +func (rrt *retryRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) { var err error var reqBody *bytes.Reader @@ -113,26 +118,38 @@ func (rrt retryRoundTripper) RoundTrip(r *http.Request) (*http.Response, error) resp, err = transport.RoundTrip(r) } - // TODO: add metrics for number of tries and the response code. + if resp != nil { rrt.logger.Infof("It took %d tries to get response code %d", i, resp.StatusCode) + namespace := r.Header.Get(controller.GetRevisionHeaderNamespace()) + name := r.Header.Get(controller.GetRevisionHeaderName()) + config := r.Header.Get(controller.GetConfigurationHeader()) + rrt.reporter.ReportResponseCount(namespace, config, name, resp.StatusCode, i, 1.0) + rrt.reporter.ReportResponseTime(namespace, config, name, resp.StatusCode, time.Now().Sub(rrt.start)) } return resp, err } func (a *activationHandler) handler(w http.ResponseWriter, r *http.Request) { + namespace := r.Header.Get(controller.GetRevisionHeaderNamespace()) + name := r.Header.Get(controller.GetRevisionHeaderName()) + config := r.Header.Get(controller.GetConfigurationHeader()) + start := time.Now() + if r.ContentLength > maxUploadBytes { w.WriteHeader(http.StatusRequestEntityTooLarge) + a.reporter.ReportResponseCount(namespace, config, name, http.StatusRequestEntityTooLarge, 1, 1.0) + a.reporter.ReportResponseTime(namespace, config, name, http.StatusRequestEntityTooLarge, time.Now().Sub(start)) return } - namespace := r.Header.Get(controller.GetRevisionHeaderNamespace()) - name := r.Header.Get(controller.GetRevisionHeaderName()) - endpoint, status, err := a.act.ActiveEndpoint(namespace, name) + endpoint, status, err := a.act.ActiveEndpoint(namespace, config, name) if err 
!= nil { msg := fmt.Sprintf("Error getting active endpoint: %v", err) - a.logger.Error(msg) http.Error(w, msg, int(status)) + a.logger.Errorf(msg) + a.reporter.ReportResponseCount(namespace, config, name, int(status), 1, 1.0) + a.reporter.ReportResponseTime(namespace, config, name, int(status), time.Now().Sub(start)) return } target := &url.URL{ @@ -140,8 +157,10 @@ func (a *activationHandler) handler(w http.ResponseWriter, r *http.Request) { Host: fmt.Sprintf("%s:%d", endpoint.FQDN, endpoint.Port), } proxy := httputil.NewSingleHostReverseProxy(target) - proxy.Transport = retryRoundTripper{ - logger: a.logger, + proxy.Transport = &retryRoundTripper{ + logger: a.logger, + reporter: a.reporter, + start: start, } // TODO: Clear the host to avoid 404's. @@ -179,9 +198,22 @@ func main() { logger.Fatal("Error building serving clientset: %v", zap.Error(err)) } - a := activator.NewRevisionActivator(kubeClient, servingClient, logger) - a = activator.NewDedupingActivator(a) - ah := &activationHandler{a, logger} + logger.Info("Initializing OpenCensus Prometheus exporter.") + promExporter, err := prometheus.NewExporter(prometheus.Options{Namespace: "activator"}) + if err != nil { + logger.Fatal("Failed to create the Prometheus exporter: %v", zap.Error(err)) + } + view.RegisterExporter(promExporter) + view.SetReportingPeriod(10 * time.Second) + + reporter, err := activator.NewStatsReporter() + if err != nil { + logger.Fatal("Failed to create stats reporter: %v", zap.Error(err)) + } + + a := activator.NewRevisionActivator(kubeClient, servingClient, logger, reporter) + a = activator.NewDedupingActivator(a, servingClient, logger, reporter) + ah := &activationHandler{a, logger, reporter} // set up signals so we handle the first shutdown signal gracefully stopCh := signals.SetupSignalHandler() @@ -197,6 +229,9 @@ func main() { logger.Fatalf("failed to start configuration manager: %v", err) } - http.HandleFunc("/", ah.handler) - h2c.ListenAndServe(":8080", nil) + // Start the 
endpoint for Prometheus scraping + mux := http.NewServeMux() + mux.HandleFunc("/", ah.handler) + mux.Handle("/metrics", promExporter) + h2c.ListenAndServe(":8080", mux) } diff --git a/config/activator.yaml b/config/activator.yaml index 28a0ef3d66eb..64c89d789103 100644 --- a/config/activator.yaml +++ b/config/activator.yaml @@ -36,7 +36,7 @@ spec: # and substituted here. image: github.com/knative/serving/cmd/activator ports: - - name: http + - name: activator-port containerPort: 8080 args: # Disable glog writing into stderr. Our code doesn't use glog diff --git a/config/monitoring/150-elasticsearch/100-scaling-configmap.yaml b/config/monitoring/150-elasticsearch/100-scaling-configmap.yaml index a601e96cdeea..64c92e89841c 100644 --- a/config/monitoring/150-elasticsearch/100-scaling-configmap.yaml +++ b/config/monitoring/150-elasticsearch/100-scaling-configmap.yaml @@ -19,841 +19,1154 @@ metadata: namespace: monitoring data: scaling-dashboard.json: |+ - { - "__inputs": [ - { - "name": "prometheus", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.3" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Knative Serving - Scaling Debugging", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "id": null, - "iteration": 1527886043818, - "links": [ - - ], - "panels": [ - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 14, - "panels": [ - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": 
"prometheus", - "fill": 1, - "gridPos": { - "h": 11, - "w": 24, - "x": 0, - "y": 1 - }, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "sum(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Actual Pods", - "refId": "A" - }, - { - "expr": "sum(autoscaler_requested_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Requested Pods", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Revision Pod Counts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": "Concurrency", - "logBase": 1, - "max": "1", - "min": null, - "show": false - } - ] + { + "__inputs":[ + { + "name":"prometheus", + "label":"prometheus", + "description":"", + "type":"datasource", + "pluginId":"prometheus", + "pluginName":"Prometheus" } - ], - "title": "Revision Pod Counts", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 1 - }, - "id": 18, - "panels": [ - { - "aliasColors": { - - 
}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 13 - }, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Cores requested", - "refId": "A" - }, - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"}[1m]))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Cores used", - "refId": "B" - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Core limit", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Revision CPU Usage", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + ], + "__requires":[ + { + "type":"grafana", + "id":"grafana", + "name":"Grafana", + "version":"5.0.3" }, - { - "aliasColors": { - - 
}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 13 - }, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Memory requested", - "refId": "A" - }, - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Memory used", - "refId": "B" - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "intervalFactor": 1, - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Memory Usage", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "decbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "title": "Resource Usages", - "type": "row" - }, - { - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 2 - }, - "id": 16, - "panels": [ - { 
- "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 3 - }, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "sum(autoscaler_desired_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"}) ", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Desired Pods", - "refId": "A" - }, - { - "expr": "sum(autoscaler_observed_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Observed Pods", - "refId": "B" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Pod Counts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 13 - }, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - 
"lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Panic Mode", - "color": "#ea6460", - "dashes": true, - "fill": 2, - "linewidth": 2, - "steppedLine": true, - "yaxis": 2 - }, - { - "alias": "Target Concurrency Per Pod", - "color": "#0a50a1", - "dashes": true, - "steppedLine": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "sum(autoscaler_observed_stable_concurrency{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Stable Concurrency", - "refId": "A" - }, - { - "expr": "sum(autoscaler_observed_panic_concurrency{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Panic Concurrency", - "refId": "B" - }, - { - "expr": "sum(autoscaler_target_concurrency_per_pod{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Target Concurrency Per Pod", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Observed Concurrency", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + { + "id":"graph", + "name":"Graph", + "type":"panel", + "version":"5.0.0" }, - { - 
"aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "decimals": null, - "fill": 1, - "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 22 - }, - "id": 12, - "legend": { - "avg": false, - "current": false, - "hideZero": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Panic Mode", - "color": "#e24d42", - "linewidth": 2, - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "sum(autoscaler_panic_mode{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} )", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Panic Mode", - "refId": "A" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Panic Mode", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": "1.0", - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] + { + "type":"datasource", + "id":"prometheus", + "name":"Prometheus", + "version":"5.0.0" } - ], - "title": "Debugging Metrics", - "type": "row" - } - ], - "refresh": false, - "schemaVersion": 16, - "style": "dark", - "tags": [ - - ], - "templating": { - "list": [ - { - "allValue": null, - "current": { - + ], + "annotations":{ + "list":[ + { + "builtIn":1, + "datasource":"-- Grafana --", + "enable":true, + "hide":true, + "iconColor":"rgba(0, 211, 255, 1)", + "name":"Annotations & 
Alerts", + "type":"dashboard" + } + ] + }, + "description":"Knative Serving - Scaling Debugging", + "editable":false, + "gnetId":null, + "graphTooltip":0, + "id":null, + "iteration":1527886043818, + "links":[ + + ], + "panels":[ + { + + "collapsed":true, + "gridPos":{ + "h":1, + "w":24, + "x":0, + "y":0 + }, + "id":14, + "panels":[ + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":11, + "w":24, + "x":0, + "y":1 + }, + "id":2, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":true, + "targets":[ + { + "expr":"sum(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + "interval":"1s", + "intervalFactor":1, + "legendFormat":"Actual Pods", + "refId":"A" + }, + { + "expr":"sum(autoscaler_requested_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + "interval":"1s", + "intervalFactor":1, + "legendFormat":"Requested Pods", + "refId":"C" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Revision Pod Counts", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "decimals":null, + "format":"short", + "label":"Concurrency", + "logBase":1, + "max":"1", + "min":null, + "show":false + } + ] + } + 
], + "title":"Revision Pod Counts", + "type":"row" }, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(autoscaler_actual_pod_count, configuration_namespace)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - + { + "collapsed":true, + "gridPos":{ + "h":1, + "w":24, + "x":0, + "y":1 + }, + "id":18, + "panels":[ + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":9, + "w":12, + "x":0, + "y":13 + }, + "id":4, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":false, + "targets":[ + { + "expr":"sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", + "format":"time_series", + "interval":"", + "intervalFactor":1, + "legendFormat":"Cores requested", + "refId":"A" + }, + { + "expr":"sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"}[1m]))", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"Cores used", + "refId":"B" + }, + { + "expr":"sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"Core limit", + "refId":"C" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Revision CPU Usage", + "tooltip":{ + "shared":true, + 
"sort":2, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "decimals":null, + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":false + } + ] + }, + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":9, + "w":12, + "x":12, + "y":13 + }, + "id":6, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":false, + "targets":[ + { + "expr":"sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", + "format":"time_series", + "interval":"", + "intervalFactor":1, + "legendFormat":"Memory requested", + "refId":"A" + }, + { + "expr":"sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"})", + "format":"time_series", + "hide":false, + "intervalFactor":1, + "legendFormat":"Memory used", + "refId":"B" + }, + { + "expr":"sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", + "format":"time_series", + "intervalFactor":1, + "refId":"C" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Pod Memory Usage", + "tooltip":{ + "shared":true, + "sort":2, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"decbytes", + 
"label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":false + } + ] + } + ], + "title":"Resource Usages", + "type":"row" }, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Configuration", - "multi": false, - "name": "configuration", - "options": [ - - ], - "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\"}, configuration)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - + { + "collapsed":true, + "gridPos":{ + "h":1, + "w":24, + "x":0, + "y":2 + }, + "id":16, + "panels":[ + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":10, + "w":24, + "x":0, + "y":3 + }, + "id":10, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":true, + "targets":[ + { + "expr":"sum(autoscaler_desired_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"}) ", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"Desired Pods", + "refId":"A" + }, + { + "expr":"sum(autoscaler_observed_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"Observed Pods", + "refId":"B" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Pod Counts", + "tooltip":{ + 
"shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + } + ] + }, + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":9, + "w":24, + "x":0, + "y":13 + }, + "id":8, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + { + "alias":"Panic Mode", + "color":"#ea6460", + "dashes":true, + "fill":2, + "linewidth":2, + "steppedLine":true, + "yaxis":2 + }, + { + "alias":"Target Concurrency Per Pod", + "color":"#0a50a1", + "dashes":true, + "steppedLine":false + } + ], + "spaceLength":10, + "stack":false, + "steppedLine":true, + "targets":[ + { + "expr":"sum(autoscaler_observed_stable_concurrency{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + "interval":"1s", + "intervalFactor":1, + "legendFormat":"60 Second Average Concurrency", + "refId":"A" + }, + { + "expr":"sum(autoscaler_observed_panic_concurrency{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + "interval":"1s", + "intervalFactor":1, + "legendFormat":"6 Second Average Panic Concurrency", + "refId":"B" + }, + { + "expr":"sum(autoscaler_target_concurrency_per_pod{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"})", + "format":"time_series", + 
"intervalFactor":1, + "legendFormat":"60 Second Target Concurrency", + "refId":"C" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Observed Concurrency", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"short", + "label":"", + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "format":"short", + "label":"", + "logBase":1, + "max":null, + "min":null, + "show":false + } + ] + }, + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "decimals":null, + "fill":1, + "gridPos":{ + "h":9, + "w":24, + "x":0, + "y":22 + }, + "id":12, + "legend":{ + "avg":false, + "current":false, + "hideZero":false, + "max":false, + "min":false, + "show":false, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + { + "alias":"Panic Mode", + "color":"#e24d42", + "linewidth":2, + "yaxis":2 + } + ], + "spaceLength":10, + "stack":false, + "steppedLine":true, + "targets":[ + { + "expr":"sum(autoscaler_panic_mode{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} )", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"Panic Mode", + "refId":"A" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Panic Mode", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"short", + "label":null, + "logBase":1, + "max":"1.0", + "min":"0", + "show":true + }, + { + "format":"short", + "label":null, + 
"logBase":1, + "max":null, + "min":null, + "show":false + } + ] + } + ], + "title":"Autoscaler Metrics", + "type":"row" }, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Revision", - "multi": false, - "name": "revision", - "options": [ - - ], - "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\"}, revision)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - + { + "collapsed":true, + "gridPos":{ + "h":1, + "w":24, + "x":0, + "y":3 + }, + "id":20, + "panels":[ + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":11, + "w":24, + "x":0, + "y":1 + }, + "id":22, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":false, + "targets":[ + { + "expr":"label_replace(sum(increase(activator_revision_request_count{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (destination_revision), \"destination_revision\", \"$2\", \"destination_revision\", \"$configuration(-+)(.*)\")", + "format":"time_series", + "interval":"", + "intervalFactor":1, + "legendFormat":"{{destination_revision}}", + "refId":"A" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Request Count in last minute by Revision", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"none", + "label":null, + 
"logBase":1, + "max":null, + "min":"0", + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + } + ] + }, + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":10, + "w":24, + "x":0, + "y":12 + }, + "id":24, + "legend":{ + "avg":false, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":false + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + "nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":false, + "targets":[ + { + "expr":"round(sum(increase(activator_revision_response_count{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (response_code))", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"{{ response_code }}", + "refId":"A" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Response Count in last minute by Response Code", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"none", + "label":null, + "logBase":1, + "max":null, + "min":"0", + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + } + ] + }, + { + "aliasColors":{ + + }, + "bars":false, + "dashLength":10, + "dashes":false, + "datasource":"prometheus", + "fill":1, + "gridPos":{ + "h":10, + "w":24, + "x":0, + "y":32 + }, + "id":28, + "legend":{ + "avg":true, + "current":false, + "max":false, + "min":false, + "show":true, + "total":false, + "values":true + }, + "lines":true, + "linewidth":1, + "links":[ + + ], + 
"nullPointMode":"null", + "percentage":false, + "pointradius":5, + "points":false, + "renderer":"flot", + "seriesOverrides":[ + + ], + "spaceLength":10, + "stack":false, + "steppedLine":false, + "targets":[ + { + "expr":"label_replace(histogram_quantile(0.50, sum(rate(activator_response_time_msec_bucket{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (destination_revision, le)), \"destination_revision\", \"$2\", \"destination_revision\", \"$configuration(-+)(.*)\")", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"{{ destination_revision }} (p50)", + "refId":"A" + }, + { + "expr":"label_replace(histogram_quantile(0.90, sum(rate(activator_response_time_msec_bucket{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (destination_revision, le)), \"destination_revision\", \"$2\", \"destination_revision\", \"$configuration(-+)(.*)\")", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"{{ destination_revision }} (p90)", + "refId":"B" + }, + { + "expr":"label_replace(histogram_quantile(0.95, sum(rate(activator_response_time_msec_bucket{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (destination_revision, le)), \"destination_revision\", \"$2\", \"destination_revision\", \"$configuration(-+)(.*)\")", + "format":"time_series", + "intervalFactor":1, + "legendFormat":"{{ destination_revision }} (p95)", + "refId":"C" + }, + { + "expr":"label_replace(histogram_quantile(0.99, sum(rate(activator_response_time_msec_bucket{destination_namespace=\"$namespace\", destination_configuration=~\"$configuration\",destination_revision=~\"$revision\"}[1m])) by (destination_revision, le)), \"destination_revision\", \"$2\", \"destination_revision\", \"$configuration(-+)(.*)\")", + "format":"time_series", + 
"intervalFactor":1, + "legendFormat":"{{ destination_revision }} (p99)", + "refId":"D" + } + ], + "thresholds":[ + + ], + "timeFrom":null, + "timeShift":null, + "title":"Response Time in last minute", + "tooltip":{ + "shared":true, + "sort":0, + "value_type":"individual" + }, + "type":"graph", + "xaxis":{ + "buckets":null, + "mode":"time", + "name":null, + "show":true, + "values":[ + + ] + }, + "yaxes":[ + { + "format":"ms", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + }, + { + "format":"short", + "label":null, + "logBase":1, + "max":null, + "min":null, + "show":true + } + ] + } + ], + "title":"Activator Metrics", + "type":"row" + } + ], + "refresh":false, + "schemaVersion":16, + "style":"dark", + "tags":[ + + ], + "templating":{ + "list":[ + { + "allValue":null, + "current":{ + + }, + "datasource":"prometheus", + "hide":0, + "includeAll":false, + "label":"Namespace", + "multi":false, + "name":"namespace", + "options":[ + + ], + "query":"label_values(autoscaler_actual_pod_count, configuration_namespace)", + "refresh":1, + "regex":"", + "sort":1, + "tagValuesQuery":"", + "tags":[ + + ], + "tagsQuery":"", + "type":"query", + "useTags":false + }, + { + "allValue":null, + "current":{ + + }, + "datasource":"prometheus", + "hide":0, + "includeAll":false, + "label":"Configuration", + "multi":false, + "name":"configuration", + "options":[ + + ], + "query":"label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\"}, configuration)", + "refresh":1, + "regex":"", + "sort":1, + "tagValuesQuery":"", + "tags":[ + + ], + "tagsQuery":"", + "type":"query", + "useTags":false + }, + { + "allValue":null, + "current":{ + + }, + "datasource":"prometheus", + "hide":0, + "includeAll":false, + "label":"Revision", + "multi":false, + "name":"revision", + "options":[ + + ], + "query":"label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\"}, revision)", + "refresh":1, + "regex":"", + 
"sort":2, + "tagValuesQuery":"", + "tags":[ + + ], + "tagsQuery":"", + "type":"query", + "useTags":false + } + ] + }, + "time":{ + "from":"now-15m", + "to":"now" + }, + "timepicker":{ + "refresh_intervals":[ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" ], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Knative Serving - Scaling Debugging", - "uid": "u_-9SIMiz", - "version": 1 - } \ No newline at end of file + "time_options":[ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone":"", + "title":"Knative Serving - Scaling Debugging", + "uid":"u_-9SIMiz", + "version":2 + } diff --git a/config/monitoring/200-common/300-prometheus/100-scrape-config.yaml b/config/monitoring/200-common/300-prometheus/100-scrape-config.yaml index ba2ee7f1e5ac..7034a38f15e7 100644 --- a/config/monitoring/200-common/300-prometheus/100-scrape-config.yaml +++ b/config/monitoring/200-common/300-prometheus/100-scrape-config.yaml @@ -56,6 +56,33 @@ data: regex: (.*) target_label: service replacement: $1 + # Activator pods + - job_name: activator + scrape_interval: 3s + scrape_timeout: 3s + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Scrape only the targets matching the following metadata + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_pod_label_app, __meta_kubernetes_pod_container_port_name] + action: keep + regex: knative-serving;activator;activator-port + # Rename metadata labels to be reader friendly + - source_labels: [__meta_kubernetes_namespace] + action: replace + regex: (.*) + target_label: namespace + replacement: $1 + - source_labels: 
[__meta_kubernetes_pod_name] + action: replace + regex: (.*) + target_label: pod + replacement: $1 + - source_labels: [__meta_kubernetes_service_name] + action: replace + regex: (.*) + target_label: service + replacement: $1 # Fluentd daemonset - job_name: fluentd-ds kubernetes_sd_configs: diff --git a/pkg/activator/activator.go b/pkg/activator/activator.go index b02fca1e9c44..2ff3d28e23de 100644 --- a/pkg/activator/activator.go +++ b/pkg/activator/activator.go @@ -26,13 +26,14 @@ type Status int // Activator provides an active endpoint for a revision or an error and // status code indicating why it could not. type Activator interface { - ActiveEndpoint(namespace, name string) (Endpoint, Status, error) + ActiveEndpoint(namespace, configuration, name string) (Endpoint, Status, error) Shutdown() } type revisionID struct { - namespace string - name string + namespace string + configuration string + name string } // Endpoint is a fully-qualified domain name / port pair for an active revision. diff --git a/pkg/activator/dedupe.go b/pkg/activator/dedupe.go index 66f0e2ccd858..724940bde6ae 100644 --- a/pkg/activator/dedupe.go +++ b/pkg/activator/dedupe.go @@ -18,6 +18,10 @@ package activator import ( "fmt" "sync" + + clientset "github.com/knative/serving/pkg/client/clientset/versioned" + "go.uber.org/zap" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var shuttingDownError = activationResult{ @@ -39,19 +43,27 @@ type dedupingActivator struct { pendingRequests map[revisionID][]chan activationResult activator Activator shutdown bool + knaClient clientset.Interface + logger *zap.SugaredLogger + reporter StatsReporter } // NewDedupingActivator creates an Activator that deduplicates // activations requests for the same revision id and namespace. 
-func NewDedupingActivator(a Activator) Activator { +func NewDedupingActivator(a Activator, knaClient clientset.Interface, logger *zap.SugaredLogger, r StatsReporter) Activator { return &dedupingActivator{ pendingRequests: make(map[revisionID][]chan activationResult), activator: a, + knaClient: knaClient, + logger: logger, + reporter: r, } } -func (a *dedupingActivator) ActiveEndpoint(namespace, name string) (Endpoint, Status, error) { - id := revisionID{namespace: namespace, name: name} +func (a *dedupingActivator) ActiveEndpoint(namespace, configuration, name string) (Endpoint, Status, error) { + id := revisionID{namespace: namespace, + configuration: configuration, + name: name} ch := make(chan activationResult, 1) a.dedupe(id, ch) result := <-ch @@ -86,7 +98,17 @@ func (a *dedupingActivator) dedupe(id revisionID, ch chan activationResult) { } func (a *dedupingActivator) activate(id revisionID) { - endpoint, status, err := a.activator.ActiveEndpoint(id.namespace, id.name) + logger := loggerWithRevisionInfo(a.logger, id.namespace, id.name) + revisionClient := a.knaClient.ServingV1alpha1().Revisions(id.namespace) + revision, err := revisionClient.Get(id.name, metav1.GetOptions{}) + // default serving state is unknown + state := "Unknown" + if err != nil { + logger.Errorf("Failed to get revision %s for namespace: %s", id.name, id.namespace) + } + state = string(revision.Spec.ServingState) + + endpoint, status, err := a.activator.ActiveEndpoint(id.namespace, id.configuration, id.name) a.mux.Lock() defer a.mux.Unlock() result := activationResult{ @@ -95,6 +117,7 @@ func (a *dedupingActivator) activate(id revisionID) { err: err, } if reqs, ok := a.pendingRequests[id]; ok { + a.reporter.ReportRequest(id.namespace, id.configuration, id.name, state, float64(len(reqs))) delete(a.pendingRequests, id) for _, ch := range reqs { ch <- result diff --git a/pkg/activator/dedupe_test.go b/pkg/activator/dedupe_test.go index 2760637b7c16..28b79c071345 100644 --- 
a/pkg/activator/dedupe_test.go +++ b/pkg/activator/dedupe_test.go @@ -22,21 +22,28 @@ import ( "sync" "testing" "time" + + "github.com/knative/serving/pkg/apis/serving/v1alpha1" + . "github.com/knative/serving/pkg/logging/testing" ) func TestSingleRevision_SingleRequest_Success(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder(). + withServingState(v1alpha1.RevisionServingStateReserve).build()) want := Endpoint{"ip", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, testRevision}: activationResult{ endpoint: want, status: Status(0), err: nil, }, }) - d := NewDedupingActivator(Activator(f)) + d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) - endpoint, status, err := d.ActiveEndpoint("default", "rev1") + endpoint, status, err := d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) if err != nil { t.Errorf("Unexpected error: %v", err) @@ -53,20 +60,24 @@ func TestSingleRevision_SingleRequest_Success(t *testing.T) { } func TestSingleRevision_MultipleRequests_Success(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder(). 
+ withServingState(v1alpha1.RevisionServingStateReserve).build()) ep := Endpoint{"ip", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, testRevision}: activationResult{ endpoint: ep, status: Status(0), err: nil, }, }) - d := NewDedupingActivator(f) + d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) got := concurrentTest(d, f, []revisionID{ - revisionID{"default", "rev1"}, - revisionID{"default", "rev1"}, + revisionID{testNamespace, testConfiguration, testRevision}, + revisionID{testNamespace, testConfiguration, testRevision}, }) want := []activationResult{ @@ -82,28 +93,33 @@ func TestSingleRevision_MultipleRequests_Success(t *testing.T) { } func TestMultipleRevisions_MultipleRequests_Success(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder().withRevisionName("rev1").build()) + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder().withRevisionName("rev2").build()) ep1 := Endpoint{"ip1", 8080} ep2 := Endpoint{"ip2", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, "rev1"}: activationResult{ endpoint: ep1, status: Status(0), err: nil, }, - revisionID{"default", "rev2"}: activationResult{ + revisionID{testNamespace, testConfiguration, "rev2"}: activationResult{ endpoint: ep2, status: Status(0), err: nil, }, }) - d := NewDedupingActivator(f) + d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) got := concurrentTest(d, f, []revisionID{ - revisionID{"default", "rev1"}, - revisionID{"default", "rev2"}, - revisionID{"default", "rev1"}, - revisionID{"default", "rev2"}, + revisionID{testNamespace, testConfiguration, "rev1"}, + revisionID{testNamespace, testConfiguration, "rev2"}, + revisionID{testNamespace, testConfiguration, 
"rev1"}, + revisionID{testNamespace, testConfiguration, "rev2"}, }) want := []activationResult{ @@ -116,34 +132,39 @@ func TestMultipleRevisions_MultipleRequests_Success(t *testing.T) { t.Errorf("Unexpected results. \nWant %+v. \nGot %+v", want, got) } if len(f.record) != 2 { - t.Errorf("Unexpected number of activation requests. Want 2. Got %v.", len(f.record)) + t.Errorf("Unexpected number of activation requests. Want 2. Got %v. %v", len(f.record), f.record) } } func TestMultipleRevisions_MultipleRequests_PartialSuccess(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder().withRevisionName("rev1").build()) + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder().withRevisionName("rev2").build()) ep1 := Endpoint{"ip1", 8080} status2 := Status(http.StatusInternalServerError) error2 := fmt.Errorf("test error") f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, "rev1"}: activationResult{ endpoint: ep1, status: Status(0), err: nil, }, - revisionID{"default", "rev2"}: activationResult{ + revisionID{testNamespace, testConfiguration, "rev2"}: activationResult{ endpoint: Endpoint{}, status: status2, err: error2, }, }) - d := NewDedupingActivator(f) + d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) got := concurrentTest(d, f, []revisionID{ - revisionID{"default", "rev1"}, - revisionID{"default", "rev2"}, - revisionID{"default", "rev1"}, - revisionID{"default", "rev2"}, + revisionID{testNamespace, testConfiguration, "rev1"}, + revisionID{testNamespace, testConfiguration, "rev2"}, + revisionID{testNamespace, testConfiguration, "rev1"}, + revisionID{testNamespace, testConfiguration, "rev2"}, }) want := []activationResult{ @@ -161,21 +182,25 @@ func TestMultipleRevisions_MultipleRequests_PartialSuccess(t *testing.T) { } func 
TestSingleRevision_MultipleRequests_FailureRecovery(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder(). + withServingState(v1alpha1.RevisionServingStateReserve).build()) failEp := Endpoint{} failStatus := Status(503) failErr := fmt.Errorf("test error") f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, testRevision}: activationResult{ endpoint: failEp, status: failStatus, err: failErr, }, }) - d := NewDedupingActivator(Activator(f)) + d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) // Activation initially fails - endpoint, status, err := d.ActiveEndpoint("default", "rev1") + endpoint, status, err := d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) if err != failErr { t.Errorf("Unexpected error. Want %v. Got %v.", failErr, err) @@ -193,13 +218,13 @@ func TestSingleRevision_MultipleRequests_FailureRecovery(t *testing.T) { // Later activation succeeds successEp := Endpoint{"ip", 8080} successStatus := Status(0) - f.responses[revisionID{"default", "rev1"}] = activationResult{ + f.responses[revisionID{testNamespace, testConfiguration, testRevision}] = activationResult{ endpoint: successEp, status: successStatus, err: nil, } - endpoint, status, err = d.ActiveEndpoint("default", "rev1") + endpoint, status, err = d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) if err != nil { t.Errorf("Unexpected error. Want %v. Got %v.", nil, err) @@ -216,23 +241,27 @@ func TestSingleRevision_MultipleRequests_FailureRecovery(t *testing.T) { } func TestShutdown_ReturnError(t *testing.T) { + _, kna := fakeClients() + kna.ServingV1alpha1().Revisions(testNamespace).Create( + newRevisionBuilder(). 
+ withServingState(v1alpha1.RevisionServingStateReserve).build()) ep := Endpoint{"ip", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ - revisionID{"default", "rev1"}: activationResult{ + revisionID{testNamespace, testConfiguration, testRevision}: activationResult{ endpoint: ep, status: Status(0), err: nil, }, }) - d := NewDedupingActivator(Activator(f)) - f.hold(revisionID{"default", "rev1"}) + d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) + f.hold(revisionID{testNamespace, testConfiguration, testRevision}) go func() { time.Sleep(100 * time.Millisecond) d.Shutdown() }() - endpoint, status, err := d.ActiveEndpoint("default", "rev1") + endpoint, status, err := d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) want := Endpoint{} if endpoint != want { @@ -264,8 +293,8 @@ func newFakeActivator(t *testing.T, responses map[revisionID]activationResult) * } } -func (f *fakeActivator) ActiveEndpoint(namespace, name string) (Endpoint, Status, error) { - id := revisionID{namespace, name} +func (f *fakeActivator) ActiveEndpoint(namespace, configuration, name string) (Endpoint, Status, error) { + id := revisionID{namespace, configuration, name} f.recordMutex.Lock() f.record = append(f.record, id) @@ -314,7 +343,7 @@ func concurrentTest(a Activator, f *fakeActivator, ids []revisionID) []activatio end.Add(1) go func(index int, id revisionID) { start.Done() - endpoint, status, err := a.ActiveEndpoint(id.namespace, id.name) + endpoint, status, err := a.ActiveEndpoint(id.namespace, id.configuration, id.name) results[index] = activationResult{endpoint, status, err} end.Done() }(i, id) diff --git a/pkg/activator/revision.go b/pkg/activator/revision.go index c08c829e1ba9..492b9cc4191b 100644 --- a/pkg/activator/revision.go +++ b/pkg/activator/revision.go @@ -36,17 +36,19 @@ type revisionActivator struct { kubeClient kubernetes.Interface knaClient clientset.Interface logger *zap.SugaredLogger + reporter StatsReporter } // 
NewRevisionActivator creates an Activator that changes revision // serving status to active if necessary, then returns the endpoint // once the revision is ready to serve traffic. -func NewRevisionActivator(kubeClient kubernetes.Interface, servingClient clientset.Interface, logger *zap.SugaredLogger) Activator { +func NewRevisionActivator(kubeClient kubernetes.Interface, servingClient clientset.Interface, logger *zap.SugaredLogger, reporter StatsReporter) Activator { return &revisionActivator{ readyTimout: 60 * time.Second, kubeClient: kubeClient, knaClient: servingClient, logger: logger, + reporter: reporter, } } @@ -54,12 +56,14 @@ func (r *revisionActivator) Shutdown() { // nothing to do } -func (r *revisionActivator) ActiveEndpoint(namespace, name string) (end Endpoint, status Status, activationError error) { +func (r *revisionActivator) ActiveEndpoint(namespace, configuration, name string) (end Endpoint, status Status, activationError error) { logger := loggerWithRevisionInfo(r.logger, namespace, name) - rev := revisionID{namespace: namespace, name: name} + rev := revisionID{namespace: namespace, + configuration: configuration, + name: name} internalError := func(msg string, args ...interface{}) (Endpoint, Status, error) { - logger.Infof(msg, args...) + logger.Errorf(msg, args...) return Endpoint{}, http.StatusInternalServerError, fmt.Errorf(fmt.Sprintf("%s for namespace: %s, revision name: %s ", msg, namespace, name), args...) 
} @@ -69,10 +73,13 @@ func (r *revisionActivator) ActiveEndpoint(namespace, name string) (end Endpoint if err != nil { return internalError("Unable to get revision: %v", err) } + switch revision.Spec.ServingState { default: + r.reporter.ReportRequest(namespace, configuration, name, "Unknown", 1.0) return internalError("Disregarding activation request for revision in unknown state %v", revision.Spec.ServingState) case v1alpha1.RevisionServingStateRetired: + r.reporter.ReportRequest(namespace, configuration, name, string(v1alpha1.RevisionServingStateRetired), 1.0) return internalError("Disregarding activation request for retired revision ") case v1alpha1.RevisionServingStateActive: // Revision is already active. Nothing to do diff --git a/pkg/activator/revision_test.go b/pkg/activator/revision_test.go index 07c93d0183de..c7771df6548b 100644 --- a/pkg/activator/revision_test.go +++ b/pkg/activator/revision_test.go @@ -31,19 +31,34 @@ import ( ) const ( - testNamespace = "test-namespace" - testRevision = "test-rev" - testService = testRevision + "-service" - testServiceFQDN = testService + "." + testNamespace + ".svc.cluster.local" + testNamespace = "test-namespace" + testConfiguration = "test-configuration" + testRevision = "test-rev" + testService = testRevision + "-service" + testServiceFQDN = testService + "." 
+ testNamespace + ".svc.cluster.local" ) +type mockReporter struct{} + +func (r *mockReporter) ReportRequest(ns, config, rev, servingState string, v float64) error { + return nil +} + +func (r *mockReporter) ReportResponseCount(ns, config, rev string, responseCode, numTries int, v float64) error { + return nil +} + +func (r *mockReporter) ReportResponseTime(ns, config, rev string, responseCode int, d time.Duration) error { + return nil +} + func TestActiveEndpoint_Active_StaysActive(t *testing.T) { k8s, kna := fakeClients() kna.ServingV1alpha1().Revisions(testNamespace).Create(newRevisionBuilder().build()) k8s.CoreV1().Services(testNamespace).Create(newServiceBuilder().build()) - a := NewRevisionActivator(k8s, kna, TestLogger(t)) + a := NewRevisionActivator(k8s, kna, TestLogger(t), &mockReporter{}) - got, status, err := a.ActiveEndpoint(testNamespace, testRevision) + got, status, err := a.ActiveEndpoint(testNamespace, testConfiguration, testRevision) want := Endpoint{testServiceFQDN, 8080} if got != want { @@ -64,9 +79,9 @@ func TestActiveEndpoint_Reserve_BecomesActive(t *testing.T) { withServingState(v1alpha1.RevisionServingStateReserve). build()) k8s.CoreV1().Services(testNamespace).Create(newServiceBuilder().build()) - a := NewRevisionActivator(k8s, kna, TestLogger(t)) + a := NewRevisionActivator(k8s, kna, TestLogger(t), &mockReporter{}) - got, status, err := a.ActiveEndpoint(testNamespace, testRevision) + got, status, err := a.ActiveEndpoint(testNamespace, testConfiguration, testRevision) want := Endpoint{testServiceFQDN, 8080} if got != want { @@ -92,9 +107,9 @@ func TestActiveEndpoint_Retired_StaysRetiredWithError(t *testing.T) { withServingState(v1alpha1.RevisionServingStateRetired). 
build()) k8s.CoreV1().Services(testNamespace).Create(newServiceBuilder().build()) - a := NewRevisionActivator(k8s, kna, TestLogger(t)) + a := NewRevisionActivator(k8s, kna, TestLogger(t), &mockReporter{}) - got, status, err := a.ActiveEndpoint(testNamespace, testRevision) + got, status, err := a.ActiveEndpoint(testNamespace, testConfiguration, testRevision) want := Endpoint{} if got != want { @@ -121,11 +136,11 @@ func TestActiveEndpoint_Reserve_WaitsForReady(t *testing.T) { withReady(false). build()) k8s.CoreV1().Services(testNamespace).Create(newServiceBuilder().build()) - a := NewRevisionActivator(k8s, kna, TestLogger(t)) + a := NewRevisionActivator(k8s, kna, TestLogger(t), &mockReporter{}) ch := make(chan activationResult) go func() { - endpoint, status, err := a.ActiveEndpoint(testNamespace, testRevision) + endpoint, status, err := a.ActiveEndpoint(testNamespace, testConfiguration, testRevision) ch <- activationResult{endpoint, status, err} }() @@ -167,12 +182,12 @@ func TestActiveEndpoint_Reserve_ReadyTimeoutWithError(t *testing.T) { withReady(false). 
build()) k8s.CoreV1().Services(testNamespace).Create(newServiceBuilder().build()) - a := NewRevisionActivator(k8s, kna, TestLogger(t)) + a := NewRevisionActivator(k8s, kna, TestLogger(t), &mockReporter{}) a.(*revisionActivator).readyTimout = 200 * time.Millisecond ch := make(chan activationResult) go func() { - endpoint, status, err := a.ActiveEndpoint(testNamespace, testRevision) + endpoint, status, err := a.ActiveEndpoint(testNamespace, testConfiguration, testRevision) ch <- activationResult{endpoint, status, err} }() @@ -243,6 +258,11 @@ func (b *revisionBuilder) build() *v1alpha1.Revision { return b.revision } +func (b *revisionBuilder) withRevisionName(name string) *revisionBuilder { + b.revision.ObjectMeta.Name = name + return b +} + func (b *revisionBuilder) withServingState(servingState v1alpha1.RevisionServingStateType) *revisionBuilder { b.revision.Spec.ServingState = servingState return b diff --git a/pkg/activator/stats_reporter.go b/pkg/activator/stats_reporter.go new file mode 100644 index 000000000000..6b0b7a58d971 --- /dev/null +++ b/pkg/activator/stats_reporter.go @@ -0,0 +1,200 @@ +/* +Copyright 2018 Google Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package activator + +import ( + "context" + "errors" + "strconv" + "time" + + "go.opencensus.io/stats" + "go.opencensus.io/stats/view" + "go.opencensus.io/tag" +) + +// Measurement represents the type of the activator metric to be reported +type Measurement int + +const ( + // RequestCountM is the number of requests that are routed to the activator + RequestCountM Measurement = iota + + // ResponseCountM is the response count when the activator proxies the request + ResponseCountM + + // ResponseTimeInMsecM is the response time in milliseconds + ResponseTimeInMsecM +) + +var ( + measurements = []*stats.Float64Measure{ + RequestCountM: stats.Float64( + "revision_request_count", + "The number of requests that are routed to the activator", + stats.UnitNone), + ResponseCountM: stats.Float64( + "revision_response_count", + "The response count when activator proxy the request", + stats.UnitNone), + ResponseTimeInMsecM: stats.Float64( + "response_time_msec", + "The response time in millisecond", + stats.UnitNone), + } +) + +// StatsReporter defines the interface for sending activator metrics +type StatsReporter interface { + ReportRequest(ns, config, rev, servingState string, v float64) error + ReportResponseCount(ns, config, rev string, responseCode, numTries int, v float64) error + ReportResponseTime(ns, config, rev string, responseCode int, d time.Duration) error +} + +// Reporter holds cached metric objects to report activator metrics +type Reporter struct { + initialized bool + namespaceTagKey tag.Key + configTagKey tag.Key + revisionTagKey tag.Key + servingStateKey tag.Key + responseCodeKey tag.Key + numTriesKey tag.Key +} + +// NewStatsReporter creates a reporter that collects and reports activator metrics +func NewStatsReporter() (*Reporter, error) { + + var r = &Reporter{} + + // Create the tag keys that will be used to add tags to our measurements. 
+ nsTag, err := tag.NewKey("destination_namespace") + if err != nil { + return nil, err + } + r.namespaceTagKey = nsTag + configTag, err := tag.NewKey("destination_configuration") + if err != nil { + return nil, err + } + r.configTagKey = configTag + revTag, err := tag.NewKey("destination_revision") + if err != nil { + return nil, err + } + r.revisionTagKey = revTag + servingStateTag, err := tag.NewKey("serving_state") + if err != nil { + return nil, err + } + r.servingStateKey = servingStateTag + responseCodeTag, err := tag.NewKey("response_code") + if err != nil { + return nil, err + } + r.responseCodeKey = responseCodeTag + numTriesTag, err := tag.NewKey("num_tries") + if err != nil { + return nil, err + } + r.numTriesKey = numTriesTag + // Create view to see our measurements. + err = view.Register( + &view.View{ + Description: "The number of requests that are routed to the activator", + Measure: measurements[RequestCountM], + Aggregation: view.Sum(), + TagKeys: []tag.Key{r.namespaceTagKey, r.configTagKey, r.revisionTagKey, r.servingStateKey}, + }, + &view.View{ + Description: "The response count when activator proxy the request", + Measure: measurements[ResponseCountM], + Aggregation: view.Sum(), + TagKeys: []tag.Key{r.namespaceTagKey, r.configTagKey, r.revisionTagKey, r.responseCodeKey, r.numTriesKey}, + }, + &view.View{ + Description: "The response time in millisecond", + Measure: measurements[ResponseTimeInMsecM], + Aggregation: view.Distribution(1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000), + TagKeys: []tag.Key{r.namespaceTagKey, r.configTagKey, r.revisionTagKey, r.responseCodeKey}, + }, + ) + if err != nil { + return nil, err + } + + r.initialized = true + return r, nil +} + +// ReportRequest captures value v for the RequestCountM measurement. 
+func (r *Reporter) ReportRequest(ns, config, rev, servingState string, v float64) error { + if !r.initialized { + return errors.New("StatsReporter is not initialized yet") + } + + ctx, err := tag.New( + context.Background(), + tag.Insert(r.namespaceTagKey, ns), + tag.Insert(r.configTagKey, config), + tag.Insert(r.revisionTagKey, rev), + tag.Insert(r.servingStateKey, servingState)) + if err != nil { + return err + } + + stats.Record(ctx, measurements[RequestCountM].M(v)) + return nil +} + +// ReportResponseCount captures ResponseCountM metric with value v. +func (r *Reporter) ReportResponseCount(ns, config, rev string, responseCode, numTries int, v float64) error { + if !r.initialized { + return errors.New("StatsReporter is not initialized yet") + } + + ctx, err := tag.New( + context.Background(), + tag.Insert(r.namespaceTagKey, ns), + tag.Insert(r.configTagKey, config), + tag.Insert(r.revisionTagKey, rev), + tag.Insert(r.responseCodeKey, strconv.Itoa(responseCode)), + tag.Insert(r.numTriesKey, strconv.Itoa(numTries))) + if err != nil { + return err + } + + stats.Record(ctx, measurements[ResponseCountM].M(v)) + return nil +} + +func (r *Reporter) ReportResponseTime(ns, config, rev string, responseCode int, d time.Duration) error { + if !r.initialized { + return errors.New("StatsReporter is not initialized yet") + } + + ctx, err := tag.New( + context.Background(), + tag.Insert(r.namespaceTagKey, ns), + tag.Insert(r.configTagKey, config), + tag.Insert(r.revisionTagKey, rev), + tag.Insert(r.responseCodeKey, strconv.Itoa(responseCode))) + if err != nil { + return err + } + + // convert time.Duration in nanoseconds to milliseconds + stats.Record(ctx, measurements[ResponseTimeInMsecM].M(float64(d/time.Millisecond))) + return nil +} diff --git a/pkg/activator/stats_reporter_test.go b/pkg/activator/stats_reporter_test.go new file mode 100644 index 000000000000..fc9db9528839 --- /dev/null +++ b/pkg/activator/stats_reporter_test.go @@ -0,0 +1,141 @@ +/* +Copyright 2018 
Google Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package activator + +import ( + "testing" + "time" + + "go.opencensus.io/stats/view" +) + +func TestActivatorReporter(t *testing.T) { + r := &Reporter{} + + if err := r.ReportRequest("testns", "testconfig", "testrev", "Reserved", 1); err == nil { + t.Error("Reporter expected an error for Report call before init. Got success.") + } + if err := r.ReportResponseCount("testns", "testconfig", "testrev", 200, 1, 1); err == nil { + t.Error("Reporter expected an error for Report call before init. 
Got success.") + } + + var err error + if r, err = NewStatsReporter(); err != nil { + t.Error("Failed to create a new reporter.") + } + + // test ReportRequest + wantTags1 := map[string]string{ + "destination_namespace": "testns", + "destination_configuration": "testconfig", + "destination_revision": "testrev", + "serving_state": "Reserved", + } + expectSuccess(t, func() error { return r.ReportRequest("testns", "testconfig", "testrev", "Reserved", 1) }) + expectSuccess(t, func() error { return r.ReportRequest("testns", "testconfig", "testrev", "Reserved", 2.0) }) + checkSumData(t, "revision_request_count", wantTags1, 3) + + // test ReportResponseCount + wantTags2 := map[string]string{ + "destination_namespace": "testns", + "destination_configuration": "testconfig", + "destination_revision": "testrev", + "response_code": "200", + "num_tries": "6", + } + expectSuccess(t, func() error { return r.ReportResponseCount("testns", "testconfig", "testrev", 200, 6, 1) }) + expectSuccess(t, func() error { return r.ReportResponseCount("testns", "testconfig", "testrev", 200, 6, 3) }) + checkSumData(t, "revision_response_count", wantTags2, 4) + + // test ReportResponseTime + wantTags3 := map[string]string{ + "destination_namespace": "testns", + "destination_configuration": "testconfig", + "destination_revision": "testrev", + "response_code": "200", + } + expectSuccess(t, func() error { + return r.ReportResponseTime("testns", "testconfig", "testrev", 200, 1100*time.Millisecond) + }) + expectSuccess(t, func() error { + return r.ReportResponseTime("testns", "testconfig", "testrev", 200, 9100*time.Millisecond) + }) + checkDistributionData(t, "response_time_msec", wantTags3, 2, 1100, 9100) +} + +func expectSuccess(t *testing.T, f func() error) { + if err := f(); err != nil { + t.Errorf("Reporter expected success but got error %v", err) + } +} + +func checkSumData(t *testing.T, name string, wantTags map[string]string, wantValue int) { + if d, err := view.RetrieveData(name); err != nil 
{ + t.Errorf("Reporter error = %v, wantErr %v", err, false) + } else { + if len(d) != 1 { + t.Errorf("Reporter len(d) %v, want %v", len(d), 1) + } + for _, got := range d[0].Tags { + if want, ok := wantTags[got.Key.Name()]; !ok { + t.Errorf("Reporter got an extra tag %v: %v", got.Key.Name(), got.Value) + } else { + if got.Value != want { + t.Errorf("Reporter expected a different tag value. key:%v, got: %v, want: %v", got.Key.Name(), got.Value, want) + } + } + } + + if s, ok := d[0].Data.(*view.SumData); !ok { + t.Error("Reporter expected a SumData type") + } else { + if s.Value != (float64)(wantValue) { + t.Errorf("Reporter expected %v got %v. metric: %v", (int64)(wantValue), s.Value, name) + } + } + } +} + +func checkDistributionData(t *testing.T, name string, wantTags map[string]string, expectedCount int, expectedMin float64, expectedMax float64) { + if d, err := view.RetrieveData(name); err != nil { + t.Errorf("Reporter error = %v, wantErr %v", err, false) + } else { + if len(d) != 1 { + t.Errorf("Reporter len(d) %v, want %v", len(d), 1) + } + for _, got := range d[0].Tags { + if want, ok := wantTags[got.Key.Name()]; !ok { + t.Errorf("Reporter got an extra tag %v: %v", got.Key.Name(), got.Value) + } else { + if got.Value != want { + t.Errorf("Reporter expected a different tag value. key:%v, got: %v, want: %v", got.Key.Name(), got.Value, want) + } + } + } + + if s, ok := d[0].Data.(*view.DistributionData); !ok { + t.Error("Reporter expected a DistributionData type") + } else { + if s.Count != int64(expectedCount) { + t.Errorf("Reporter expected count %v got %v. metric: %v", (int64)(expectedCount), s.Count, name) + } + if s.Min != float64(expectedMin) { + t.Errorf("Reporter expected min %v got %v. metric: %v", expectedMin, s.Min, name) + } + if s.Max != float64(expectedMax) { + t.Errorf("Reporter expected max %v got %v. 
metric: %v", expectedMax, s.Max, name) + } + } + } +} diff --git a/pkg/controller/names.go b/pkg/controller/names.go index 646e8afd2996..22fd27d1b14b 100644 --- a/pkg/controller/names.go +++ b/pkg/controller/names.go @@ -32,6 +32,10 @@ func GetRevisionHeaderName() string { return "knative-serving-revision" } +func GetConfigurationHeader() string { + return "knative-serving-configuration" +} + func GetRevisionHeaderNamespace() string { return "knative-serving-namespace" } diff --git a/pkg/controller/route/resources/virtual_service.go b/pkg/controller/route/resources/virtual_service.go index 937027b216dc..31587f9d4150 100644 --- a/pkg/controller/route/resources/virtual_service.go +++ b/pkg/controller/route/resources/virtual_service.go @@ -194,6 +194,7 @@ func addActivatorRoutes(r *v1alpha3.HTTPRoute, ns string, inactive []traffic.Rev Weight: totalInactivePercent, }) r.AppendHeaders[controller.GetRevisionHeaderName()] = maxInactiveTarget.RevisionName + r.AppendHeaders[controller.GetConfigurationHeader()] = maxInactiveTarget.ConfigurationName r.AppendHeaders[controller.GetRevisionHeaderNamespace()] = ns return r } diff --git a/pkg/controller/route/resources/virtual_service_test.go b/pkg/controller/route/resources/virtual_service_test.go index 5cc0f5651233..39f1fdd4935b 100644 --- a/pkg/controller/route/resources/virtual_service_test.go +++ b/pkg/controller/route/resources/virtual_service_test.go @@ -337,9 +337,10 @@ func TestMakeVirtualServiceRoute_VanillaScaledToZero(t *testing.T) { Weight: 100, }}, AppendHeaders: map[string]string{ - "knative-serving-revision": "revision", - "knative-serving-namespace": "test-ns", - IstioTimeoutHackHeaderKey: IstioTimeoutHackHeaderValue, + "knative-serving-revision": "revision", + "knative-serving-configuration": "config", + "knative-serving-namespace": "test-ns", + IstioTimeoutHackHeaderKey: IstioTimeoutHackHeaderValue, }, Timeout: DefaultRouteTimeout, } @@ -380,9 +381,10 @@ func TestMakeVirtualServiceRoute_TwoInactiveTargets(t 
*testing.T) { Weight: 100, }}, AppendHeaders: map[string]string{ - "knative-serving-revision": "revision", - "knative-serving-namespace": "test-ns", - IstioTimeoutHackHeaderKey: IstioTimeoutHackHeaderValue, + "knative-serving-revision": "revision", + "knative-serving-configuration": "config", + "knative-serving-namespace": "test-ns", + IstioTimeoutHackHeaderKey: IstioTimeoutHackHeaderValue, }, Timeout: DefaultRouteTimeout, } diff --git a/pkg/controller/route/route_test.go b/pkg/controller/route/route_test.go index 0079e3b44890..a648ad271bd1 100644 --- a/pkg/controller/route/route_test.go +++ b/pkg/controller/route/route_test.go @@ -259,8 +259,9 @@ func TestCreateRouteForOneReserveRevision(t *testing.T) { // A route targeting the revision route := getTestRouteWithTrafficTargets( []v1alpha1.TrafficTarget{{ - RevisionName: "test-rev", - Percent: 100, + RevisionName: "test-rev", + ConfigurationName: "test-config", + Percent: 100, }}, ) servingClient.ServingV1alpha1().Routes(testNamespace).Create(route) @@ -316,6 +317,7 @@ func TestCreateRouteForOneReserveRevision(t *testing.T) { Route: []v1alpha3.DestinationWeight{getActivatorDestinationWeight(100)}, AppendHeaders: map[string]string{ ctrl.GetRevisionHeaderName(): "test-rev", + ctrl.GetConfigurationHeader(): "test-config", ctrl.GetRevisionHeaderNamespace(): testNamespace, resources.IstioTimeoutHackHeaderKey: resources.IstioTimeoutHackHeaderValue, }, @@ -448,8 +450,9 @@ func TestCreateRouteWithOneTargetReserve(t *testing.T) { ConfigurationName: config.Name, Percent: 90, }, { - RevisionName: rev.Name, - Percent: 10, + RevisionName: rev.Name, + ConfigurationName: "test-config", + Percent: 10, }}, ) servingClient.ServingV1alpha1().Routes(testNamespace).Create(route) @@ -494,6 +497,7 @@ func TestCreateRouteWithOneTargetReserve(t *testing.T) { }, getActivatorDestinationWeight(10)}, AppendHeaders: map[string]string{ ctrl.GetRevisionHeaderName(): "test-rev", + ctrl.GetConfigurationHeader(): "test-config", 
ctrl.GetRevisionHeaderNamespace(): testNamespace, resources.IstioTimeoutHackHeaderKey: resources.IstioTimeoutHackHeaderValue, }, From 0c9eff0a310fd44df4997e777cb2512b115aca66 Mon Sep 17 00:00:00 2001 From: Yao Wu Date: Wed, 1 Aug 2018 11:18:25 -0700 Subject: [PATCH 2/2] Simplify request count metric serving state dimension --- cmd/activator/main.go | 2 +- pkg/activator/dedupe.go | 23 +---------------------- pkg/activator/dedupe_test.go | 21 ++++++--------------- pkg/activator/revision.go | 3 +-- 4 files changed, 9 insertions(+), 40 deletions(-) diff --git a/cmd/activator/main.go b/cmd/activator/main.go index 405993ec5070..1c13c1f67060 100644 --- a/cmd/activator/main.go +++ b/cmd/activator/main.go @@ -207,7 +207,7 @@ func main() { } a := activator.NewRevisionActivator(kubeClient, servingClient, logger, reporter) - a = activator.NewDedupingActivator(a, servingClient, logger, reporter) + a = activator.NewDedupingActivator(a) ah := &activationHandler{a, logger, reporter} // set up signals so we handle the first shutdown signal gracefully diff --git a/pkg/activator/dedupe.go b/pkg/activator/dedupe.go index 724940bde6ae..3ce2ebc1daa4 100644 --- a/pkg/activator/dedupe.go +++ b/pkg/activator/dedupe.go @@ -18,10 +18,6 @@ package activator import ( "fmt" "sync" - - clientset "github.com/knative/serving/pkg/client/clientset/versioned" - "go.uber.org/zap" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var shuttingDownError = activationResult{ @@ -43,20 +39,14 @@ type dedupingActivator struct { pendingRequests map[revisionID][]chan activationResult activator Activator shutdown bool - knaClient clientset.Interface - logger *zap.SugaredLogger - reporter StatsReporter } // NewDedupingActivator creates an Activator that deduplicates // activations requests for the same revision id and namespace. 
-func NewDedupingActivator(a Activator, knaClient clientset.Interface, logger *zap.SugaredLogger, r StatsReporter) Activator { +func NewDedupingActivator(a Activator) Activator { return &dedupingActivator{ pendingRequests: make(map[revisionID][]chan activationResult), activator: a, - knaClient: knaClient, - logger: logger, - reporter: r, } } @@ -98,16 +88,6 @@ func (a *dedupingActivator) dedupe(id revisionID, ch chan activationResult) { } func (a *dedupingActivator) activate(id revisionID) { - logger := loggerWithRevisionInfo(a.logger, id.namespace, id.name) - revisionClient := a.knaClient.ServingV1alpha1().Revisions(id.namespace) - revision, err := revisionClient.Get(id.name, metav1.GetOptions{}) - // default serving state is unknown - state := "Unknown" - if err != nil { - logger.Errorf("Failed to get revision %s for namespace: %s", id.name, id.namespace) - } - state = string(revision.Spec.ServingState) - endpoint, status, err := a.activator.ActiveEndpoint(id.namespace, id.configuration, id.name) a.mux.Lock() defer a.mux.Unlock() @@ -117,7 +97,6 @@ func (a *dedupingActivator) activate(id revisionID) { err: err, } if reqs, ok := a.pendingRequests[id]; ok { - a.reporter.ReportRequest(id.namespace, id.configuration, id.name, state, float64(len(reqs))) delete(a.pendingRequests, id) for _, ch := range reqs { ch <- result diff --git a/pkg/activator/dedupe_test.go b/pkg/activator/dedupe_test.go index 695e1678eb70..8b34534a9886 100644 --- a/pkg/activator/dedupe_test.go +++ b/pkg/activator/dedupe_test.go @@ -23,15 +23,10 @@ import ( "testing" "time" - . "github.com/knative/pkg/logging/testing" "github.com/knative/serving/pkg/apis/serving/v1alpha1" ) func TestSingleRevision_SingleRequest_Success(t *testing.T) { - _, kna := fakeClients() - kna.ServingV1alpha1().Revisions(testNamespace).Create( - newRevisionBuilder(). 
- withServingState(v1alpha1.RevisionServingStateReserve).build()) want := Endpoint{"ip", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ @@ -41,7 +36,7 @@ func TestSingleRevision_SingleRequest_Success(t *testing.T) { err: nil, }, }) - d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) + d := NewDedupingActivator(Activator(f)) endpoint, status, err := d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) @@ -60,10 +55,6 @@ func TestSingleRevision_SingleRequest_Success(t *testing.T) { } func TestSingleRevision_MultipleRequests_Success(t *testing.T) { - _, kna := fakeClients() - kna.ServingV1alpha1().Revisions(testNamespace).Create( - newRevisionBuilder(). - withServingState(v1alpha1.RevisionServingStateReserve).build()) ep := Endpoint{"ip", 8080} f := newFakeActivator(t, map[revisionID]activationResult{ @@ -73,7 +64,7 @@ func TestSingleRevision_MultipleRequests_Success(t *testing.T) { err: nil, }, }) - d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) + d := NewDedupingActivator(f) got := concurrentTest(d, f, []revisionID{ revisionID{testNamespace, testConfiguration, testRevision}, @@ -113,7 +104,7 @@ func TestMultipleRevisions_MultipleRequests_Success(t *testing.T) { err: nil, }, }) - d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) + d := NewDedupingActivator(f) got := concurrentTest(d, f, []revisionID{ revisionID{testNamespace, testConfiguration, "rev1"}, @@ -158,7 +149,7 @@ func TestMultipleRevisions_MultipleRequests_PartialSuccess(t *testing.T) { err: error2, }, }) - d := NewDedupingActivator(f, kna, TestLogger(t), &mockReporter{}) + d := NewDedupingActivator(f) got := concurrentTest(d, f, []revisionID{ revisionID{testNamespace, testConfiguration, "rev1"}, @@ -197,7 +188,7 @@ func TestSingleRevision_MultipleRequests_FailureRecovery(t *testing.T) { err: failErr, }, }) - d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) + d := 
NewDedupingActivator(Activator(f)) // Activation initially fails endpoint, status, err := d.ActiveEndpoint(testNamespace, testConfiguration, testRevision) @@ -254,7 +245,7 @@ func TestShutdown_ReturnError(t *testing.T) { err: nil, }, }) - d := NewDedupingActivator(Activator(f), kna, TestLogger(t), &mockReporter{}) + d := NewDedupingActivator(Activator(f)) f.hold(revisionID{testNamespace, testConfiguration, testRevision}) go func() { diff --git a/pkg/activator/revision.go b/pkg/activator/revision.go index 452e62b4ce25..3a7e1dbd06a2 100644 --- a/pkg/activator/revision.go +++ b/pkg/activator/revision.go @@ -75,12 +75,11 @@ func (r *revisionActivator) ActiveEndpoint(namespace, configuration, name string return internalError("Unable to get revision: %v", err) } + r.reporter.ReportRequest(namespace, configuration, name, string(revision.Spec.ServingState), 1.0) switch revision.Spec.ServingState { default: - r.reporter.ReportRequest(namespace, configuration, name, "Unknown", 1.0) return internalError("Disregarding activation request for revision in unknown state %v", revision.Spec.ServingState) case v1alpha1.RevisionServingStateRetired: - r.reporter.ReportRequest(namespace, configuration, name, string(v1alpha1.RevisionServingStateRetired), 1.0) return internalError("Disregarding activation request for retired revision ") case v1alpha1.RevisionServingStateActive: // Revision is already active. Nothing to do