diff --git a/Gopkg.lock b/Gopkg.lock index 4105156b0991..c0203c190331 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -653,6 +653,6 @@ [solve-meta] analyzer-name = "dep" analyzer-version = 1 - inputs-digest = "7f62108216ff3756548255b887e077fdf51fbbaf50f0832664e7070f68975d82" + inputs-digest = "045fb1dfd99560f2cf3f5969ef34fef4a6cdb5674e2fb1e7188a859f09fdff7a" solver-name = "gps-cdcl" solver-version = 1 diff --git a/cmd/ela-autoscaler/BUILD.bazel b/cmd/ela-autoscaler/BUILD.bazel index 5dd8d7d3e6e5..4c8207fac8a2 100644 --- a/cmd/ela-autoscaler/BUILD.bazel +++ b/cmd/ela-autoscaler/BUILD.bazel @@ -12,6 +12,8 @@ go_library( "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/gorilla/websocket:go_default_library", "//vendor/github.com/josephburnett/k8sflag/pkg/k8sflag:go_default_library", + "//vendor/go.opencensus.io/exporter/prometheus:go_default_library", + "//vendor/go.opencensus.io/stats/view:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/rest:go_default_library", diff --git a/cmd/ela-autoscaler/main.go b/cmd/ela-autoscaler/main.go index f28df730e823..83cb3b4ee50f 100644 --- a/cmd/ela-autoscaler/main.go +++ b/cmd/ela-autoscaler/main.go @@ -23,6 +23,9 @@ import ( "os" "time" + "go.opencensus.io/exporter/prometheus" + "go.opencensus.io/stats/view" + "github.com/elafros/elafros/pkg/apis/ela/v1alpha1" ela_autoscaler "github.com/elafros/elafros/pkg/autoscaler" clientset "github.com/elafros/elafros/pkg/client/clientset/versioned" @@ -54,8 +57,10 @@ var ( kubeClient *kubernetes.Clientset statChan = make(chan ela_autoscaler.Stat, statBufferSize) scaleChan = make(chan int32, scaleBufferSize) + statsReporter ela_autoscaler.StatsReporter elaNamespace string elaDeployment string + elaConfig string elaRevision string elaAutoscalerPort string @@ -75,8 +80,14 @@ func init() { } glog.Infof("ELA_DEPLOYMENT=%v", elaDeployment) + elaConfig = 
os.Getenv("ELA_CONFIGURATION") + if elaConfig == "" { + glog.Fatal("No ELA_CONFIGURATION provided.") + } + glog.Infof("ELA_CONFIGURATION=%v", elaConfig) + elaRevision = os.Getenv("ELA_REVISION") - if elaDeployment == "" { + if elaRevision == "" { glog.Fatal("No ELA_REVISION provided.") } glog.Infof("ELA_REVISION=%v", elaRevision) @@ -96,7 +107,7 @@ func autoscaler() { PanicWindow: k8sflag.Duration("autoscale.panic-window", nil, k8sflag.Required), ScaleToZeroThreshold: k8sflag.Duration("autoscale.scale-to-zero-threshold", nil, k8sflag.Required, k8sflag.Dynamic), } - a := ela_autoscaler.NewAutoscaler(config) + a := ela_autoscaler.NewAutoscaler(config, statsReporter) ticker := time.NewTicker(2 * time.Second) for { @@ -158,6 +169,10 @@ func scaleTo(podCount int32) { deployment.Status.Replicas, deployment.Status.AvailableReplicas, deployment.Status.ReadyReplicas) + statsReporter.Report(ela_autoscaler.DesiredPodCountM, (int64)(podCount)) + statsReporter.Report(ela_autoscaler.RequestedPodCountM, (int64)(deployment.Status.Replicas)) + statsReporter.Report(ela_autoscaler.ActualPodCountM, (int64)(deployment.Status.ReadyReplicas)) + if *deployment.Spec.Replicas == podCount { return } @@ -229,8 +244,25 @@ func main() { glog.Fatal(err) } elaClient = ec + + exporter, err := prometheus.NewExporter(prometheus.Options{Namespace: "autoscaler"}) + if err != nil { + glog.Fatal(err) + } + view.RegisterExporter(exporter) + view.SetReportingPeriod(1 * time.Second) + + reporter, err := ela_autoscaler.NewStatsReporter(elaNamespace, elaConfig, elaRevision) + if err != nil { + glog.Fatal(err) + } + statsReporter = reporter + go autoscaler() go scaleSerializer() - http.HandleFunc("/", handler) - http.ListenAndServe(":"+elaAutoscalerPort, nil) + + mux := http.NewServeMux() + mux.HandleFunc("/", handler) + mux.Handle("/metrics", exporter) + http.ListenAndServe(":"+elaAutoscalerPort, mux) } diff --git a/config/monitoring/grafana-dashboard-defs/elafros.yaml 
b/config/monitoring/grafana-dashboard-defs/elafros.yaml index 08c6f436d124..02a90fa379a8 100644 --- a/config/monitoring/grafana-dashboard-defs/elafros.yaml +++ b/config/monitoring/grafana-dashboard-defs/elafros.yaml @@ -384,7 +384,7 @@ data: "annotations": { "list": [] }, - "description": "Revision - HTTP Requests", + "description": "Elafros - Revision HTTP Requests", "editable": false, "gnetId": null, "graphTooltip": 0, @@ -794,7 +794,7 @@ data: "title": "Request Volume by Revision", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -875,7 +875,7 @@ data: "title": "Request Volume by Response Code", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -993,7 +993,7 @@ data: "title": "Response Time by Revision", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1126,7 +1126,7 @@ data: "title": "Response Time by Response Code", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1220,7 +1220,7 @@ data: "title": "Response Size by Revision", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1301,7 +1301,7 @@ data: "title": "Request Size by Revision", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1410,7 +1410,7 @@ data: ] }, "timezone": "", - "title": "Revision - HTTP Requests", + "title": "Elafros - Revision HTTP Requests", "uid": "im_gFbWik", "version": 2 } @@ -1429,7 +1429,7 @@ data: "annotations": { "list": [] }, - "description": "Revision - CPU and Memory Usage", + "description": "Elafros - Revision CPU and Memory Usage", "editable": false, "gnetId": null, "graphTooltip": 0, @@ -1485,7 +1485,7 @@ data: "title": "Total CPU Usage", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ 
-1565,7 +1565,7 @@ data: "title": "Total Memory Usage", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -1654,7 +1654,251 @@ data: ] }, "timezone": "", - "title": "Revision - CPU and Memory Usage", + "title": "Elafros - Revision CPU and Memory Usage", "uid": "bKOoE9Wmk", "version": 4 } + autoscaler-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "description": "Elafros - Autoscaler", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1524864003271, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 17, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Panic Mode", + "color": "#f29191", + "dashes": true, + "fill": 2, + "linewidth": 2, + "steppedLine": true, + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"}", + "format": "time_series", + "interval": "1s", + "intervalFactor": 1, + "legendFormat": "Actual Pods", + "refId": "A" + }, + { + "expr": "autoscaler_desired_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", + "format": "time_series", + "interval": "1s", + "intervalFactor": 1, + 
"legendFormat": "Desired Pods", + "refId": "B" + }, + { + "expr": "autoscaler_requested_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", + "format": "time_series", + "interval": "1s", + "intervalFactor": 1, + "legendFormat": "Requested Pods", + "refId": "C" + }, + { + "expr": "autoscaler_panic_mode{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", + "format": "time_series", + "interval": "1s", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Panic Mode", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pod Counts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": "Panic Mode", + "logBase": 1, + "max": "1.0", + "min": "0", + "show": true + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(autoscaler_actual_pod_count, configuration_namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Configuration", + "multi": false, + "name": "configuration", + "options": [], + "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\"}, 
configuration)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Revision", + "multi": false, + "name": "revision", + "options": [], + "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\"}, revision)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Elafros - Autoscaler", + "uid": "u_-9SIMiz", + "version": 3 + } \ No newline at end of file diff --git a/config/monitoring/prometheus-servicemonitor/BUILD b/config/monitoring/prometheus-servicemonitor/BUILD index 02a13fe014da..9f7dcdf58afa 100644 --- a/config/monitoring/prometheus-servicemonitor/BUILD +++ b/config/monitoring/prometheus-servicemonitor/BUILD @@ -40,6 +40,11 @@ k8s_object( template = "prometheus-operator.yaml", ) +k8s_object( + name = "ela-autoscaler", + template = "ela-autoscaler.yaml", +) + k8s_object( name = "ela-controller", template = "ela-controller.yaml", @@ -58,6 +63,7 @@ k8s_objects( ":istio", ":prometheus", ":prometheus-operator", + ":ela-autoscaler", ":ela-controller", ":fluentd-es", ":apiserver", diff --git a/config/monitoring/prometheus-servicemonitor/ela-autoscaler.yaml b/config/monitoring/prometheus-servicemonitor/ela-autoscaler.yaml new file mode 100644 index 000000000000..e3990b532653 --- /dev/null +++ b/config/monitoring/prometheus-servicemonitor/ela-autoscaler.yaml @@ -0,0 +1,31 @@ +# Copyright 
2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: ela-autoscaler + namespace: monitoring + labels: + monitor-category: ela-system +spec: + selector: + matchExpressions: + - {key: elafros.dev/autoscaler, operator: Exists} + namespaceSelector: + matchNames: + - ela-system + endpoints: + - port: autoscaler-port + interval: 3s diff --git a/docs/debugging/images/autoscaler_dash1.png b/docs/debugging/images/autoscaler_dash1.png new file mode 100644 index 000000000000..bf1ead1ff779 Binary files /dev/null and b/docs/debugging/images/autoscaler_dash1.png differ diff --git a/docs/debugging/images/cpu_dash1.png b/docs/debugging/images/cpu_dash1.png new file mode 100644 index 000000000000..b25ab28a9a8e Binary files /dev/null and b/docs/debugging/images/cpu_dash1.png differ diff --git a/docs/debugging/images/request_dash1.png b/docs/debugging/images/request_dash1.png new file mode 100644 index 000000000000..2f5c8c5d1ac3 Binary files /dev/null and b/docs/debugging/images/request_dash1.png differ diff --git a/docs/debugging/images/zipkin1.png b/docs/debugging/images/zipkin1.png new file mode 100644 index 000000000000..165eda0fcc90 Binary files /dev/null and b/docs/debugging/images/zipkin1.png differ diff --git a/docs/debugging/images/zipkin2.png b/docs/debugging/images/zipkin2.png new file mode 100644 index 000000000000..6dc1795efb3b Binary files /dev/null and 
b/docs/debugging/images/zipkin2.png differ diff --git a/docs/debugging/performance-investigation-guide.md b/docs/debugging/performance-investigation-guide.md new file mode 100644 index 000000000000..d11fb6865b85 --- /dev/null +++ b/docs/debugging/performance-investigation-guide.md @@ -0,0 +1,94 @@ +# Investigating Performance Issues + +You deployed your application or function to Elafros but its performance +is not up to your expectations. Elafros provides various dashboards and tools to +help investigate such issues. This document goes through these dashboards +and tools. + +## Request metrics + +Start your investigation with the "Elafros - Revision HTTP Requests" dashboard. To open this dashboard, +open Grafana UI as described in [telemetry.md](../telemetry.md) and navigate to +"Elafros - Revision HTTP Requests". Select your configuration and revision +from the menu on top left of the page. You will see a page like below: + +![Elafros - Revision HTTP Requests](images/request_dash1.png) + +This dashboard gives visibility into the following for each revision: +* Request volume +* Request volume per HTTP response code +* Response time +* Response time per HTTP response code +* Request and response sizes + +This dashboard can show traffic volume or latency discrepancies between different revisions. +If, for example, a revision's latency is higher than that of other revisions, then +focus your investigation on the offending revision through the rest of this guide. + +## Request traces +Next, look into request traces to find out where the time is spent for a single request. +To access request traces, open Zipkin UI as described in [telemetry.md](../telemetry.md). +Select your revision from the "Service Name" drop down and click on the "Find Traces" button. 
+This will bring up a view that looks like below: + +![Zipkin - Trace Overview](images/zipkin1.png) + +In the example above, we can see that the request spent most of its time in the +[span](https://github.com/opentracing/specification/blob/master/specification.md#the-opentracing-data-model) right before the last. +Investigation should now be focused on that specific span. +Clicking on that will bring up a view that looks like below: + +![Zipkin - Span Details](images/zipkin2.png) + +This view shows detailed information about the specific span, such as the +micro service or external URL that was called. In this example, a call to a +Grafana URL is taking the most time and investigation should focus on why +that URL is taking that long. + +## Autoscaler metrics +If request metrics or traces do not show any obvious hot spots, or if they show +that most of the time is spent in your own code, autoscaler metrics should be +looked at next. To open the autoscaler dashboard, open Grafana UI and select +"Elafros - Autoscaler" dashboard. This will bring up a view that looks like below: + +![Elafros - Autoscaler](images/autoscaler_dash1.png) + +This view shows four key metrics from Elafros autoscaler: +* Actual pod count: # of pods that are running a given revision +* Desired pod count: # of pods that the autoscaler thinks should serve the + revision +* Requested pod count: # of pods that autoscaler requested from Kubernetes +* Panic mode: If 0, autoscaler is operating in [stable mode](../../pkg/autoscaler/README.md#stable-mode). +If 1, autoscaler is operating in [panic mode](../../pkg/autoscaler/README.md#panic-mode). + +If there is a large gap between actual pod count and requested pod count, that +means that the Kubernetes cluster is unable to keep up allocating new +resources fast enough, or that the Kubernetes cluster is out of requested +resources. 
+ +If there is a large gap between requested pod count and desired pod count, that +is an indication that Elafros autoscaler is unable to communicate with +Kubernetes master to make the request. + +In the example above, autoscaler requested 18 pods to optimally serve the traffic +but was only granted 8 pods because the cluster is out of resources. + +## CPU and memory usage +You can access total CPU and memory usage of your revision from +"Elafros - Revision CPU and Memory Usage" dashboard. Opening this will bring up a +view that looks like below: + +![Elafros - Revision CPU and Memory Usage](images/cpu_dash1.png) + +The first chart shows rate of the CPU usage across all pods serving the revision. +The second chart shows total memory consumed across all pods serving the revision. +Both of these metrics are further divided into per container usage. +* ela-container: This container runs the user code (application, function or container). +* [istio-proxy](https://github.com/istio/proxy): Sidecar container to form an +[Istio](https://istio.io/docs/concepts/what-is-istio/overview.html) mesh. +* queue-proxy: Elafros owned sidecar container to enforce request concurrency limits. +* autoscaler: Elafros owned sidecar container to provide auto scaling for the revision. +* fluentd-proxy: Sidecar container to collect logs from /var/log. + +## Profiling +...To be filled... 
diff --git a/pkg/autoscaler/BUILD.bazel b/pkg/autoscaler/BUILD.bazel index 2b0e2f7ce4a8..62ddde62522a 100644 --- a/pkg/autoscaler/BUILD.bazel +++ b/pkg/autoscaler/BUILD.bazel @@ -5,21 +5,29 @@ go_library( srcs = [ "autoscaler.go", "doc.go", + "stats_reporter.go", ], importpath = "github.com/elafros/elafros/pkg/autoscaler", visibility = ["//visibility:public"], deps = [ "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/josephburnett/k8sflag/pkg/k8sflag:go_default_library", + "//vendor/go.opencensus.io/stats:go_default_library", + "//vendor/go.opencensus.io/stats/view:go_default_library", + "//vendor/go.opencensus.io/tag:go_default_library", ], ) go_test( name = "go_default_test", - srcs = ["autoscaler_test.go"], + srcs = [ + "autoscaler_test.go", + "stats_reporter_test.go", + ], embed = [":go_default_library"], deps = [ "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/josephburnett/k8sflag/pkg/k8sflag:go_default_library", + "//vendor/go.opencensus.io/stats/view:go_default_library", ], ) diff --git a/pkg/autoscaler/autoscaler.go b/pkg/autoscaler/autoscaler.go index 4a4b0e3e1bb2..ef8abf3b431c 100644 --- a/pkg/autoscaler/autoscaler.go +++ b/pkg/autoscaler/autoscaler.go @@ -23,6 +23,7 @@ import ( "github.com/josephburnett/k8sflag/pkg/k8sflag" ) +// Stat defines a single measurement at a point in time type Stat struct { // The time the data point was collected on the pod. 
Time *time.Time @@ -47,6 +48,7 @@ var ( lastRequestTime = time.Now() ) +// Config defines the tunable autoscaler parameters type Config struct { TargetConcurrency *k8sflag.Float64Flag MaxScaleUpRate *k8sflag.Float64Flag @@ -55,18 +57,22 @@ type Config struct { ScaleToZeroThreshold *k8sflag.DurationFlag } +// Autoscaler stores current state of an instance of an autoscaler type Autoscaler struct { Config stats map[statKey]Stat panicking bool panicTime *time.Time maxPanicPods float64 + reporter StatsReporter } -func NewAutoscaler(config Config) *Autoscaler { +// NewAutoscaler creates a new instance of autoscaler +func NewAutoscaler(config Config, reporter StatsReporter) *Autoscaler { return &Autoscaler{ - Config: config, - stats: make(map[statKey]Stat), + Config: config, + stats: make(map[statKey]Stat), + reporter: reporter, } } @@ -83,7 +89,7 @@ func (a *Autoscaler) Record(stat Stat) { a.stats[key] = stat } -// Calculate the desired scale based on current statistics given the current time. +// Scale calculates the desired scale based on current statistics given the current time. // Not safe for concurrent access or concurrent access with Record. func (a *Autoscaler) Scale(now time.Time) (int32, bool) { @@ -144,6 +150,7 @@ func (a *Autoscaler) Scale(now time.Time) (int32, bool) { // Stop panicking after the surge has made its way into the stable metric. if a.panicking && a.panicTime.Add(*a.StableWindow.Get()).Before(now) { glog.Info("Un-panicking.") + a.reporter.Report(PanicM, 0) a.panicking = false a.panicTime = nil a.maxPanicPods = 0 @@ -175,6 +182,7 @@ func (a *Autoscaler) Scale(now time.Time) (int32, bool) { // Begin panicking when we cross the 6 second concurrency threshold. 
if !a.panicking && len(panicPods) > 0 && observedPanicConcurrency >= (a.TargetConcurrency.Get()*2) { glog.Info("PANICKING") + a.reporter.Report(PanicM, 1) a.panicking = true a.panicTime = &now } diff --git a/pkg/autoscaler/autoscaler_test.go b/pkg/autoscaler/autoscaler_test.go index bc2c68ab3965..9f9511234d82 100644 --- a/pkg/autoscaler/autoscaler_test.go +++ b/pkg/autoscaler/autoscaler_test.go @@ -259,6 +259,12 @@ type linearSeries struct { podCount int } +type mockReporter struct{} + +func (r *mockReporter) Report(m Measurement, v int64) error { + return nil +} + func newTestAutoscaler(targetConcurrency float64) *Autoscaler { stableWindow := 60 * time.Second panicWindow := 6 * time.Second @@ -270,7 +276,7 @@ func newTestAutoscaler(targetConcurrency float64) *Autoscaler { PanicWindow: k8sflag.Duration("panic-window", &panicWindow), ScaleToZeroThreshold: k8sflag.Duration("scale-to-zero-threshold", &scaleToZeroThreshold), } - return NewAutoscaler(config) + return NewAutoscaler(config, &mockReporter{}) } // Record a data point every second, for every pod, for duration of the diff --git a/pkg/autoscaler/doc.go b/pkg/autoscaler/doc.go index 23445a4bb756..67f80b6594f7 100644 --- a/pkg/autoscaler/doc.go +++ b/pkg/autoscaler/doc.go @@ -14,9 +14,9 @@ limitations under the License. */ /* -Autoscaler calculates the number of pods necessary for the desired -level of concurrency per pod (stableConcurrencyPerPod). It operates in -two modes, stable mode and panic mode. +Package autoscaler calculates the number of pods necessary for the +desired level of concurrency per pod (stableConcurrencyPerPod). It +operates in two modes, stable mode and panic mode. 
Stable mode calculates the average concurrency observed over the last 60 seconds and adjusts the observed pod count to achieve the target diff --git a/pkg/autoscaler/stats_reporter.go b/pkg/autoscaler/stats_reporter.go new file mode 100644 index 000000000000..fc62c02588c1 --- /dev/null +++ b/pkg/autoscaler/stats_reporter.go @@ -0,0 +1,143 @@ +/* +Copyright 2018 Google Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "errors" + + "go.opencensus.io/stats" + "go.opencensus.io/stats/view" + "go.opencensus.io/tag" +) + +// Measurement represents the type of the autoscaler metric to be reported +type Measurement int + +const ( + // DesiredPodCountM is used for the pod count that autoscaler wants + DesiredPodCountM Measurement = 0 + // RequestedPodCountM is used for the requested pod count from kubernetes + RequestedPodCountM Measurement = 1 + // ActualPodCountM is used for the actual number of pods we have + ActualPodCountM Measurement = 2 + // PanicM is used as a flag to indicate if autoscaler is in panic mode or not + PanicM Measurement = 3 + lastEnumEntry = (int)(PanicM) +) + +// StatsReporter defines the interface for sending autoscaler metrics +type StatsReporter interface { + Report(m Measurement, v int64) error +} + +// Reporter holds cached metric objects to report autoscaler metrics +type Reporter struct { + measurements [lastEnumEntry + 1]*stats.Int64Measure + ctx context.Context + initialized bool +} + +// 
NewStatsReporter creates a reporter that collects and reports autoscaler metrics +func NewStatsReporter(podNamespace string, config string, revision string) (*Reporter, error) { + + var r = &Reporter{} + r.measurements[DesiredPodCountM] = stats.Int64( + "desired_pod_count", + "Number of pods autoscaler wants to allocate", + stats.UnitNone) + r.measurements[RequestedPodCountM] = stats.Int64( + "requested_pod_count", + "Number of pods autoscaler requested from Kubernetes", + stats.UnitNone) + r.measurements[ActualPodCountM] = stats.Int64( + "actual_pod_count", + "Number of pods that are allocated currently", + stats.UnitNone) + r.measurements[PanicM] = stats.Int64( + "panic_mode", + "1 if autoscaler is in panic mode, 0 otherwise", + stats.UnitNone) + + // Create the tag keys that will be used to add tags to our measurements. + namespaceTagKey, err := tag.NewKey("configuration_namespace") + if err != nil { + return nil, err + } + configTagKey, err := tag.NewKey("configuration") + if err != nil { + return nil, err + } + revisionTagKey, err := tag.NewKey("revision") + if err != nil { + return nil, err + } + + // Create view to see our measurements. 
+ err = view.Register( + &view.View{ + Description: "Number of pods autoscaler wants to allocate", + Measure: r.measurements[DesiredPodCountM], + Aggregation: view.LastValue(), + TagKeys: []tag.Key{namespaceTagKey, configTagKey, revisionTagKey}, + }, + &view.View{ + Description: "Number of pods autoscaler requested from Kubernetes", + Measure: r.measurements[RequestedPodCountM], + Aggregation: view.LastValue(), + TagKeys: []tag.Key{namespaceTagKey, configTagKey, revisionTagKey}, + }, + &view.View{ + Description: "Number of pods that are allocated currently", + Measure: r.measurements[ActualPodCountM], + Aggregation: view.LastValue(), + TagKeys: []tag.Key{namespaceTagKey, configTagKey, revisionTagKey}, + }, + &view.View{ + Description: "1 if autoscaler is in panic mode, 0 otherwise", + Measure: r.measurements[PanicM], + Aggregation: view.LastValue(), + TagKeys: []tag.Key{namespaceTagKey, configTagKey, revisionTagKey}, + }, + ) + if err != nil { + return nil, err + } + + // Our tags are static. So, we can get away with creating a single context + // and reuse it for reporting all of our metrics. + r.ctx, err = tag.New( + context.Background(), + tag.Insert(namespaceTagKey, podNamespace), + tag.Insert(configTagKey, config), + tag.Insert(revisionTagKey, revision)) + if err != nil { + return nil, err + } + + r.initialized = true + return r, nil +} + +// Report captures value v for measurement m +func (r *Reporter) Report(m Measurement, v int64) error { + if !r.initialized { + return errors.New("StatsReporter is not initialized yet") + } + + stats.Record(r.ctx, r.measurements[m].M(v)) + return nil +} diff --git a/pkg/autoscaler/stats_reporter_test.go b/pkg/autoscaler/stats_reporter_test.go new file mode 100644 index 000000000000..570cae45d6ca --- /dev/null +++ b/pkg/autoscaler/stats_reporter_test.go @@ -0,0 +1,103 @@ +/* +Copyright 2018 Google Inc. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package autoscaler
+
+import (
+	"testing"
+
+	"go.opencensus.io/stats/view"
+)
+
+func TestReporter_Report(t *testing.T) {
+	r := &Reporter{}
+	if err := r.Report(DesiredPodCountM, 10); err == nil {
+		t.Error("Reporter.Report() expected an error for Report call before init. Got success.")
+	}
+
+	r, _ = NewStatsReporter("testns", "testconfig", "testrev") // NOTE(review): constructor error is discarded here
+	wantTags := map[string]string{
+		"configuration_namespace": "testns",
+		"configuration":           "testconfig",
+		"revision":                "testrev",
+	}
+
+	// Send statistics only once and observe the results
+	expectSuccess(t, func() error { return r.Report(DesiredPodCountM, 10) })
+	expectSuccess(t, func() error { return r.Report(RequestedPodCountM, 7) })
+	expectSuccess(t, func() error { return r.Report(ActualPodCountM, 5) })
+	expectSuccess(t, func() error { return r.Report(PanicM, 0) })
+	checkData(t, "desired_pod_count", wantTags, 10)
+	checkData(t, "requested_pod_count", wantTags, 7)
+	checkData(t, "actual_pod_count", wantTags, 5)
+	checkData(t, "panic_mode", wantTags, 0)
+
+	// All the stats are gauges - record multiple entries for one stat - last one should stick
+	expectSuccess(t, func() error { return r.Report(DesiredPodCountM, 1) })
+	expectSuccess(t, func() error { return r.Report(DesiredPodCountM, 2) })
+	expectSuccess(t, func() error { return r.Report(DesiredPodCountM, 3) })
+	checkData(t, "desired_pod_count", wantTags, 3)
+
+	expectSuccess(t, func() error { return r.Report(RequestedPodCountM, 4) })
+	expectSuccess(t, func() error { return r.Report(RequestedPodCountM, 5) })
+	expectSuccess(t, func() error { return r.Report(RequestedPodCountM, 6) })
+	checkData(t, "requested_pod_count", wantTags, 6)
+
+	expectSuccess(t, func() error { return r.Report(ActualPodCountM, 7) })
+	expectSuccess(t, func() error { return r.Report(ActualPodCountM, 8) })
+	expectSuccess(t, func() error { return r.Report(ActualPodCountM, 9) })
+	checkData(t, "actual_pod_count", wantTags, 9)
+
+	expectSuccess(t, func() error { return r.Report(PanicM, 1) })
+	expectSuccess(t, func() error { return r.Report(PanicM, 0) })
+	expectSuccess(t, func() error { return r.Report(PanicM, 1) })
+	checkData(t, "panic_mode", wantTags, 1)
+
+	expectSuccess(t, func() error { return r.Report(PanicM, 0) })
+	checkData(t, "panic_mode", wantTags, 0)
+}
+
+func expectSuccess(t *testing.T, f func() error) { // runs f and records a test error if it fails
+	if err := f(); err != nil {
+		t.Errorf("Reporter.Report() expected success but got error %v", err)
+	}
+}
+
+func checkData(t *testing.T, name string, wantTags map[string]string, wantValue int) { // checks the single row retrieved for view `name`
+	if d, err := view.RetrieveData(name); err != nil {
+		t.Errorf("Reporter.Report() error = %v, wantErr %v", err, false)
+	} else {
+		if len(d) != 1 {
+			t.Fatalf("Reporter.Report() len(d) %v, want %v", len(d), 1) // fatal: d[0] is indexed below
+		}
+		for _, got := range d[0].Tags { // flags extra or mismatched tags; a missing tag is not detected
+			if want, ok := wantTags[got.Key.Name()]; !ok {
+				t.Errorf("Reporter.Report() got an extra tag %v: %v", got.Key.Name(), got.Value)
+			} else {
+				if got.Value != want {
+					t.Errorf("Reporter.Report() expected a different tag value. key:%v, got: %v, want: %v", got.Key.Name(), got.Value, want)
+				}
+			}
+		}
+
+		if s, ok := d[0].Data.(*view.LastValueData); !ok {
+			t.Error("Reporter.Report() expected a LastValueData type")
+		} else {
+			if s.Value != (float64)(wantValue) {
+				t.Errorf("Reporter.Report() expected %v got %v. metric: %v", (float64)(wantValue), s.Value, name) // want first, then got
+			}
+		}
+	}
+}
diff --git a/pkg/controller/revision/ela_autoscaler.go b/pkg/controller/revision/ela_autoscaler.go
index 33bd0d9abfd0..fc97dc9d810e 100644
--- a/pkg/controller/revision/ela_autoscaler.go
+++ b/pkg/controller/revision/ela_autoscaler.go
@@ -43,8 +43,6 @@ func MakeElaAutoscalerDeployment(rev *v1alpha1.Revision, autoscalerImage string)
 		MaxSurge:       &intstr.IntOrString{Type: intstr.Int, IntVal: 1},
 	}
 
-	labels := MakeElaResourceLabels(rev)
-	labels[ela.AutoscalerLabelKey] = controller.GetRevisionAutoscalerName(rev)
 	annotations := MakeElaResourceAnnotations(rev)
 	annotations[sidecarIstioInjectAnnotation] = "false"
 
@@ -76,7 +74,7 @@ func MakeElaAutoscalerDeployment(rev *v1alpha1.Revision, autoscalerImage string)
 			},
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: meta_v1.ObjectMeta{
-					Labels:      labels,
+					Labels:      makeElaAutoScalerLabels(rev),
 					Annotations: annotations,
 				},
 				Spec: corev1.PodSpec{
@@ -102,6 +100,10 @@ func MakeElaAutoscalerDeployment(rev *v1alpha1.Revision, autoscalerImage string)
 									Name:  "ELA_DEPLOYMENT",
 									Value: controller.GetRevisionDeploymentName(rev),
 								},
+								{
+									Name:  "ELA_CONFIGURATION",
+									Value: controller.LookupOwningConfigurationName(rev.OwnerReferences),
+								},
 								{
 									Name:  "ELA_REVISION",
 									Value: rev.Name,
@@ -138,7 +140,7 @@ func MakeElaAutoscalerService(rev *v1alpha1.Revision) *corev1.Service {
 		ObjectMeta: meta_v1.ObjectMeta{
 			Name:        controller.GetRevisionAutoscalerName(rev),
 			Namespace:   AutoscalerNamespace,
-			Labels:      MakeElaResourceLabels(rev),
+			Labels:      makeElaAutoScalerLabels(rev),
 			Annotations: MakeElaResourceAnnotations(rev),
 		},
 		Spec: corev1.ServiceSpec{
@@ -156,3 +158,11 @@ func MakeElaAutoscalerService(rev *v1alpha1.Revision) *corev1.Service {
 		},
 	}
 }
+
+// makeElaAutoScalerLabels constructs the labels we will apply to
+// service and deployment specs for autoscaler.
+func makeElaAutoScalerLabels(rev *v1alpha1.Revision) map[string]string {
+	labels := MakeElaResourceLabels(rev) // start from the revision's resource labels
+	labels[ela.AutoscalerLabelKey] = controller.GetRevisionAutoscalerName(rev) // add the autoscaler label on top
+	return labels
+}
diff --git a/pkg/controller/revision/revision_test.go b/pkg/controller/revision/revision_test.go
index 7a76946c4f30..671e6672e7c4 100644
--- a/pkg/controller/revision/revision_test.go
+++ b/pkg/controller/revision/revision_test.go
@@ -23,6 +23,7 @@ package revision
 import (
 	"fmt"
 	"reflect"
+	"strconv"
 	"testing"
 	"time"
 
@@ -448,6 +449,9 @@ func TestCreateRevCreatesStuff(t *testing.T) {
 			foundAutoscaler = true
 			checkEnv(container.Env, "ELA_NAMESPACE", testNamespace, "")
 			checkEnv(container.Env, "ELA_DEPLOYMENT", expectedDeploymentName, "")
+			checkEnv(container.Env, "ELA_CONFIGURATION", config.Name, "")
+			checkEnv(container.Env, "ELA_REVISION", rev.Name, "")
+			checkEnv(container.Env, "ELA_AUTOSCALER_PORT", strconv.Itoa(autoscalerPort), "")
 			break
 		}
 	}
@@ -464,7 +468,7 @@ func TestCreateRevCreatesStuff(t *testing.T) {
 	if diff := cmp.Diff(expectedRefs, asService.OwnerReferences, cmpopts.IgnoreFields(expectedRefs[0], "Controller", "BlockOwnerDeletion")); diff != "" {
 		t.Errorf("Unexpected service owner refs diff (-want +got): %v", diff)
 	}
-	if labels := asService.ObjectMeta.Labels; !reflect.DeepEqual(labels, expectedLabels) {
-		t.Errorf("Label not set correctly autoscaler service: expected %v got %v.", expectedLabels, labels)
+	if labels := asService.ObjectMeta.Labels; !reflect.DeepEqual(labels, expectedAutoscalerLabels) {
+		t.Errorf("Label not set correctly autoscaler service: expected %v got %v.", expectedAutoscalerLabels, labels)
 	}
 
@@ -574,7 +578,7 @@ func TestCreateRevPreservesAppLabel(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Couldn't get autoscaler service: %v", err)
 	}
-	if labels := asService.ObjectMeta.Labels; !reflect.DeepEqual(labels, expectedLabels) {
-		t.Errorf("Label not set correctly autoscaler service: expected %v got %v.", expectedLabels, labels)
+	if labels := asService.ObjectMeta.Labels; !reflect.DeepEqual(labels, expectedAutoscalerLabels) {
+		t.Errorf("Label not set correctly autoscaler service: expected %v got %v.", expectedAutoscalerLabels, labels)
 	}
 