From 20e38f2cc8b19d56a162265e7efaa419dd2131b7 Mon Sep 17 00:00:00 2001 From: mdemirhan Date: Wed, 18 Jul 2018 12:12:54 -0700 Subject: [PATCH 1/6] * Change log levels dynamically for activator, webhook and autoscaler * Remove the use of runtime.Handle and instead use logger.Errorf since that is pretty much all runtime.Handle does today. * Disable glog in activator and autoscaler to prevent k8s logs appearing in these services. We already log errors and extra & unstructured k8s logs are not helpful. * Simplify the monitoring installation by getting rid of dev profile. Instead, we now by default watch for logs in knative components, but the logging level is set to fatal. To enable Knative logging, we just need to edit config-logging configmap and change those values to info. The updates will be picked up immediately without the need to restart any components. --- DEVELOPMENT.md | 31 +- cmd/activator/main.go | 13 +- cmd/autoscaler/main.go | 14 +- cmd/controller/main.go | 24 +- cmd/controller/main_test.go | 63 --- cmd/webhook/main.go | 17 +- config/activator.yaml | 7 +- config/config-logging.yaml | 14 +- .../100-fluentd-configmap.yaml | 82 ---- .../100-scaling-configmap.yaml | 455 ------------------ .../100-fluentd-configmap.yaml | 2 +- .../100-scaling-configmap.yaml} | 0 .../fluentd-configmap.yaml | 93 ---- .../fluentd-configmap.yaml | 1 - config/monitoring/README.md | 7 +- docs/setting-up-a-logging-plugin.md | 2 +- docs/telemetry.md | 218 +-------- hack/release.sh | 4 +- pkg/controller/configuration/configuration.go | 5 +- pkg/controller/controller.go | 8 +- .../revision/resources/autoscaler.go | 4 + pkg/controller/revision/revision.go | 5 +- pkg/controller/route/route.go | 6 +- pkg/controller/service/service.go | 6 +- pkg/logging/config.go | 19 + pkg/logging/config_test.go | 39 ++ test/e2e-tests.sh | 4 +- 27 files changed, 169 insertions(+), 974 deletions(-) delete mode 100644 cmd/controller/main_test.go delete mode 100644 config/monitoring/150-elasticsearch-prod/100-fluentd-configmap.yaml delete mode 100644 config/monitoring/150-elasticsearch-prod/100-scaling-configmap.yaml rename config/monitoring/{150-elasticsearch-dev => 150-elasticsearch}/100-fluentd-configmap.yaml (90%) rename config/monitoring/{150-elasticsearch-dev/100-scaling-configmap-dev.yaml => 150-elasticsearch/100-scaling-configmap.yaml} (100%) delete mode 100644 config/monitoring/150-stackdriver-prod/fluentd-configmap.yaml rename config/monitoring/{150-stackdriver-dev => 150-stackdriver}/fluentd-configmap.yaml (98%) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index e8f71bd53737..8ba7f37b299a 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -82,7 +82,8 @@ To check out this repository: 1. Create your own [fork of this repo](https://help.github.com/articles/fork-a-repo/) -2. Clone it to your machine: +1. 
Clone it to your machine: + ```shell mkdir -p ${GOPATH}/src/github.com/knative cd ${GOPATH}/src/github.com/knative @@ -152,9 +153,13 @@ Next, run: ```shell ko apply -f config/ + +# Set the logging threhold to info for all Knative components and reapply the config +sed 's/\"fatal\"/\"info\"/g' config/config-logging.yaml | kubectl apply -f - ``` You can see things running with: + ```shell kubectl -n knative-serving get pods NAME READY STATUS RESTARTS AGE @@ -172,35 +177,19 @@ If you're using a GCP project to host your Kubernetes cluster, it's good to chec [Discovery & load balancing](http://console.developers.google.com/kubernetes/discovery) page to ensure that all services are up and running (and not blocked by a quota issue, for example). -### Enable log and metric collection - -You can use two different setups for collecting logs(to Elasticsearch&Kibana) and metrics -(See [Logs and Metrics](./docs/telemetry.md) for setting up other logging backend): - -1. **150-elasticsearch-prod**: This configuration collects logs & metrics from user containers, build controller and Istio requests. - -```shell -kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ - -f third_party/config/monitoring/common \ - -f third_party/config/monitoring/elasticsearch \ - -f config/monitoring/200-common \ - -f config/monitoring/200-common/100-istio.yaml -``` +### Install logging and monitoring backends -1. **150-elasticsearch-dev**: This configuration collects everything in (1) plus Knative Serving controller logs. +Run: ```shell kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-dev \ + -f config/monitoring/150-elasticsearch \ -f third_party/config/monitoring/common \ -f third_party/config/monitoring/elasticsearch \ -f config/monitoring/200-common \ -f config/monitoring/200-common/100-istio.yaml ``` -Once complete, follow the instructions at [Logs and Metrics](./docs/telemetry.md). - ## Iterating As you make changes to the code-base, there are two special cases to be aware of: @@ -212,6 +201,7 @@ As you make changes to the code-base, there are two special cases to be aware of These are both idempotent, and we expect that running these at `HEAD` to have no diffs. Once the codegen and dependency information is correct, redeploying the controller is simply: + ```shell ko apply -f config/controller.yaml ``` @@ -222,6 +212,7 @@ redeploy `Knative Serving`](./README.md#start-knative). 
## Clean up You can delete all of the service components with: + ```shell ko delete --ignore-not-found=true \ -f config/monitoring/100-common \ diff --git a/cmd/activator/main.go b/cmd/activator/main.go index 71f78842a92a..5c48a8c41999 100644 --- a/cmd/activator/main.go +++ b/cmd/activator/main.go @@ -32,6 +32,7 @@ import ( h2cutil "github.com/knative/serving/pkg/h2c" "github.com/knative/serving/pkg/logging" "github.com/knative/serving/pkg/signals" + "github.com/knative/serving/pkg/system" "github.com/knative/serving/third_party/h2c" "go.uber.org/zap" "k8s.io/client-go/kubernetes" @@ -42,6 +43,7 @@ const ( maxUploadBytes = 32e6 // 32MB - same as app engine maxRetry = 60 retryInterval = 1 * time.Second + logLevelKey = "activator" ) type activationHandler struct { @@ -124,7 +126,7 @@ func (a *activationHandler) handler(w http.ResponseWriter, r *http.Request) { endpoint, status, err := a.act.ActiveEndpoint(namespace, name) if err != nil { msg := fmt.Sprintf("Error getting active endpoint: %v", err) - a.logger.Errorf(msg) + a.logger.Error(msg) http.Error(w, msg, int(status)) return } @@ -154,7 +156,7 @@ func main() { if err != nil { log.Fatalf("Error parsing logging configuration: %v", err) } - logger, _ := logging.NewLoggerFromConfig(config, "activator") + logger, atomicLevel := logging.NewLoggerFromConfig(config, logLevelKey) defer logger.Sync() logger.Info("Starting the knative activator") @@ -183,6 +185,13 @@ func main() { a.Shutdown() }() + // Watch the logging config map and dynamically update logging levels. + configMapWatcher := configmap.NewDefaultWatcher(kubeClient, system.Namespace) + configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey)) + if err = configMapWatcher.Start(stopCh); err != nil { + logger.Fatalf("failed to start configuration manager: %v", err) + } + http.HandleFunc("/", ah.handler) h2c.ListenAndServe(":8080", nil) } diff --git a/cmd/autoscaler/main.go b/cmd/autoscaler/main.go index b02bc01e44c6..182b758f2d00 100644 --- a/cmd/autoscaler/main.go +++ b/cmd/autoscaler/main.go @@ -35,6 +35,7 @@ import ( "github.com/knative/serving/pkg/configmap" "github.com/knative/serving/pkg/logging" "github.com/knative/serving/pkg/logging/logkey" + "github.com/knative/serving/pkg/system" "github.com/gorilla/websocket" @@ -53,6 +54,7 @@ const ( // seconds while an http request is taking the full timeout of 5 // second. scaleBufferSize = 10 + logLevelKey = "autoscaler" ) var ( @@ -69,6 +71,7 @@ var ( servingAutoscalerPort string currentScale int32 logger *zap.SugaredLogger + atomicLevel zap.AtomicLevel // Revision-level configuration concurrencyModel = flag.String("concurrencyModel", string(v1alpha1.RevisionRequestConcurrencyModelMulti), "") @@ -226,7 +229,7 @@ func main() { if err != nil { log.Fatalf("Error parsing logging configuration: %v", err) } - logger, _ = logging.NewLoggerFromConfig(logginConfig, "autoscaler") + logger, atomicLevel = logging.NewLoggerFromConfig(logginConfig, logLevelKey) defer logger.Sync() initEnv() @@ -265,6 +268,14 @@ func main() { } statsReporter = reporter + // Watch the logging config map and dynamically update logging levels. 
+ stopCh := make(chan struct{}) + configMapWatcher := configmap.NewDefaultWatcher(kubeClient, system.Namespace) + configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey)) + if err := configMapWatcher.Start(stopCh); err != nil { + logger.Fatalf("failed to start configuration manager: %v", err) + } + go runAutoscaler() go scaleSerializer() @@ -272,4 +283,5 @@ func main() { mux.HandleFunc("/", handler) mux.Handle("/metrics", exporter) http.ListenAndServe(":"+servingAutoscalerPort, mux) + close(stopCh) } diff --git a/cmd/controller/main.go b/cmd/controller/main.go index a322ca50a38f..966b4d741446 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -22,13 +22,11 @@ import ( "time" "github.com/knative/serving/pkg/configmap" - "go.uber.org/zap" "github.com/knative/serving/pkg/controller" "github.com/knative/serving/pkg/logging" "github.com/knative/serving/pkg/system" - corev1 "k8s.io/api/core/v1" vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/clientset/versioned" vpainformers "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/client/informers/externalversions" @@ -52,6 +50,7 @@ import ( const ( threadsPerController = 2 + logLevelKey = "controller" ) var ( @@ -69,7 +68,7 @@ func main() { if err != nil { log.Fatalf("Error parsing logging configuration: %v", err) } - logger, atomicLevel := logging.NewLoggerFromConfig(loggingConfig, "controller") + logger, atomicLevel := logging.NewLoggerFromConfig(loggingConfig, logLevelKey) defer logger.Sync() // set up signals so we handle the first shutdown signal gracefully @@ -164,7 +163,7 @@ func main() { } // Watch the logging config map and dynamically update logging levels. - configMapWatcher.Watch(logging.ConfigName, receiveLoggingConfig(logger, atomicLevel)) + configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey)) // These are non-blocking. kubeInformerFactory.Start(stopCh) @@ -213,20 +212,3 @@ func init() { flag.StringVar(&kubeconfig, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.") flag.StringVar(&masterURL, "master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.") } - -func receiveLoggingConfig(logger *zap.SugaredLogger, atomicLevel zap.AtomicLevel) func(configMap *corev1.ConfigMap) { - return func(configMap *corev1.ConfigMap) { - loggingConfig, err := logging.NewConfigFromConfigMap(configMap) - if err != nil { - logger.Error("Failed to parse the logging configmap. Previous config map will be used.", zap.Error(err)) - return - } - - if level, ok := loggingConfig.LoggingLevel["controller"]; ok { - if atomicLevel.Level() != level { - logger.Infof("Updating logging level from %v to %v.", atomicLevel.Level(), level) - atomicLevel.SetLevel(level) - } - } - } -} diff --git a/cmd/controller/main_test.go b/cmd/controller/main_test.go deleted file mode 100644 index de6866b8cfcd..000000000000 --- a/cmd/controller/main_test.go +++ /dev/null @@ -1,63 +0,0 @@ -/* -Copyright 2018 The Knative Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package main - -import ( - "testing" - - "github.com/knative/serving/pkg/logging" - "github.com/knative/serving/pkg/system" - "go.uber.org/zap/zapcore" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestReceiveLoggingConfigMap(t *testing.T) { - logger, atomicLevel := logging.NewLogger("", "debug") - want := zapcore.DebugLevel - if atomicLevel.Level() != zapcore.DebugLevel { - t.Fatalf("Expected initial logger level to %v, got: %v", want, atomicLevel.Level()) - } - - receiveFunc := receiveLoggingConfig(logger, atomicLevel) - cm := &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: system.Namespace, - Name: "config-logging", - }, - Data: map[string]string{ - "zap-logger-config": "", - "loglevel.controller": "info", - }, - } - - for _, test := range []struct { - l zapcore.Level - s string - }{ - {zapcore.InfoLevel, "info"}, - {zapcore.DebugLevel, "debug"}, - {zapcore.ErrorLevel, "error"}, - {zapcore.ErrorLevel, "invalid level"}, - } { - cm.Data["loglevel.controller"] = test.s - receiveFunc(cm) - if atomicLevel.Level() != test.l { - t.Errorf("Expected logger level to be %v, got: %v", test.l, atomicLevel.Level()) - } - } -} diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 3f2a5ddfc42a..094388aec80c 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -31,6 +31,10 @@ import ( "k8s.io/client-go/rest" ) +const ( + logLevelKey = "webhook" +) + func main() { flag.Parse() cm, err := configmap.Load("/etc/config-logging") @@ -41,7 +45,7 @@ func main() { if err != nil { log.Fatalf("Error parsing logging configuration: %v", err) } - logger, _ := logging.NewLoggerFromConfig(config, "webhook") + logger, atomicLevel := logging.NewLoggerFromConfig(config, logLevelKey) defer logger.Sync() logger.Info("Starting the Configuration Webhook") @@ -54,11 +58,18 @@ func main() { logger.Fatal("Failed to get in cluster config", zap.Error(err)) } - clientset, err := kubernetes.NewForConfig(clusterConfig) + kubeClient, err := kubernetes.NewForConfig(clusterConfig) if err != nil { logger.Fatal("Failed to get the client set", zap.Error(err)) } + // Watch the logging config map and dynamically update logging levels. + configMapWatcher := configmap.NewDefaultWatcher(kubeClient, system.Namespace) + configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey)) + if err = configMapWatcher.Start(stopCh); err != nil { + logger.Fatalf("failed to start configuration manager: %v", err) + } + options := webhook.ControllerOptions{ ServiceName: "webhook", ServiceNamespace: system.Namespace, @@ -66,7 +77,7 @@ func main() { SecretName: "webhook-certs", WebhookName: "webhook.knative.dev", } - controller, err := webhook.NewAdmissionController(clientset, options, logger) + controller, err := webhook.NewAdmissionController(kubeClient, options, logger) if err != nil { logger.Fatal("Failed to create the admission controller", zap.Error(err)) } diff --git a/config/activator.yaml b/config/activator.yaml index cfda00c92304..28a0ef3d66eb 100644 --- a/config/activator.yaml +++ b/config/activator.yaml @@ -39,9 +39,10 @@ spec: - name: http containerPort: 8080 args: - - "-logtostderr=true" - - "-stderrthreshold=INFO" - + # Disable glog writing into stderr. Our code doesn't use glog + # and seeing k8s logs in addition to ours is not useful. 
+ - "-logtostderr=false" + - "-stderrthreshold=FATAL" volumeMounts: - name: config-logging mountPath: /etc/config-logging diff --git a/config/config-logging.yaml b/config/config-logging.yaml index b9f66fa073b7..05bd8b26fbd5 100644 --- a/config/config-logging.yaml +++ b/config/config-logging.yaml @@ -42,8 +42,12 @@ data: } # Log level overrides - loglevel.controller: "info" - loglevel.autoscaler: "info" - loglevel.queueproxy: "info" - loglevel.webhook: "info" - loglevel.activator: "info" + # By default, we only log critical errors. + # To enable logging for Knative components, change log levels below to "info" + # For all components except the autoscaler, changes will be picked up immediately. + # Autoscaler changes require recreation of the revision until the multi-tenant autoscaler lands. + loglevel.controller: "fatal" + loglevel.autoscaler: "fatal" + loglevel.queueproxy: "fatal" + loglevel.webhook: "fatal" + loglevel.activator: "fatal" diff --git a/config/monitoring/150-elasticsearch-prod/100-fluentd-configmap.yaml b/config/monitoring/150-elasticsearch-prod/100-fluentd-configmap.yaml deleted file mode 100644 index 63ef0f282d37..000000000000 --- a/config/monitoring/150-elasticsearch-prod/100-fluentd-configmap.yaml +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2018 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -kind: ConfigMap -apiVersion: v1 -metadata: - name: fluentd-ds-config - namespace: monitoring - labels: - addonmanager.kubernetes.io/mode: Reconcile -data: - 100.system.conf: |- - - root_dir /tmp/fluentd-buffers/ - - 200.containers.input.conf: |- - - @id fluentd-containers.log - @type tail - path /var/log/containers/*user-container-*.log,/var/log/containers/*build-step-*.log - pos_file /var/log/es-containers.log.pos - time_format %Y-%m-%dT%H:%M:%S.%NZ - tag raw.kubernetes.* - format json - read_from_head true - - # Combine multi line logs which form an exception stack trace into a single log entry - - @id raw.kubernetes - @type detect_exceptions - remove_tag_prefix raw - message log - stream stream - multiline_flush_interval 5 - max_bytes 500000 - max_lines 1000 - - # Add Kubernetes metadata - - @type kubernetes_metadata - - 300.forward.input.conf: |- - # Takes the messages sent over TCP, e.g. 
request logs from Istio - - @type forward - port 24224 - - 900.output.conf: |- - # Send to Elastic Search - - @id elasticsearch - @type elasticsearch - @log_level info - include_tag_key true - host elasticsearch-logging - port 9200 - logstash_format true - - @type file - path /var/log/fluentd-buffers/kubernetes.system.buffer - flush_mode interval - retry_type exponential_backoff - flush_thread_count 2 - flush_interval 5s - retry_forever - retry_max_interval 30 - chunk_limit_size 2M - queue_limit_length 8 - overflow_action block - - diff --git a/config/monitoring/150-elasticsearch-prod/100-scaling-configmap.yaml b/config/monitoring/150-elasticsearch-prod/100-scaling-configmap.yaml deleted file mode 100644 index 997671cb1cd8..000000000000 --- a/config/monitoring/150-elasticsearch-prod/100-scaling-configmap.yaml +++ /dev/null @@ -1,455 +0,0 @@ -# Copyright 2018 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: ConfigMap -metadata: - name: scaling-config - namespace: monitoring -data: - scaling-dashboard.json: |+ - { - "__inputs": [ - { - "description": "", - "label": "prometheus", - "name": "prometheus", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "description": "Knative Serving - Scaling", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "id": null, - "iteration": 1525724908045, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 17, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "legend": { - "avg": false, - "current": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Panic Mode", - "color": "#f29191", - "dashes": true, - "fill": 2, - "linewidth": 2, - "steppedLine": true, - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": true, - "targets": [ - { - "expr": "autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"}", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Actual Pods", - "refId": "A" - }, - { - "expr": "autoscaler_desired_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Desired Pods", - "refId": "B" - }, - { - "expr": "autoscaler_requested_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", - "format": "time_series", - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Requested Pods", - "refId": "C" - }, - { - "expr": 
"autoscaler_panic_mode{configuration_namespace=\"$namespace\", configuration=\"$configuration\", revision=\"$revision\"} ", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "1s", - "intervalFactor": 1, - "legendFormat": "Panic Mode", - "refId": "D" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Revision Pod Counts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": "Panic Mode", - "logBase": 1, - "max": "1.0", - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 4, - "legend": { - "avg": false, - "current": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Cores requested", - "refId": "A" - }, - { - "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"}[1m]))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Cores used", - "refId": "B" - }, - { - "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "Core limit", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Revision CPU Usage", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 17 - }, - "id": 6, - "legend": { - "avg": false, - "current": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Memory requested", 
- "refId": "A" - }, - { - "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=~\"$revision-deployment-.*\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "Memory used", - "refId": "B" - }, - { - "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=~\"$revision-deployment-.*\"})", - "format": "time_series", - "intervalFactor": 1, - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod Memory Usage", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - } - ], - "refresh": "5s", - "schemaVersion": 16, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Namespace", - "multi": false, - "name": "namespace", - "options": [], - "query": "label_values(autoscaler_actual_pod_count, configuration_namespace)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Configuration", - "multi": false, - "name": "configuration", - "options": [], - "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\"}, configuration)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": {}, - "datasource": "prometheus", - "hide": 0, - "includeAll": false, - "label": "Revision", - "multi": false, - "name": "revision", - "options": [], - "query": "label_values(autoscaler_actual_pod_count{configuration_namespace=\"$namespace\", configuration=\"$configuration\"}, revision)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "", - "title": "Knative Serving - Scaling", - "uid": "u_-9SIMiz", - "version": 4 - } \ No newline at end of file diff --git a/config/monitoring/150-elasticsearch-dev/100-fluentd-configmap.yaml b/config/monitoring/150-elasticsearch/100-fluentd-configmap.yaml similarity index 90% rename from config/monitoring/150-elasticsearch-dev/100-fluentd-configmap.yaml rename to config/monitoring/150-elasticsearch/100-fluentd-configmap.yaml index af8bf88402a8..de9b33f8dafa 100644 --- a/config/monitoring/150-elasticsearch-dev/100-fluentd-configmap.yaml +++ b/config/monitoring/150-elasticsearch/100-fluentd-configmap.yaml @@ -28,7 +28,7 @@ data: @id fluentd-containers.log @type tail - path 
/var/log/containers/*user-container-*.log,/var/log/containers/*build-step-*.log,/var/log/containers/*controller-*.log,/var/log/containers/*webhook-*.log,/var/log/containers/*autoscaler-*.log,/var/log/containers/*queue-proxy-*.log,/var/log/containers/*activator-*.log + path /var/log/containers/*user-container-*.log,/var/log/containers/*build-step-*.log,/var/log/containers/controller-*controller-*.log,/var/log/containers/webhook-*webhook-*.log,/var/log/containers/*autoscaler-*autoscaler-*.log,/var/log/containers/*queue-proxy-*.log,/var/log/containers/activator-*activator-*.log pos_file /var/log/es-containers.log.pos time_format %Y-%m-%dT%H:%M:%S.%NZ tag raw.kubernetes.* diff --git a/config/monitoring/150-elasticsearch-dev/100-scaling-configmap-dev.yaml b/config/monitoring/150-elasticsearch/100-scaling-configmap.yaml similarity index 100% rename from config/monitoring/150-elasticsearch-dev/100-scaling-configmap-dev.yaml rename to config/monitoring/150-elasticsearch/100-scaling-configmap.yaml diff --git a/config/monitoring/150-stackdriver-prod/fluentd-configmap.yaml b/config/monitoring/150-stackdriver-prod/fluentd-configmap.yaml deleted file mode 100644 index e793782074e3..000000000000 --- a/config/monitoring/150-stackdriver-prod/fluentd-configmap.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2018 The Knative Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -kind: ConfigMap -apiVersion: v1 -metadata: - name: fluentd-ds-config - namespace: monitoring - labels: - addonmanager.kubernetes.io/mode: Reconcile -data: - 100.system.conf: |- - - root_dir /tmp/fluentd-buffers/ - - 200.containers.input.conf: |- - - @id fluentd-containers.log - @type tail - # path is different from dev configuration - path /var/log/containers/*user-container-*.log,/var/log/containers/*build-step-*.log - pos_file /var/log/es-containers.log.pos - time_format %Y-%m-%dT%H:%M:%S.%NZ - tag raw.kubernetes.* - format json - read_from_head true - - # Combine multi line logs which form an exception stack trace into a single log entry - - @id raw.kubernetes - @type detect_exceptions - remove_tag_prefix raw - message log - stream stream - multiline_flush_interval 5 - max_bytes 500000 - max_lines 1000 - - # Add Kubernetes metadata - - @type kubernetes_metadata - merge_json_log false # Don't parse json log - preserve_json_log false - - 300.forward.input.conf: |- - # Takes the messages sent over TCP, e.g. request logs from Istio - - @type forward - port 24224 - - 900.output.conf: |- - # Send to Stackdriver - # google_cloud plugin moves `kubernetes` metadata to `labels`. - - @type google_cloud - - # Try to detect JSON formatted log entries. - detect_json true - # Allow log entries from multiple containers to be sent in the same request. 
- split_logs_by_tag false - # Set the buffer type to file to improve the reliability and reduce the memory consumption - buffer_type file - buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer - # Set queue_full action to block because we want to pause gracefully - # in case of the off-the-limits load instead of throwing an exception - buffer_queue_full_action block - # Set the chunk limit conservatively to avoid exceeding the recommended - # chunk size of 5MB per write request. - buffer_chunk_limit 1M - # Cap the combined memory usage of this buffer and the one below to - # 1MiB/chunk * (6 + 2) chunks = 8 MiB - buffer_queue_limit 6 - # Never wait more than 5 seconds before flushing logs in the non-error case. - flush_interval 5s - # Never wait longer than 30 seconds between retries. - max_retry_wait 30 - # Disable the limit on the number of retries (retry forever). - disable_retry_limit - # Use multiple threads for processing. - num_threads 2 - use_grpc true - diff --git a/config/monitoring/150-stackdriver-dev/fluentd-configmap.yaml b/config/monitoring/150-stackdriver/fluentd-configmap.yaml similarity index 98% rename from config/monitoring/150-stackdriver-dev/fluentd-configmap.yaml rename to config/monitoring/150-stackdriver/fluentd-configmap.yaml index 8dd2e562aa43..0cf4377e2c16 100644 --- a/config/monitoring/150-stackdriver-dev/fluentd-configmap.yaml +++ b/config/monitoring/150-stackdriver/fluentd-configmap.yaml @@ -28,7 +28,6 @@ data: @id fluentd-containers.log @type tail - # path is different from prod configuration path /var/log/containers/*user-container-*.log,/var/log/containers/*build-step-*.log,/var/log/containers/*controller-*.log,/var/log/containers/*webhook-*.log,/var/log/containers/*autoscaler-*.log,/var/log/containers/*queue-proxy-*.log,/var/log/containers/*activator-*.log pos_file /var/log/es-containers.log.pos time_format %Y-%m-%dT%H:%M:%S.%NZ diff --git a/config/monitoring/README.md b/config/monitoring/README.md index 19dbf50dcc98..29e99af30516 100644 --- a/config/monitoring/README.md +++ b/config/monitoring/README.md @@ -6,7 +6,7 @@ monitoring components by running the following at the root of the repository: ```shell kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ + -f config/monitoring/150-elasticsearch \ -f third_party/config/monitoring/common \ -f third_party/config/monitoring/elasticsearch \ -f config/monitoring/200-common \ @@ -26,9 +26,8 @@ a three digit prefix is added. * The root folder (`config/monitoring`) is special. It requires the following installation ordering: * `/config/monitoring/100-common` - * Only one of `/config/monitoring/150-*`. File with `dev` postfix is a special - configuration that enables verbose logging and should only be used for development - purposes. File with `elasticsearch` or `stackdriver` indicates the logging destination. + * Only one of `/config/monitoring/150-*`. + File with `elasticsearch` or `stackdriver` indicates the logging destination. * `/third_party/config/monitoring/common` * `/third_party/config/monitoring/elasticsearch`. Required only when Elasticsearch is used as logging destination. * `/config/monitoring/200-common` diff --git a/docs/setting-up-a-logging-plugin.md b/docs/setting-up-a-logging-plugin.md index 81cda8039a9e..80b2a4074550 100644 --- a/docs/setting-up-a-logging-plugin.md +++ b/docs/setting-up-a-logging-plugin.md @@ -68,7 +68,7 @@ kubectl apply -f \ ``` In the commands above, replace `` with the -Fluentd DaemonSet configuration file, e.g. 
`config/monitoring/150-stackdriver-prod`. +Fluentd DaemonSet configuration file, e.g. `config/monitoring/150-stackdriver`. **NOTE**: Operators sometimes need to deploy extra services as the logging backends. For example, if they desire Elasticsearch&Kibana, they have to deploy diff --git a/docs/telemetry.md b/docs/telemetry.md index f17c868336ee..db94457a2fe9 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -1,202 +1,23 @@ -# Logs and metrics +# Logs, metrics and traces -## Monitoring components setup +Install monitoring components using +[Monitoring, Logging and Tracing Installation](https://github.com/knative/docs/blob/master/serving/installing-logging-metrics-traces.md). -First, deploy monitoring components. - -### Elasticsearch, Kibana, Prometheus, and Grafana Setup - -You can use two different setups: - -1. **150-elasticsearch-prod**: This configuration collects logs & metrics from -user containers, build controller and Istio requests. - - ```shell - kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ - -f third_party/config/monitoring/common \ - -f third_party/config/monitoring/elasticsearch \ - -f config/monitoring/200-common \ - -f config/monitoring/200-common/100-istio.yaml - ``` - -1. **150-elasticsearch-dev**: This configuration collects everything **150 --elasticsearch-prod** does, plus Knative Serving controller logs. - - ```shell - kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-dev \ - -f third_party/config/monitoring/common \ - -f third_party/config/monitoring/elasticsearch \ - -f config/monitoring/200-common \ - -f config/monitoring/200-common/100-istio.yaml - ``` - -### Stackdriver, Prometheus, and Grafana Setup - -If your Knative Serving is not built on a Google Cloud Platform based cluster, -or you want to send logs to another GCP project, you need to build your own -Fluentd image and modify the configuration first. See - -1. [Fluentd image on Knative Serving](/image/fluentd/README.md) -2. [Setting up a logging plugin](setting-up-a-logging-plugin.md) - -Then you can use two different setups: - -1. **150-stackdriver-prod**: This configuration collects logs and metrics from -user containers, build controller, and Istio requests. - -```shell -kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-stackdriver-prod \ - -f third_party/config/monitoring/common \ - -f config/monitoring/200-common \ - -f config/monitoring/200-common/100-istio.yaml -``` - -2. **150-stackdriver-dev**: This configuration collects everything **150 --stackdriver-prod** does, plus Knative Serving controller logs. - -```shell -kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-stackdriver-dev \ - -f third_party/config/monitoring/common \ - -f config/monitoring/200-common \ - -f config/monitoring/200-common/100-istio.yaml -``` - -## Accessing logs - -### Kibana and Elasticsearch - -To open the Kibana UI (the visualization tool for [Elasticsearch](https://info.elastic.co), -enter the following command: - -```shell -kubectl proxy -``` - -This starts a local proxy of Kibana on port 8001. The Kibana UI is only exposed within -the cluster for security reasons. - -Navigate to the [Kibana UI](http://localhost:8001/api/v1/namespaces/monitoring/services/kibana-logging/proxy/app/kibana) -(*It might take a couple of minutes for the proxy to work*). - -When Kibana is opened the first time, it will ask you to create an index. 
-Accept the default options: - -![Kibana UI Configuring an Index Pattern](images/kibana-landing-page-configure-index.png) - -The Discover tab of the Kibana UI looks like this: - -![Kibana UI Discover tab](images/kibana-discover-tab-annotated.png) - -You can change the time frame of logs Kibana displays in the upper right corner -of the screen. The main search bar is across the top of the Dicover page. - -As more logs are ingested, new fields will be discovered. To have them indexed, -go to Management > Index Patterns > Refresh button (on top right) > Refresh -fields. - - - -#### Accessing configuration and revision logs - -To access the logs for a configuration, enter the following search query in Kibana: - -``` -kubernetes.labels.knative_dev\/configuration: "configuration-example" -``` - -Replace `configuration-example` with your configuration's name. Enter the following -command to get your configuration's name: - -```shell -kubectl get configurations -``` - -To access logs for a revision, enter the following search query in Kibana: - -``` -kubernetes.labels.knative_dev\/revision: "configuration-example-00001" -``` - -Replace `configuration-example-00001` with your revision's name. - -#### Accessing build logs - -To access the logs for a build, enter the following search query in Kibana: - -``` -kubernetes.labels.build\-name: "test-build" -``` - -Replace `test-build` with your build's name. The build name is specified in the `.yaml` file as follows: - -```yaml -apiVersion: build.knative.dev/v1alpha1 -kind: Build -metadata: - name: test-build -``` - -### Stackdriver - -Go to the [Google Cloud Console logging page](https://console.cloud.google.com/logs/viewer) for -your GCP project which stores your logs via Stackdriver. - -## Accessing metrics - -Enter: +To enable logs from Knative components, run: ```shell -kubectl port-forward -n monitoring $(kubectl get pods -n monitoring --selector=app=grafana --output=jsonpath="{.items..metadata.name}") 3000 +kubectl edit configmap config-logging -n knative-serving ``` -Then open the Grafana UI at [http://localhost:3000](http://localhost:3000). The following dashboards are -pre-installed with Knative Serving: - -* **Revision HTTP Requests:** HTTP request count, latency and size metrics per revision and per configuration -* **Nodes:** CPU, memory, network and disk metrics at node level -* **Pods:** CPU, memory and network metrics at pod level -* **Deployment:** CPU, memory and network metrics aggregated at deployment level -* **Istio, Mixer and Pilot:** Detailed Istio mesh, Mixer and Pilot metrics -* **Kubernetes:** Dashboards giving insights into cluster health, deployments and capacity usage - -### Accessing per request traces - -Before you can view per request metrics, you'll need to create a new index pattern that will store -per request traces captured by Zipkin: - -1. Start the Kibana UI serving on local port 8001 by entering the following command: - - ```shell - kubectl proxy - ``` - -1. Open the [Kibana UI](http://localhost:8001/api/v1/namespaces/monitoring/services/kibana-logging/proxy/app/kibana). +and change the level from `fatal` to `info` for Knative components that you are interested in. -1. Navigate to Management -> Index Patterns -> Create Index Pattern. - -1. Enter `zipkin*` in the "Index pattern" text field. - -1. Click **Create**. - -After you've created the Zipkin index pattern, open the -[Zipkin UI](http://localhost:8001/api/v1/namespaces/istio-system/services/zipkin:9411/proxy/zipkin/). 
-Click on "Find Traces" to see the latest traces. You can search for a trace ID -or look at traces of a specific application. Click on a trace to see a detailed -view of a specific call. - -To see a demo of distributed tracing, deploy the -[Telemetry sample](../sample/telemetrysample/README.md), send some traffic to it, -then explore the traces it generates from Zipkin UI. - - +Once finished, visit [Knative Serving](https://github.com/knative/docs/tree/master/serving) +for guides on accessing logs, metrics and traces. ## Default metrics The following metrics are collected by default: + * Knative Serving controller metrics * Istio metrics (mixer, envoy and pilot) * Node and pod metrics @@ -241,8 +62,8 @@ necessary and instrument your code as described in step 3. In the example below, we will setup the service to host the metrics and instrument a sample 'Gauge' type metric using the setup. -1. First, go through [OpenCensus Go Documentation](https://godoc.org/go.opencensus.io). -2. Add the following to your application startup: +1.First, go through [OpenCensus Go Documentation](https://godoc.org/go.opencensus.io). +2.Add the following to your application startup: ```go import ( @@ -306,7 +127,7 @@ func main() { } ``` -3. In your code where you want to instrument, set the counter with the +3.In your code where you want to instrument, set the counter with the appropriate label values - example: ```go @@ -318,7 +139,7 @@ tag.New( stats.Record(ctx, desiredPodCountM.M({Measurement Value})) ``` -4. Add the following to scape config file located at +4.Add the following to scape config file located at config/monitoring/200-common/300-prometheus/100-scrape-config.yaml: ```yaml @@ -348,25 +169,28 @@ config/monitoring/200-common/300-prometheus/100-scrape-config.yaml: replacement: $1 ``` -5. Redeploy prometheus and its configuration: +5.Redeploy prometheus and its configuration: + ```sh kubectl delete -f config/monitoring/200-common/300-prometheus kubectl apply -f config/monitoring/200-common/300-prometheus ``` -6. Add a dashboard for your metrics - you can see examples of it under +6.Add a dashboard for your metrics - you can see examples of it under config/grafana/dashboard-definition folder. An easy way to generate JSON definitions is to use Grafana UI (make sure to login with as admin user) and [export JSON](http://docs.grafana.org/reference/export_import) from it. -7. Validate the metrics flow either by Grafana UI or Prometheus UI (see +7.Validate the metrics flow either by Grafana UI or Prometheus UI (see Troubleshooting section above to enable Prometheus UI) ## Distributed tracing with Zipkin -Check [Telemetry sample](../sample/telemetrysample/README.md) as an example usage of -[OpenZipkin](https://zipkin.io/pages/existing_instrumentations)'s Go client library. + +Check [Telemetry sample](https://github.com/knative/docs/tree/master/serving/samples/telemetry-go) +as an example usage of [OpenZipkin](https://zipkin.io/pages/existing_instrumentations)'s Go client library. ## Delete monitoring components + Enter: ```shell diff --git a/hack/release.sh b/hack/release.sh index 16c75411983b..294d3b87156b 100755 --- a/hack/release.sh +++ b/hack/release.sh @@ -146,7 +146,7 @@ cp ${OUTPUT_YAML} ${NO_MON_YAML} echo "Building Monitoring & Logging" # Use ko to concatenate them all together. 
ko resolve -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ + -f config/monitoring/150-elasticsearch \ -f third_party/config/monitoring/common \ -f third_party/config/monitoring/elasticsearch \ -f config/monitoring/200-common \ @@ -156,7 +156,7 @@ ko resolve -R -f config/monitoring/100-common \ -f third_party/config/monitoring/common/istio \ -f third_party/config/monitoring/common/kubernetes/kube-state-metrics \ -f third_party/config/monitoring/common/prometheus-operator \ - -f config/monitoring/150-elasticsearch-prod/100-scaling-configmap.yaml \ + -f config/monitoring/150-elasticsearch/100-scaling-configmap.yaml \ -f config/monitoring/200-common/100-fluentd.yaml \ -f config/monitoring/200-common/100-grafana-dash-knative-efficiency.yaml \ -f config/monitoring/200-common/100-grafana-dash-knative.yaml \ diff --git a/pkg/controller/configuration/configuration.go b/pkg/controller/configuration/configuration.go index 2d8c57f66527..4307b8ab112d 100644 --- a/pkg/controller/configuration/configuration.go +++ b/pkg/controller/configuration/configuration.go @@ -33,7 +33,6 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/tools/cache" ) @@ -97,7 +96,7 @@ func (c *Controller) Reconcile(key string) error { // Convert the namespace/name string into a distinct namespace and name namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { - runtime.HandleError(fmt.Errorf("invalid resource key: %s", key)) + c.Logger.Errorf("invalid resource key: %s", key) return nil } // Wrap our logger with the additional context of the configuration that we are reconciling. @@ -108,7 +107,7 @@ func (c *Controller) Reconcile(key string) error { original, err := c.configurationLister.Configurations(namespace).Get(name) if errors.IsNotFound(err) { // The resource no longer exists, in which case we stop processing. - runtime.HandleError(fmt.Errorf("configuration %q in work queue no longer exists", key)) + logger.Errorf("configuration %q in work queue no longer exists", key) return nil } else if err != nil { return err diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index f7cf0caed879..8e6d62d69c37 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -156,7 +156,7 @@ func (c *Base) Enqueue(obj interface{}) { var key string var err error if key, err = cache.DeletionHandlingMetaNamespaceKeyFunc(obj); err != nil { - runtime.HandleError(err) + c.Logger.Error(zap.Error(err)) return } c.EnqueueKey(key) @@ -170,7 +170,7 @@ func (c *Base) EnqueueControllerOf(obj interface{}) { // to enqueue the last known owner. object, err := meta.Accessor(obj) if err != nil { - runtime.HandleError(err) + c.Logger.Error(zap.Error(err)) return } @@ -247,7 +247,7 @@ func (c *Base) processNextWorkItem(syncHandler func(string) error) bool { // Forget here else we'd go into a loop of attempting to // process a work item that is invalid. 
c.WorkQueue.Forget(obj) - runtime.HandleError(fmt.Errorf("expected string in workqueue but got %#v", obj)) + c.Logger.Errorf("expected string in workqueue but got %#v", obj) return nil } // Run the syncHandler, passing it the namespace/name string of the @@ -263,7 +263,7 @@ func (c *Base) processNextWorkItem(syncHandler func(string) error) bool { }(obj) if err != nil { - runtime.HandleError(err) + c.Logger.Error(zap.Error(err)) return true } diff --git a/pkg/controller/revision/resources/autoscaler.go b/pkg/controller/revision/resources/autoscaler.go index ec583c5ab1e6..8c9dc85c5a98 100644 --- a/pkg/controller/revision/resources/autoscaler.go +++ b/pkg/controller/revision/resources/autoscaler.go @@ -139,6 +139,10 @@ func MakeAutoscalerDeployment(rev *v1alpha1.Revision, autoscalerImage string, re }}, Args: []string{ fmt.Sprintf("-concurrencyModel=%v", rev.Spec.ConcurrencyModel), + // Disable glog writing into stderr. Our code doesn't use glog + // and seeing k8s logs in addition to ours is not useful. + "-logtostderr=false", + "-stderrthreshold=FATAL", }, VolumeMounts: autoscalerVolumeMounts, }}, diff --git a/pkg/controller/revision/revision.go b/pkg/controller/revision/revision.go index 03885006d431..aea6850d40c0 100644 --- a/pkg/controller/revision/revision.go +++ b/pkg/controller/revision/revision.go @@ -53,7 +53,6 @@ import ( apierrs "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/runtime" appsv1listers "k8s.io/client-go/listers/apps/v1" corev1listers "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -226,7 +225,7 @@ func (c *Controller) Reconcile(key string) error { // Convert the namespace/name string into a distinct namespace and name namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { - runtime.HandleError(fmt.Errorf("invalid resource key: %s", key)) + c.Logger.Errorf("invalid resource key: %s", key) return nil } @@ -238,7 +237,7 @@ func (c *Controller) Reconcile(key string) error { original, err := c.revisionLister.Revisions(namespace).Get(name) // The resource may no longer exist, in which case we stop processing. if apierrs.IsNotFound(err) { - runtime.HandleError(fmt.Errorf("revision %q in work queue no longer exists", key)) + logger.Errorf("revision %q in work queue no longer exists", key) return nil } else if err != nil { return err diff --git a/pkg/controller/route/route.go b/pkg/controller/route/route.go index e04817d74212..53e92e2f67cb 100644 --- a/pkg/controller/route/route.go +++ b/pkg/controller/route/route.go @@ -24,7 +24,6 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" apierrs "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/runtime" corev1informers "k8s.io/client-go/informers/core/v1" corev1listers "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/tools/cache" @@ -128,7 +127,7 @@ func (c *Controller) Reconcile(key string) error { // Convert the namespace/name string into a distinct namespace and name namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { - runtime.HandleError(fmt.Errorf("invalid resource key: %s", key)) + c.Logger.Errorf("invalid resource key: %s", key) return nil } @@ -139,7 +138,7 @@ func (c *Controller) Reconcile(key string) error { original, err := c.routeLister.Routes(namespace).Get(name) if apierrs.IsNotFound(err) { // The resource may no longer exist, in which case we stop processing. 
- runtime.HandleError(fmt.Errorf("route %q in work queue no longer exists", key)) + logger.Errorf("route %q in work queue no longer exists", key) return nil } else if err != nil { return err @@ -230,7 +229,6 @@ func (c *Controller) EnqueueReferringRoute(obj interface{}) { return } if config.Status.LatestReadyRevisionName == "" { - fmt.Printf("Configuration %s is not ready\n", config.Name) c.Logger.Infof("Configuration %s is not ready", config.Name) return } diff --git a/pkg/controller/service/service.go b/pkg/controller/service/service.go index 6d4a7c393eb8..e6b5f356afca 100644 --- a/pkg/controller/service/service.go +++ b/pkg/controller/service/service.go @@ -18,7 +18,6 @@ package service import ( "context" - "fmt" "reflect" "github.com/google/go-cmp/cmp" @@ -27,7 +26,6 @@ import ( "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" apierrs "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/tools/cache" "github.com/knative/serving/pkg/apis/serving/v1alpha1" @@ -113,7 +111,7 @@ func (c *Controller) Reconcile(key string) error { // Convert the namespace/name string into a distinct namespace and name namespace, name, err := cache.SplitMetaNamespaceKey(key) if err != nil { - runtime.HandleError(fmt.Errorf("invalid resource key: %s", key)) + c.Logger.Errorf("invalid resource key: %s", key) return nil } @@ -125,7 +123,7 @@ func (c *Controller) Reconcile(key string) error { original, err := c.serviceLister.Services(namespace).Get(name) if apierrs.IsNotFound(err) { // The resource may no longer exist, in which case we stop processing. - runtime.HandleError(fmt.Errorf("service %q in work queue no longer exists", key)) + logger.Errorf("service %q in work queue no longer exists", key) return nil } else if err != nil { return err diff --git a/pkg/logging/config.go b/pkg/logging/config.go index a1086f537b9d..62753ffeae32 100644 --- a/pkg/logging/config.go +++ b/pkg/logging/config.go @@ -131,3 +131,22 @@ func levelFromString(level string) (*zapcore.Level, error) { } return &zapLevel, nil } + +// UpdateLevelFromConfigMap returns a helper func that can be used to update the logging level +// when a config map is updated +func UpdateLevelFromConfigMap(logger *zap.SugaredLogger, atomicLevel zap.AtomicLevel, levelKey string) func(configMap *corev1.ConfigMap) { + return func(configMap *corev1.ConfigMap) { + loggingConfig, err := NewConfigFromConfigMap(configMap) + if err != nil { + logger.Error("Failed to parse the logging configmap. 
Previous config map will be used.", zap.Error(err)) + return + } + + if level, ok := loggingConfig.LoggingLevel[levelKey]; ok { + if atomicLevel.Level() != level { + logger.Infof("Updating logging level for %v from %v to %v.", levelKey, atomicLevel.Level(), level) + atomicLevel.SetLevel(level) + } + } + } +} diff --git a/pkg/logging/config_test.go b/pkg/logging/config_test.go index 6b6324cbf383..76a2eeaa5115 100644 --- a/pkg/logging/config_test.go +++ b/pkg/logging/config_test.go @@ -234,3 +234,42 @@ func getTestConfig() (*Config, string, string) { }) return c, wantCfg, wantLevel } + +func TestUpdateLevelFromConfigMap(t *testing.T) { + logger, atomicLevel := NewLogger("", "debug") + want := zapcore.DebugLevel + if atomicLevel.Level() != zapcore.DebugLevel { + t.Fatalf("Expected initial logger level to %v, got: %v", want, atomicLevel.Level()) + } + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: system.Namespace, + Name: "config-logging", + }, + Data: map[string]string{ + "zap-logger-config": "", + "loglevel.controller": "panic", + }, + } + + tests := []struct { + setLevel string + wantLevel zapcore.Level + }{ + {"info", zapcore.InfoLevel}, + {"error", zapcore.ErrorLevel}, + {"invalid", zapcore.ErrorLevel}, + {"debug", zapcore.DebugLevel}, + {"debug", zapcore.DebugLevel}, + } + + u := UpdateLevelFromConfigMap(logger, atomicLevel, "controller") + for _, tt := range tests { + cm.Data["loglevel.controller"] = tt.setLevel + u(cm) + if atomicLevel.Level() != tt.wantLevel { + t.Errorf("Invalid logging level. want: %v, got: %v", tt.wantLevel, atomicLevel.Level()) + } + } +} diff --git a/test/e2e-tests.sh b/test/e2e-tests.sh index 27c51c88ae67..9cdbf9ae2c4f 100755 --- a/test/e2e-tests.sh +++ b/test/e2e-tests.sh @@ -50,7 +50,7 @@ function create_istio() { function create_monitoring() { kubectl apply -R -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ + -f config/monitoring/150-elasticsearch \ -f third_party/config/monitoring/common \ -f third_party/config/monitoring/elasticsearch \ -f config/monitoring/200-common \ @@ -71,7 +71,7 @@ function delete_istio() { function delete_monitoring() { kubectl delete --ignore-not-found=true -f config/monitoring/100-common \ - -f config/monitoring/150-elasticsearch-prod \ + -f config/monitoring/150-elasticsearch \ -f third_party/config/monitoring/common \ -f third_party/config/monitoring/elasticsearch \ -f config/monitoring/200-common From d1194e134d1efc925fbd81dccf63982c4f32b40c Mon Sep 17 00:00:00 2001 From: mdemirhan Date: Wed, 18 Jul 2018 15:52:23 -0700 Subject: [PATCH 2/6] Unit test fixes. 
--- pkg/controller/revision/resources/autoscaler_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/controller/revision/resources/autoscaler_test.go b/pkg/controller/revision/resources/autoscaler_test.go index feca46a9e135..bb9ce166d280 100644 --- a/pkg/controller/revision/resources/autoscaler_test.go +++ b/pkg/controller/revision/resources/autoscaler_test.go @@ -208,7 +208,7 @@ func TestMakeAutoscalerDeployment(t *testing.T) { Name: "SERVING_AUTOSCALER_PORT", Value: strconv.Itoa(AutoscalerPort), }}, - Args: []string{"-concurrencyModel=Single"}, + Args: []string{"-concurrencyModel=Single", "-logtostderr=false", "-stderrthreshold=FATAL"}, VolumeMounts: autoscalerVolumeMounts, }}, ServiceAccountName: "autoscaler", @@ -293,7 +293,7 @@ func TestMakeAutoscalerDeployment(t *testing.T) { Name: "SERVING_AUTOSCALER_PORT", Value: strconv.Itoa(AutoscalerPort), }}, - Args: []string{"-concurrencyModel=Multi"}, + Args: []string{"-concurrencyModel=Multi", "-logtostderr=false", "-stderrthreshold=FATAL"}, VolumeMounts: autoscalerVolumeMounts, }}, ServiceAccountName: "autoscaler", @@ -385,7 +385,7 @@ func TestMakeAutoscalerDeployment(t *testing.T) { Name: "SERVING_AUTOSCALER_PORT", Value: strconv.Itoa(AutoscalerPort), }}, - Args: []string{"-concurrencyModel=Multi"}, + Args: []string{"-concurrencyModel=Multi", "-logtostderr=false", "-stderrthreshold=FATAL"}, VolumeMounts: autoscalerVolumeMounts, }}, ServiceAccountName: "autoscaler", @@ -476,7 +476,7 @@ func TestMakeAutoscalerDeployment(t *testing.T) { Name: "SERVING_AUTOSCALER_PORT", Value: strconv.Itoa(AutoscalerPort), }}, - Args: []string{"-concurrencyModel=Multi"}, + Args: []string{"-concurrencyModel=Multi", "-logtostderr=false", "-stderrthreshold=FATAL"}, VolumeMounts: autoscalerVolumeMounts, }}, ServiceAccountName: "autoscaler", @@ -567,7 +567,7 @@ func TestMakeAutoscalerDeployment(t *testing.T) { Name: "SERVING_AUTOSCALER_PORT", Value: strconv.Itoa(AutoscalerPort), }}, - Args: []string{"-concurrencyModel=Multi"}, + Args: []string{"-concurrencyModel=Multi", "-logtostderr=false", "-stderrthreshold=FATAL"}, VolumeMounts: autoscalerVolumeMounts, }}, ServiceAccountName: "autoscaler", From 31236844ee31590a844539f1b577c5bc2d031781 Mon Sep 17 00:00:00 2001 From: mdemirhan Date: Fri, 20 Jul 2018 11:19:43 -0700 Subject: [PATCH 3/6] Spelling fixes. --- DEVELOPMENT.md | 2 +- docs/telemetry.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8ba7f37b299a..ff87d25d10a7 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -154,7 +154,7 @@ Next, run: ```shell ko apply -f config/ -# Set the logging threhold to info for all Knative components and reapply the config +# Set the logging threshold to info for all Knative components and reapply the config sed 's/\"fatal\"/\"info\"/g' config/config-logging.yaml | kubectl apply -f - ``` diff --git a/docs/telemetry.md b/docs/telemetry.md index db94457a2fe9..f5797d035ac0 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -62,8 +62,8 @@ necessary and instrument your code as described in step 3. In the example below, we will setup the service to host the metrics and instrument a sample 'Gauge' type metric using the setup. -1.First, go through [OpenCensus Go Documentation](https://godoc.org/go.opencensus.io). -2.Add the following to your application startup: +1. First, go through [OpenCensus Go Documentation](https://godoc.org/go.opencensus.io). +2. 
Add the following to your application startup: ```go import ( From a5ba2471a1e8af60fc4f90e62993db8ca05f2560 Mon Sep 17 00:00:00 2001 From: mdemirhan Date: Fri, 20 Jul 2018 11:30:26 -0700 Subject: [PATCH 4/6] Add controller type information to activator, autoscaler and webhook loggers. --- cmd/activator/main.go | 4 +++- cmd/autoscaler/main.go | 1 + cmd/webhook/main.go | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/activator/main.go b/cmd/activator/main.go index 5c48a8c41999..eb6bd8005950 100644 --- a/cmd/activator/main.go +++ b/cmd/activator/main.go @@ -25,6 +25,8 @@ import ( "net/url" "time" + "github.com/knative/serving/pkg/logging/logkey" + "github.com/knative/serving/pkg/activator" clientset "github.com/knative/serving/pkg/client/clientset/versioned" "github.com/knative/serving/pkg/configmap" @@ -158,7 +160,7 @@ func main() { } logger, atomicLevel := logging.NewLoggerFromConfig(config, logLevelKey) defer logger.Sync() - + logger = logger.With(zap.String(logkey.ControllerType, "activator")) logger.Info("Starting the knative activator") clusterConfig, err := rest.InClusterConfig() diff --git a/cmd/autoscaler/main.go b/cmd/autoscaler/main.go index 182b758f2d00..d249083a9d78 100644 --- a/cmd/autoscaler/main.go +++ b/cmd/autoscaler/main.go @@ -234,6 +234,7 @@ func main() { initEnv() logger = logger.With( + zap.String(logkey.ControllerType, "autoscaler"), zap.String(logkey.Namespace, servingNamespace), zap.String(logkey.Configuration, servingConfig), zap.String(logkey.Revision, servingRevision)) diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 094388aec80c..2ba09c202c92 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -21,6 +21,7 @@ import ( "go.uber.org/zap" + "github.com/knative/build/pkg/logging/logkey" "github.com/knative/serving/pkg/configmap" "github.com/knative/serving/pkg/logging" "github.com/knative/serving/pkg/signals" @@ -47,6 +48,7 @@ func main() { } logger, atomicLevel := logging.NewLoggerFromConfig(config, logLevelKey) defer logger.Sync() + logger = logger.With(zap.String(logkey.ControllerType, "webhook")) logger.Info("Starting the Configuration Webhook") From e1a899c52147e54c5e9ced3cea67ef8f251bead2 Mon Sep 17 00:00:00 2001 From: mdemirhan Date: Fri, 20 Jul 2018 11:34:14 -0700 Subject: [PATCH 5/6] Fixes. 
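The fix: the previous commit pulled `logkey` from the knative/build repository
by mistake; the webhook should import this repository's `pkg/logging/logkey`
package instead. With that corrected, the activator, autoscaler and webhook all
tag their loggers the same way. If the pattern keeps spreading it could be
captured in a small helper; the snippet below is purely illustrative and not
part of this change (only the zap and logkey APIs already used above are
assumed):

```go
import (
	"go.uber.org/zap"

	"github.com/knative/serving/pkg/logging/logkey"
)

// componentLogger is a hypothetical helper, not in this series: it tags a
// logger with the component name under logkey.ControllerType, which is what
// each main() currently does inline.
func componentLogger(base *zap.SugaredLogger, component string) *zap.SugaredLogger {
	return base.With(zap.String(logkey.ControllerType, component))
}

// Usage, e.g. in cmd/webhook/main.go:
//   logger = componentLogger(logger, "webhook")
```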
--- cmd/webhook/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 2ba09c202c92..b2cda3b5eda1 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -21,9 +21,9 @@ import ( "go.uber.org/zap" - "github.com/knative/build/pkg/logging/logkey" "github.com/knative/serving/pkg/configmap" "github.com/knative/serving/pkg/logging" + "github.com/knative/serving/pkg/logging/logkey" "github.com/knative/serving/pkg/signals" "github.com/knative/serving/pkg/system" "github.com/knative/serving/pkg/webhook" From 140c3ad392ce6f7c7fcc13cae6142a653178afa5 Mon Sep 17 00:00:00 2001 From: Mustafa Demirhan Date: Mon, 23 Jul 2018 12:44:49 -0700 Subject: [PATCH 6/6] Change the default level to info --- DEVELOPMENT.md | 3 --- cmd/autoscaler/main.go | 2 +- config/config-logging.yaml | 17 ++++++++--------- docs/telemetry.md | 12 ++---------- 4 files changed, 11 insertions(+), 23 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index ff87d25d10a7..22cbb13660d0 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -153,9 +153,6 @@ Next, run: ```shell ko apply -f config/ - -# Set the logging threshold to info for all Knative components and reapply the config -sed 's/\"fatal\"/\"info\"/g' config/config-logging.yaml | kubectl apply -f - ``` You can see things running with: diff --git a/cmd/autoscaler/main.go b/cmd/autoscaler/main.go index d249083a9d78..243d74e2daa3 100644 --- a/cmd/autoscaler/main.go +++ b/cmd/autoscaler/main.go @@ -271,6 +271,7 @@ func main() { // Watch the logging config map and dynamically update logging levels. stopCh := make(chan struct{}) + defer close(stopCh) configMapWatcher := configmap.NewDefaultWatcher(kubeClient, system.Namespace) configMapWatcher.Watch(logging.ConfigName, logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey)) if err := configMapWatcher.Start(stopCh); err != nil { @@ -284,5 +285,4 @@ func main() { mux.HandleFunc("/", handler) mux.Handle("/metrics", exporter) http.ListenAndServe(":"+servingAutoscalerPort, mux) - close(stopCh) } diff --git a/config/config-logging.yaml b/config/config-logging.yaml index 05bd8b26fbd5..de338a67c688 100644 --- a/config/config-logging.yaml +++ b/config/config-logging.yaml @@ -42,12 +42,11 @@ data: } # Log level overrides - # By default, we only log critical errors. - # To enable logging for Knative components, change log levels below to "info" - # For all components except the autoscaler, changes will be picked up immediately. - # Autoscaler changes require recreation of the revision until the multi-tenant autoscaler lands. - loglevel.controller: "fatal" - loglevel.autoscaler: "fatal" - loglevel.queueproxy: "fatal" - loglevel.webhook: "fatal" - loglevel.activator: "fatal" + # For all components except the autoscaler and queue proxy, + # changes are be picked up immediately. + # For autoscaler and queue proxy, changes require recreation of the pods. + loglevel.controller: "info" + loglevel.autoscaler: "info" + loglevel.queueproxy: "info" + loglevel.webhook: "info" + loglevel.activator: "info" diff --git a/docs/telemetry.md b/docs/telemetry.md index f5797d035ac0..36aa5d5a28b6 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -2,16 +2,8 @@ Install monitoring components using [Monitoring, Logging and Tracing Installation](https://github.com/knative/docs/blob/master/serving/installing-logging-metrics-traces.md). 
- -To enable logs from Knative components, run: - -```shell -kubectl edit configmap config-logging -n knative-serving -``` - -and change the level from `fatal` to `info` for Knative components that you are interested in. - -Once finished, visit [Knative Serving](https://github.com/knative/docs/tree/master/serving) +Once finished, visit +[Knative Serving](https://github.com/knative/docs/tree/master/serving) for guides on accessing logs, metrics and traces. ## Default metrics
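
A closing note on the autoscaler change in the last commit: the config map
watcher and the HTTP server interact as sketched below (condensed from the
cmd/autoscaler/main.go hunk above; identifiers are the ones used there, and the
error handling inside the `if` is assumed since the hunk does not show it).
Moving `close(stopCh)` into a `defer` makes it run on every return path from
main, rather than only in the case where ListenAndServe returns.

```go
// Condensed sketch of main() in cmd/autoscaler/main.go after this series.
stopCh := make(chan struct{})
defer close(stopCh) // stops the watcher on every exit path

// Watch the logging config map and dynamically update logging levels.
configMapWatcher := configmap.NewDefaultWatcher(kubeClient, system.Namespace)
configMapWatcher.Watch(logging.ConfigName,
	logging.UpdateLevelFromConfigMap(logger, atomicLevel, logLevelKey))
if err := configMapWatcher.Start(stopCh); err != nil {
	logger.Fatalf("failed to start configuration watcher: %v", err) // assumed handling
}

// ... metrics exporter and HTTP mux setup elided ...

// ListenAndServe blocks for the lifetime of the process, so any cleanup placed
// after it (like the old close(stopCh)) would only run if the server failed.
http.ListenAndServe(":"+servingAutoscalerPort, mux)
```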