From 03caa135bdbbc93e3bef34c306cee1ca0cbff115 Mon Sep 17 00:00:00 2001 From: Michael Sinz Date: Mon, 8 Feb 2021 23:56:10 +0000 Subject: [PATCH 1/2] Add priority class to the daemonsets Add a priority class for omsagent and have the daemonsets use this to be sure to schedule the pods. Daemonset pods are constrained in scheduling to run on specific nodes. This is done by the daemonset controller. When a node shows up it will create a pod with a strong affinity to that node. When a node goes away, it will delete the pod with the node affinity to that node. Kubernetes pod scheduling does not know it is a daemonset but it does know it is tied to a specific node. With default scheduling, it is possible for the pods to be "frozen out" of a node because the node already is full. This can happen because "normal" pods may already exist and are looking for a node to get scheduled on when a node is added to the cluster. The daemonset controller will only first create the pod for the node at around the same time. The kubernetes scheduler is running async from all of this and thus there can be a race as to who gets scheduled on the node. The pod priority class (and thus the pod priority) is a way to indicate that the pod has a higher scheduling priority than a default pod. By default, all pods are at priority 0. Higher numbers are higher priority. Setting the priority to something greater than zero will allow the omsagent daemonsets to win a race against "normal" pods for scheduled resources on a node - and will also allow for graceful eviction in the case the node is too full. Without this, omsagent can be left out of a node in clusters that are very busy, especially in dynamic scaling situations. I did not test the windows pod as we have no windows clusters. 
--- .../templates/omsagent-daemonset-windows.yaml | 5 +-- .../templates/omsagent-daemonset.yaml | 13 +++---- .../templates/omsagent-priorityclass.yaml | 22 ++++++++++++ charts/azuremonitor-containers/values.yaml | 36 +++++++++++++------ 4 files changed, 57 insertions(+), 19 deletions(-) create mode 100644 charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml index 81003c704..82d210f3d 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset-windows.yaml @@ -27,10 +27,11 @@ spec: checksum/secret: {{ include (print $.Template.BasePath "/omsagent-secret.yaml") . | sha256sum }} checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} spec: - dnsConfig: + priorityClassName: omsagent + dnsConfig: options: - name: ndots - value: "3" + value: "3" {{- if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion }} nodeSelector: kubernetes.io/os: windows diff --git a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml index 595edd7bb..0272c6263 100644 --- a/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-daemonset.yaml @@ -28,10 +28,11 @@ spec: checksum/config: {{ toYaml .Values.omsagent.resources | sha256sum }} checksum/logsettings: {{ toYaml .Values.omsagent.logsettings | sha256sum }} spec: - dnsConfig: + priorityClassName: omsagent + dnsConfig: options: - name: ndots - value: "3" + value: "3" {{- if .Values.omsagent.rbac }} serviceAccountName: omsagent {{- end }} @@ -70,12 +71,12 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP - {{- if not (empty .Values.Azure.Extension.Name) }} + {{- if not (empty .Values.Azure.Extension.Name) }} - 
name: ARC_K8S_EXTENSION_NAME - value: {{ .Values.Azure.Extension.Name | quote }} - {{- end }} + value: {{ .Values.Azure.Extension.Name | quote }} + {{- end }} - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "" + value: "" {{- if .Values.omsagent.logsettings.logflushintervalsecs }} - name: FBIT_SERVICE_FLUSH_INTERVAL value: {{ .Values.omsagent.logsettings.logflushintervalsecs | quote }} diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml new file mode 100644 index 000000000..c0ce8187c --- /dev/null +++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml @@ -0,0 +1,22 @@ +{{- if and (ne .Values.omsagent.secret.key "") (ne .Values.omsagent.secret.wsid "") (or (ne .Values.omsagent.env.clusterName "") (ne .Values.omsagent.env.clusterId "") (ne .Values.Azure.Cluster.ResourceId "") )}} +# This pod priority class is used for daemonsets to allow them to have priority +# over pods that can be scheduled elsewhere. Without a priority class, it is +# possible for a node to fill up with pods before the daemonset pods get to be +# created for the pod or get scheduled. 
Note that pods are not "daemonset" +# pods - they are just pods created by the daemonset controller but they have +# a specific affinity set during creation to the specific node each pod was +# created to run on (daemonset controller takes care of that) +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: omsagent + # Priority classes don't have labels :-) + annotations: + chart: {{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + component: oms-agent +value: {{ .Values.omsagent.priority }} +globalDefault: false +description: "This is the daemonset priority class for omsagent" +{{- end }} diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 341b9fb65..5601a5738 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -8,9 +8,9 @@ Azure: Cluster: Region: - ResourceId: + ResourceId: Extension: - Name: "" + Name: "" ResourceId: "" omsagent: image: @@ -20,6 +20,20 @@ omsagent: pullPolicy: IfNotPresent dockerProviderVersion: "12.0.0-0" agentVersion: "1.10.0.1" + + # The priority used by the omsagent priority class for the daemonset pods + # Note that this is not execution priority - it is scheduling priority, as + # in getting scheduled to the node. This needs to be greater than 0 such + # that the daemonset pods, which can not schedule onto different nodes as + # they are defined to run on specific nodes, are not accidentally frozen + # out of a node due to other pods showing up earlier in scheduling. 
+ # (DaemonSet pods by definition only are created once the node exists for + # them to be created for and thus it is possible to have "normal" pods + # already in line to run on the node before the DaemonSet controller got a + # chance to build the pod for the node and give it to the scheduler) + # Should be some number greater than default (0) + priority: 10 + ## To get your workspace id and key do the following ## You can create a Azure Loganalytics workspace from portal.azure.com and get its ID & PRIMARY KEY from 'Advanced Settings' tab in the Ux. @@ -61,7 +75,7 @@ omsagent: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - labelSelector: - matchExpressions: + matchExpressions: - key: kubernetes.io/os operator: In values: @@ -70,10 +84,10 @@ omsagent: operator: NotIn values: - virtual-kubelet - - key: kubernetes.io/arch + - key: kubernetes.io/arch operator: In values: - - amd64 + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -85,10 +99,10 @@ omsagent: operator: NotIn values: - virtual-kubelet - - key: beta.kubernetes.io/arch + - key: beta.kubernetes.io/arch operator: In values: - - amd64 + - amd64 deployment: affinity: nodeAffinity: @@ -117,10 +131,10 @@ omsagent: operator: NotIn values: - master - - key: kubernetes.io/arch + - key: kubernetes.io/arch operator: In values: - - amd64 + - amd64 nodeSelectorTerms: - labelSelector: matchExpressions: @@ -136,10 +150,10 @@ omsagent: operator: NotIn values: - master - - key: beta.kubernetes.io/arch + - key: beta.kubernetes.io/arch operator: In values: - - amd64 + - amd64 ## Configure resource requests and limits ## ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## From 2c783fa9e9b63a22c84abd837b12b2e68051e12f Mon Sep 17 00:00:00 2001 From: Michael Sinz Date: Tue, 9 Feb 2021 12:43:52 +0000 Subject: [PATCH 2/2] CR feedback --- charts/azuremonitor-containers/README.md | 2 ++ .../templates/omsagent-priorityclass.yaml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff 
--git a/charts/azuremonitor-containers/README.md b/charts/azuremonitor-containers/README.md index 469fac94a..a3f17b509 100644 --- a/charts/azuremonitor-containers/README.md +++ b/charts/azuremonitor-containers/README.md @@ -93,6 +93,7 @@ The following table lists the configurable parameters of the MSOMS chart and the | `omsagent.env.clusterName` | Name of your cluster | Does not have a default value, needs to be provided | | `omsagent.rbac` | rbac enabled/disabled | true (i.e.enabled) | | `omsagent.proxy` | Proxy endpoint | Doesnt have default value. Refer to [configure proxy](#Configuring-Proxy-Endpoint) | +| `omsagent.priority` | DaemonSet Pod Priority | This is the [priority](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/) to use for the daemonsets such that they get scheduled onto the node ahead of "normal" pods - must be an integer, defaults to 10 | > Note: For Azure Manage K8s clusters such as Azure Arc K8s and ARO v4, `omsagent.env.clusterId` with fully qualified azure resource id of the cluster should be used instead of `omsagent.env.clusterName` @@ -100,6 +101,7 @@ The following table lists the configurable parameters of the MSOMS chart and the - Parameter `omsagent.env.doNotCollectKubeSystemLogs` has been removed starting chart version 1.0.0. Refer to 'Agent data collection settings' section below to configure it using configmap. - onboarding of multiple clusters with the same cluster name to same log analytics workspace not supported. If need this configuration, use the cluster FQDN name rather than cluster dns prefix to avoid collision with clusterName +- The `omsagent.priority` parameter sets the priority of the omsagent daemonset priority class. This pod priority class is used for daemonsets to allow them to have priority over pods that can be scheduled elsewhere. Without a priority class, it is possible for a node to fill up with "normal" pods before the daemonset pods get to be created for the node or get scheduled. 
Note that pods are not "daemonset" pods - they are just pods created by the daemonset controller but they have a specific affinity set during creation to the specific node each pod was created to run on. You want this value to be greater than 0 (default is 10) and generally greater than pods that have the flexibility to run on different nodes such that they do not block the node specific pods. ## Agent data collection settings diff --git a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml index c0ce8187c..4d9980ab3 100644 --- a/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml +++ b/charts/azuremonitor-containers/templates/omsagent-priorityclass.yaml @@ -2,7 +2,7 @@ # This pod priority class is used for daemonsets to allow them to have priority # over pods that can be scheduled elsewhere. Without a priority class, it is # possible for a node to fill up with pods before the daemonset pods get to be -# created for the pod or get scheduled. Note that pods are not "daemonset" +# created for the node or get scheduled. Note that pods are not "daemonset" # pods - they are just pods created by the daemonset controller but they have # a specific affinity set during creation to the specific node each pod was # created to run on (daemonset controller takes care of that)