diff --git a/kubernetes/container-azm-ms-vpaconfig.yaml b/kubernetes/container-azm-ms-vpaconfig.yaml new file mode 100644 index 000000000..9734a59f7 --- /dev/null +++ b/kubernetes/container-azm-ms-vpaconfig.yaml @@ -0,0 +1,13 @@ +kind: ConfigMap +apiVersion: v1 +data: + NannyConfiguration: |- + apiVersion: nannyconfig/v1alpha1 + kind: NannyConfiguration + baseCPU: 200m + cpuPerNode: 2m + baseMemory: 350Mi + memoryPerNode: 4Mi +metadata: + name: container-azm-ms-vpaconfig + namespace: kube-system diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index c11650b9e..42a96acaa 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -27,6 +27,11 @@ rules: - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] verbs: ["list"] + # Uncomment below lines if AddonResizer VPA is enabled + # - apiGroups: ["apps"] + # resources: ["deployments"] + # resourceNames: [ "omsagent-rs" ] + # verbs: ["get", "patch"] # Uncomment below lines for MSI Auth Mode testing # - apiGroups: [""] # resources: ["secrets"] @@ -617,6 +622,42 @@ spec: spec: serviceAccountName: omsagent containers: + # Uncomment below lines to enable VPA + # # Make sure this matches the version on the AKS RP side + # - image: "mcr.microsoft.com/oss/kubernetes/autoscaler/addon-resizer:1.8.14" + # imagePullPolicy: IfNotPresent + # name: omsagent-vpa + # resources: + # limits: + # cpu: 100m + # memory: 300Mi + # requests: + # cpu: 5m + # memory: 30Mi + # env: + # - name: MY_POD_NAME + # valueFrom: + # fieldRef: + # fieldPath: metadata.name + # - name: MY_POD_NAMESPACE + # valueFrom: + # fieldRef: + # fieldPath: metadata.namespace + # volumeMounts: + # - name: omsagent-rs-vpa-config-volume + # mountPath: /etc/config + # command: + # - /pod_nanny + # - --config-dir=/etc/config + # - --cpu=200m + # - --extra-cpu=2m + # - --memory=300Mi + # - --extra-memory=4Mi + # - --poll-period=180000 + # - --threshold=5 + # - --namespace=kube-system 
+ # - --deployment=omsagent-rs + # - --container=omsagent # Uncomment below lines for MSI Auth Mode testing # - name: addon-token-adapter # command: @@ -655,6 +696,7 @@ spec: - name: omsagent image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022-hotfix" imagePullPolicy: IfNotPresent + # Comment out resources if VPA is configured, since the VPA will set these values resources: limits: cpu: 1 @@ -695,6 +737,9 @@ spec: # Uncomment below lines for MSI Auth Mode testing # - name: USING_AAD_MSI_AUTH # value: "true" + # Uncomment below lines when the Addon-resizer VPA is enabled + # - name: RS_ADDON-RESIZER_VPA_ENABLED + # value: "true" securityContext: privileged: true ports: @@ -798,6 +843,11 @@ spec: configMap: name: container-azm-ms-osmconfig optional: true + # Uncomment below lines to enable VPA + # - name: omsagent-rs-vpa-config-volume + # configMap: + # name: omsagent-rs-vpa-config + # optional: true --- apiVersion: apps/v1 kind: DaemonSet diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index b34cb20ee..6f499e8bd 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -22,6 +22,7 @@ class ApplicationInsightsUtility @@EnvControllerType = "CONTROLLER_TYPE" @@EnvContainerRuntime = "CONTAINER_RUNTIME" @@EnvAADMSIAuthMode = "AAD_MSI_AUTH_MODE" + @@EnvAddonResizerVPAEnabled = "RS_ADDON-RESIZER_VPA_ENABLED" @@isWindows = false @@hostName = (OMS::Common.get_hostname) @@ -93,6 +94,10 @@ def initializeUtility() else @@CustomProperties["aadAuthMSIMode"] = "false" end + addonResizerVPAEnabled = ENV[@@EnvAddonResizerVPAEnabled] + if !addonResizerVPAEnabled.nil? && !addonResizerVPAEnabled.empty? 
&& addonResizerVPAEnabled.downcase == "true".downcase + @@CustomProperties["addonResizerVPAEnabled"] = "true" + end #Check if telemetry is turned off telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 6828109b3..9e1ea467c 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -37,7 +37,10 @@ class KubernetesApiClient @Log = Logger.new(@LogPath, 2, 10 * 1048576) #keep last 2 files, max log file size = 10M @@TokenFileName = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@TokenStr = nil - @@telemetryTimeTracker = DateTime.now.to_time.to_i + @@cpuLimitsTelemetryTimeTracker = DateTime.now.to_time.to_i + @@cpuRequestsTelemetryTimeTracker = DateTime.now.to_time.to_i + @@memoryLimitsTelemetryTimeTracker = DateTime.now.to_time.to_i + @@memoryRequestsTelemetryTimeTracker = DateTime.now.to_time.to_i @@resourceLimitsTelemetryHash = {} def initialize @@ -470,6 +473,7 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle if podUid.nil? 
return metricItems end + podName = pod["metadata"]["name"] nodeName = "" #for unscheduled (non-started) pods nodeName does NOT exist @@ -514,8 +518,12 @@ def getContainerResourceRequestsAndLimits(pod, metricCategory, metricNameToColle metricCollections.push(metricCollection) metricProps["json_Collections"] = metricCollections.to_json metricItems.push(metricProps) - #No container level limit for the given metric, so default to node level limit + + if isAddonResizerVPAEnabled() + sendReplicasetAgentRequestsAndLimitsTelemetry(podName, podNameSpace, containerName, metricNametoReturn, metricValue) + end else + #No container level limit for the given metric, so default to node level limit if (metricCategory == "limits" && !nodeAllocatableRecord.nil? && !nodeAllocatableRecord.empty? && nodeAllocatableRecord.has_key?(metricNameToCollect)) metricValue = getMetricNumericValue(metricNameToCollect, nodeAllocatableRecord[metricNameToCollect]) metricProps = {} @@ -1394,5 +1402,55 @@ def isEmitCacheTelemetry end return isEmitCacheTelemtryEnabled end + + def isAddonResizerVPAEnabled + isAddonResizerVPAEnabled = false + if !ENV["RS_ADDON-RESIZER_VPA_ENABLED"].nil? && !ENV["RS_ADDON-RESIZER_VPA_ENABLED"].empty? && ENV["RS_ADDON-RESIZER_VPA_ENABLED"].downcase == "true".downcase + isAddonResizerVPAEnabled = true + end + return isAddonResizerVPAEnabled + end + + def sendReplicasetAgentRequestsAndLimitsTelemetry(podName, podNameSpace, containerName, metricName, metricValue) + begin + if (!podName.nil? 
&& podName.downcase.start_with?("omsagent-rs-") && podNameSpace.eql?("kube-system") && containerName.eql?("omsagent")) + telemetryProps = {} + telemetryProps["PodName"] = podName + telemetryProps["ContainerName"] = containerName + case metricName + when "cpuLimitNanoCores" + timeDifference = (DateTime.now.to_time.to_i - @@cpuLimitsTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@cpuLimitsTelemetryTimeTracker = DateTime.now.to_time.to_i + ApplicationInsightsUtility.sendMetricTelemetry(metricName, metricValue, telemetryProps) + end + when "memoryLimitBytes" + timeDifference = (DateTime.now.to_time.to_i - @@memoryLimitsTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@memoryLimitsTelemetryTimeTracker = DateTime.now.to_time.to_i + ApplicationInsightsUtility.sendMetricTelemetry(metricName, metricValue, telemetryProps) + end + when "cpuRequestNanoCores" + timeDifference = (DateTime.now.to_time.to_i - @@cpuRequestsTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@cpuRequestsTelemetryTimeTracker = DateTime.now.to_time.to_i + ApplicationInsightsUtility.sendMetricTelemetry(metricName, metricValue, telemetryProps) + end + when "memoryRequestBytes" + timeDifference = (DateTime.now.to_time.to_i - @@memoryRequestsTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) + @@memoryRequestsTelemetryTimeTracker = DateTime.now.to_time.to_i + ApplicationInsightsUtility.sendMetricTelemetry(metricName, metricValue, telemetryProps) + end + end + end + rescue => err + @Log.warn "KubernetesApiClient::sendReplicasetAgentRequestsAndLimitsTelemetry failed with an error: 
#{err}" + end + end end end