From 1e0000b6afbaec67b51ff0fc054d3c96d41c0639 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Mon, 3 Jan 2022 15:21:42 -0800 Subject: [PATCH 01/21] feature implemented? --- .../scripts/tomlparser-prom-customconfig.rb | 8 +- build/linux/installer/conf/env_vars | 0 .../installer/datafiles/base_container.data | 1 + .../linux/installer/scripts/livenessprobe.sh | 42 +-- .../scripts/tomlparser-osm-config.rb | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 12 +- kubernetes/linux/main.sh | 295 ++++++++---------- kubernetes/linux/setup.sh | 2 +- source/plugins/go/src/telemetry.go | 2 +- source/plugins/ruby/in_kube_nodes.rb | 2 +- 10 files changed, 170 insertions(+), 196 deletions(-) create mode 100644 build/linux/installer/conf/env_vars diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb index 819c1956f..33a3d611e 100644 --- a/build/common/installer/scripts/tomlparser-prom-customconfig.rb +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -223,7 +223,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) File.open(file_name, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") + file = File.open("prom_config_env_var", "w") if !file.nil? file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") #Setting array lengths as environment variables for telemetry purposes @@ -325,10 +325,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" #Set environment variables for telemetry in the sidecar container if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) - file = File.open("telemetry_prom_config_env_var", "w") + file = File.open("prom_config_env_var", "w") if !file.nil? #Setting array lengths as environment variables for telemetry purposes - file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") @@ -381,7 +381,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" #Set environment variables for telemetry - file = File.open("telemetry_prom_config_env_var", "w") + file = File.open("prom_config_env_var", "w") if !file.nil? file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") #Setting array lengths as environment variables for telemetry purposes diff --git a/build/linux/installer/conf/env_vars b/build/linux/installer/conf/env_vars new file mode 100644 index 000000000..e69de29bb diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index d104a5084..676113974 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -55,6 +55,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root /opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/test.json; build/linux/installer/conf/test.json; 644; root; root +/opt/env_vars; build/linux/installer/conf/env_vars; 644; root; root diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 8ecb7fe44..4091e34ab 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -1,4 +1,28 @@ #!/bin/bash +source /opt/env_vars + +if [ -s "inotifyoutput.txt" ] +then + # inotifyoutput file has data(config map was applied) + echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log + exit 1 +fi + +# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled +if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || + ( ( -e "/etc/config/kube.conf" ) && ( ( ! -z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then + if [ -s "inotifyoutput-osm.txt" ] + then + # inotifyoutput-osm file has data(config map was applied) + echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log + exit 1 + fi +fi + +# if this is the prometheus sidecar and kubernetes pod scraping is not enabled then the rest of the liveness probe doesn't apply +if [[ "${CONTAINER_TYPE}" == "PrometheusSidecar" && "${MUTE_PROM_SIDECAR}" == "true" ]]; then + exit 0 +fi #test to exit non zero value if mdsd is not running (ps -ef | grep "mdsd" | grep -v "grep") @@ -53,22 +77,4 @@ then # exit 1 fi -if [ -s "inotifyoutput.txt" ] -then - # inotifyoutput file has data(config map was applied) - echo "inotifyoutput.txt has been updated - config changed" > /dev/termination-log - exit 1 -fi - -# Perform the following check only for prometheus sidecar that does OSM scraping or for replicaset when sidecar scraping is disabled -if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) ) || - ( ( -e "/etc/config/kube.conf" ) && ( ( ! -z "${SIDECAR_SCRAPING_ENABLED}" ) && ( "${SIDECAR_SCRAPING_ENABLED}" == "false" ) ) ) ]]; then - if [ -s "inotifyoutput-osm.txt" ] - then - # inotifyoutput-osm file has data(config map was applied) - echo "inotifyoutput-osm.txt has been updated - config changed" > /dev/termination-log - exit 1 - fi -fi - exit 0 diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb index 096064db8..676ff8832 100644 --- a/build/linux/installer/scripts/tomlparser-osm-config.rb +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -151,7 +151,7 @@ def replaceOsmTelegrafConfigPlaceHolders telemetryFile = File.open("integration_osm_config_env_var", "w") if !telemetryFile.nil? - telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + telemetryFile.write("export OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") # Close file after writing all environment variables telemetryFile.close else diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 328acb201..c3b010287 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -34,14 +34,14 @@ data: # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] - # In the absense of this configmap, default value for enrich_container_logs is false + # In the absense of this configmap, default value for enrich_container_logs is false enabled = false - # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image + # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] - # In the absense of this configmap, default value for collect_all_kube_events is false + # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false - # When this is enabled (enabled = true), all kube events including normal events will be collected + # When this is enabled (enabled = true), all kube events including normal events will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -106,10 +106,10 @@ data: metric_collection_settings: |- # Metrics collection settings for metrics sent to Log Analytics and MDM [metric_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected enabled = false - # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index a9184ab53..0aea6d341 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,5 +1,13 @@ #!/bin/bash + +# usage: writeGlobalVar ENABLE_SIDECAR_SCRAPING true +writeGlobalVar() { + export "$1"="$2" + echo "export \"$1\"=\"$2\"" >> /opt/env_vars +} +echo "source /opt/env_vars" >> ~/.bashrc + waitforlisteneronTCPport() { local sleepdurationsecs=1 local totalsleptsecs=0 @@ -82,7 +90,6 @@ checkAgentOnboardingStatus() { fi } - #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -99,13 +106,9 @@ fi if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else - export customResourceId=$AKS_RESOURCE_ID - echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc - source ~/.bashrc + writeGlobalVar "customResourceId" "$AKS_RESOURCE_ID" echo "customResourceId:$customResourceId" - export customRegion=$AKS_REGION - echo "export customRegion=$AKS_REGION" >> ~/.bashrc - source ~/.bashrc + writeGlobalVar "customRegion" "$AKS_REGION" echo "customRegion:$customRegion" fi @@ -118,9 +121,7 @@ if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_schema_version="$(echo $config_schema_version| cut -c1-10)" - export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version - echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc - source ~/.bashrc + writeGlobalVar "AZMON_AGENT_CFG_SCHEMA_VERSION" "$config_schema_version" echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi @@ -133,9 +134,7 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_file_version="$(echo $config_file_version| cut -c1-10)" - export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version - echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc - source ~/.bashrc + writeGlobalVar "AZMON_AGENT_CFG_FILE_VERSION" $config_file_version echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi @@ -150,9 +149,7 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus #take first 10 characters osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" - export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version - echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc - source ~/.bashrc + writeGlobalVar "AZMON_OSM_CFG_SCHEMA_VERSION" "$osm_config_schema_version" echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi fi @@ -198,18 +195,12 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then echo $pwd > /opt/microsoft/docker-cimprov/proxy_password - export MDSD_PROXY_MODE=application - echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc - export MDSD_PROXY_ADDRESS=$proto$hostport - echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc - export MDSD_PROXY_USERNAME=$user - echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc - export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password - echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc - + writeGlobalVar "MDSD_PROXY_MODE" "application" + writeGlobalVar "MDSD_PROXY_ADDRESS" "$proto$hostport" + writeGlobalVar "MDSD_PROXY_USERNAME" "$user" + writeGlobalVar "MDSD_PROXY_PASSWORD_FILE" "/opt/microsoft/docker-cimprov/proxy_password" #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD - export MDSD_ODS_COMPRESSION_LEVEL=0 - echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc + writeGlobalVar "MDSD_ODS_COMPRESSION_LEVEL" "0" fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -268,14 +259,10 @@ elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then CLOUD_ENVIRONMENT="ussec" fi -export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT -echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc - +writeGlobalVar "CLOUD_ENVIRONMENT" $CLOUD_ENVIRONMENT #consisten naming conventions with the windows -export DOMAIN=$domain -echo "export DOMAIN=$DOMAIN" >> ~/.bashrc -export WSID=$workspaceId -echo "export WSID=$WSID" >> ~/.bashrc +writeGlobalVar "DOMAIN" $domain +writeGlobalVar "WSID" $workspaceId # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) @@ -291,30 +278,25 @@ if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSI # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then - export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) - echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc + writeGlobalVar "APPLICATIONINSIGHTS_AUTH" $(echo $KEY) echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. Disable telemetry and continue - export DISABLE_TELEMETRY=true - echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc + writeGlobalVar "DISABLE_TELEMETRY" "true" echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" fi fi - aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +writeGlobalVar "TELEMETRY_APPLICATIONINSIGHTS_KEY" "$aikey" -source ~/.bashrc if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then #Parse the configmap to set the right environment variables. /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source config_env_var fi @@ -325,8 +307,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-agent-config.rb cat agent_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source agent_config_env_var @@ -334,8 +315,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-npm-config.rb cat integration_npm_config_env_var | while read line; do - #echo $line - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source integration_npm_config_env_var fi @@ -352,28 +332,28 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then cat defaultpromenvvariables-sidecar | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source defaultpromenvvariables-sidecar else cat defaultpromenvvariables | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source defaultpromenvvariables fi else cat defaultpromenvvariables-rs | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source defaultpromenvvariables-rs fi #Sourcing telemetry environment variable file if it exists -if [ -e "telemetry_prom_config_env_var" ]; then - cat telemetry_prom_config_env_var | while read line; do - echo $line >> ~/.bashrc +if [ -e "prom_config_env_var" ]; then + cat prom_config_env_var | while read line; do + echo $line >> /opt/env_vars done - source telemetry_prom_config_env_var + source prom_config_env_var fi #Parse sidecar agent settings for custom configuration @@ -384,7 +364,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then #Sourcing config environment variable file if it exists if [ -e "side_car_fbit_config_env_var" ]; then cat side_car_fbit_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source side_car_fbit_config_env_var fi @@ -397,7 +377,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source config_mdm_metrics_env_var @@ -405,7 +385,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source config_metric_collection_env_var fi @@ -417,12 +397,21 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source integration_osm_config_env_var fi fi + +if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && + ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && + ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then + writeGlobalVar MUTE_PROM_SIDECAR true +else + writeGlobalVar MUTE_PROM_SIDECAR false +fi + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 @@ -438,18 +427,14 @@ export NODE_NAME="" if [ "$cAdvisorIsSecure" = true ]; then echo "Wget request using port 10250 succeeded. Using 10250" - export IS_SECURE_CADVISOR_PORT=true - echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc - export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" - echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc + writeGlobalVar "IS_SECURE_CADVISOR_PORT" "true" + writeGlobalVar "CADVISOR_METRICS_URL" "https://$NODE_IP:10250/metrics" echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Wget request using port 10250 failed. Using port 10255" - export IS_SECURE_CADVISOR_PORT=false - echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc - export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" - echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc + writeGlobalVar "IS_SECURE_CADVISOR_PORT" "false" + writeGlobalVar "CADVISOR_METRICS_URL" "http://$NODE_IP:10255/metrics" echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -470,6 +455,7 @@ if [ ! -z "$podWithValidContainerId" ]; then if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else + #TODO: This is never written to bashrc or /opt/env_vars, why? export NODE_NAME=$nodeName fi else @@ -477,29 +463,23 @@ else fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc +writeGlobalVar "CONTAINER_RUNTIME" "$CONTAINER_RUNTIME" -export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" -echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc -export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc +writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC" "kubelet_runtime_operations_total" +writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" "kubelet_runtime_operations_errors_total" # default to docker metrics -export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" -export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors" - if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_runtime_operations" + writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_runtime_operations_errors" +else + writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_docker_operations" + writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_docker_operations_errors" fi echo "set caps for ruby process to read container env from proc" sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 -echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc -echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc - -source ~/.bashrc echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. @@ -513,17 +493,14 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" -export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION -echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +writeGlobalVar "DOCKER_CIMPROV_VERSION" "$DOCKER_CIMPROV_VERSION" #skip imds lookup since not used either legacy or aad msi auth path -export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +writeGlobalVar "SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" "true" # this used by mdsd to determine cloud specific LA endpoints -export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc +writeGlobalVar "OMS_TLD" "$domain" cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc + echo $line >> /opt/env_vars done source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" @@ -533,48 +510,32 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then echo "*** activating oneagent in aad auth msi mode ***" # msi auth specific args MDSD_AAD_MSI_AUTH_ARGS="-a -A" - export AAD_MSI_AUTH_MODE=true - echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc + writeGlobalVar "AAD_MSI_AUTH_MODE" "true" # this used by mdsd to determine the cloud specific AMCS endpoints - export customEnvironment=$CLOUD_ENVIRONMENT - echo "export customEnvironment=$customEnvironment" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="28230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - export ENABLE_MCS="true" - echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc - export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" - echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc - export MDSD_USE_LOCAL_PERSISTENCY="false" - echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc + writeGlobalVar "customEnvironment" "$CLOUD_ENVIRONMENT" + writeGlobalVar "ENABLE_MCS" "true" + writeGlobalVar "MONITORING_USE_GENEVA_CONFIG_SERVICE" "false" + writeGlobalVar "MDSD_USE_LOCAL_PERSISTENCY" "false" else - echo "*** activating oneagent in legacy auth mode ***" - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - #use the file path as its secure than env - CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile - echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="29230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + echo "*** activating oneagent in legacy auth mode ***" + writeGlobalVar "AAD_MSI_AUTH_MODE" "false" + #use the file path as its secure than env + writeGlobalVar "CIWORKSPACE_id" "$(cat /etc/omsagent-secret/WSID)" + writeGlobalVar "CIWORKSPACE_keyFile" "/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" fi -source ~/.bashrc +writeGlobalVar "MDSD_FLUENT_SOCKET_PORT" "29230" dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc - source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." + writeGlobalVar "MDSD_ROLE_PREFIX" "/var/run/mdsd-${CONTAINER_TYPE}/default" + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + fi else echo "starting mdsd mode in main container..." # add -T 0xFFFF for full traces @@ -601,13 +562,17 @@ fi #If config parsing was successful, a copy of the conf file with replaced custom settings file is created if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ] && [ -e "/opt/telegraf-test-prom-side-car.conf" ]; then - echo "****************Start Telegraf in Test Mode**************************" - /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf --input-filter file -test - if [ $? -eq 0 ]; then - mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" - echo "Moving test conf file to telegraf side-car conf since test run succeeded" + if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + echo "****************Start Telegraf in Test Mode**************************" + /opt/telegraf --config /opt/telegraf-test-prom-side-car.conf --input-filter file -test + if [ $? -eq 0 ]; then + mv "/opt/telegraf-test-prom-side-car.conf" "/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + echo "Moving test conf file to telegraf side-car conf since test run succeeded" + fi + echo "****************End Telegraf Run in Test Mode**************************" + else + echo "****************Skipping Telegraf Run in Test Mode**************************" fi - echo "****************End Telegraf Run in Test Mode**************************" else if [ -e "/opt/telegraf-test.conf" ]; then echo "****************Start Telegraf in Test Mode**************************" @@ -634,9 +599,13 @@ fi #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" - /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" + /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" + else + echo "not starting fluent-bit in prometheus sidecar (monitor_kubernetes_pods == false)" + fi else echo "starting fluent-bit and setting telegraf conf file for daemonset" if [ "$CONTAINER_RUNTIME" == "docker" ]; then @@ -670,16 +639,11 @@ else telemetry_cluster_type="AKS" fi -export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id -echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc -export TELEMETRY_AKS_REGION=$telemetry_aks_region -echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc -export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name -echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc -export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name -echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc -export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type -echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc +writeGlobalVar "TELEMETRY_AKS_RESOURCE_ID" "$telemetry_aks_resource_id" +writeGlobalVar "TELEMETRY_AKS_REGION" "$telemetry_aks_region" +writeGlobalVar "TELEMETRY_CLUSTER_NAME" "$telemetry_cluster_name" +writeGlobalVar "TELEMETRY_ACS_RESOURCE_NAME" "$telemetry_acs_resource_name" +writeGlobalVar "TELEMETRY_CLUSTER_TYPE" "$telemetry_cluster_type" #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -690,36 +654,37 @@ echo "nodename: $nodename" echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile -export HOST_MOUNT_PREFIX=/hostfs -echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc -export HOST_PROC=/hostfs/proc -echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc -export HOST_SYS=/hostfs/sys -echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc -export HOST_ETC=/hostfs/etc -echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc -export HOST_VAR=/hostfs/var -echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc +writeGlobalVar "HOST_MOUNT_PREFIX" "/hostfs" +writeGlobalVar "HOST_PROC" "/hostfs/proc" +writeGlobalVar "HOST_SYS" "/hostfs/sys" +writeGlobalVar "HOST_ETC" "/hostfs/etc" +writeGlobalVar "HOST_VAR" "/hostfs/var" -if [ ! -e "/etc/config/kube.conf" ]; then - if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." - waitforlisteneronTCPport 25229 30 +if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25229 30 + else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 + echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25228 30 + fi else echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." waitforlisteneronTCPport 25226 30 - echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." - waitforlisteneronTCPport 25228 30 fi -else - echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." - waitforlisteneronTCPport 25226 30 fi #start telegraf -/opt/telegraf --config $telegrafConfFile & -/opt/telegraf --version -dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' +if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + /opt/telegraf --config $telegrafConfFile & + echo "telegraf version: $(/opt/telegraf --version)" + dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' +else + echo "not starting telegraf (monitor_kubernetes_pods != true)" +fi #dpkg -l | grep telegraf | awk '{print $2 " " $3}' @@ -732,7 +697,9 @@ service rsyslog stop echo "getting rsyslog status..." service rsyslog status -checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 +if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then + checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 +fi shutdown() { pkill -f mdsd diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 5bddfc604..c87ab4feb 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -32,7 +32,7 @@ sudo apt-get install libcap2-bin -y wget https://dl.influxdata.com/telegraf/releases/telegraf-1.18.0_linux_amd64.tar.gz tar -zxvf telegraf-1.18.0_linux_amd64.tar.gz - +rm telegraf-1.18.0_linux_amd64.tar.gz mv /opt/telegraf-1.18.0/usr/bin/telegraf /opt/telegraf chmod 777 /opt/telegraf diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 31818dbb3..eeb1a99eb 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -354,7 +354,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } } - PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") + PromMonitorPods = os.Getenv("CUSTOM_PROM_MONITOR_PODS") PromMonitorPodsNamespaceLength = 0 promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a32a32769..bf16224ac 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -48,7 +48,7 @@ def initialize (kubernetesApiClient=nil, @@rsPromMonitorPodsLabelSelectorLength = @env["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] @@rsPromMonitorPodsFieldSelectorLength = @env["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = @env["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] - @@osmNamespaceCount = @env["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] + @@osmNamespaceCount = @env["OSM_CONFIGURATION_NAMESPACES_COUNT"] @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" From d9a95d38782419a3273bfa20b78d491a82edfd9d Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 5 Jan 2022 13:53:36 -0800 Subject: [PATCH 02/21] small changes --- .../linux/installer/scripts/livenessprobe.sh | 2 +- kubernetes/linux/main.sh | 120 +++++++++--------- 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 4091e34ab..e74da46b3 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -19,7 +19,7 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus fi fi -# if this is the prometheus sidecar and kubernetes pod scraping is not enabled then the rest of the liveness probe doesn't apply +# if this is the prometheus sidecar and there are no prometheus metrics to scrape then the rest of the liveness probe doesn't apply if [[ "${CONTAINER_TYPE}" == "PrometheusSidecar" && "${MUTE_PROM_SIDECAR}" == "true" ]]; then exit 0 fi diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 0aea6d341..66fa64f6a 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,8 +1,9 @@ #!/bin/bash -# usage: writeGlobalVar ENABLE_SIDECAR_SCRAPING true -writeGlobalVar() { +# please use this instead of adding env vars to bashrc directly +# usage: setGlobalEnvVar ENABLE_SIDECAR_SCRAPING true +setGlobalEnvVar() { export "$1"="$2" echo "export \"$1\"=\"$2\"" >> /opt/env_vars } @@ -90,6 +91,7 @@ checkAgentOnboardingStatus() { fi } + #using /var/opt/microsoft/docker-cimprov/state instead of /var/opt/microsoft/omsagent/state since the latter gets deleted during onboarding mkdir -p /var/opt/microsoft/docker-cimprov/state @@ -106,9 +108,9 @@ fi if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else - writeGlobalVar "customResourceId" "$AKS_RESOURCE_ID" + setGlobalEnvVar "customResourceId" "$AKS_RESOURCE_ID" echo "customResourceId:$customResourceId" - writeGlobalVar "customRegion" "$AKS_REGION" + setGlobalEnvVar "customRegion" "$AKS_REGION" echo "customRegion:$customRegion" fi @@ -121,7 +123,7 @@ if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_schema_version="$(echo $config_schema_version| cut -c1-10)" - writeGlobalVar "AZMON_AGENT_CFG_SCHEMA_VERSION" "$config_schema_version" + setGlobalEnvVar "AZMON_AGENT_CFG_SCHEMA_VERSION" "$config_schema_version" echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi @@ -134,7 +136,7 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_file_version="$(echo $config_file_version| cut -c1-10)" - writeGlobalVar "AZMON_AGENT_CFG_FILE_VERSION" $config_file_version + setGlobalEnvVar "AZMON_AGENT_CFG_FILE_VERSION" $config_file_version echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi @@ -149,7 +151,7 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus #take first 10 characters osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" - writeGlobalVar "AZMON_OSM_CFG_SCHEMA_VERSION" "$osm_config_schema_version" + setGlobalEnvVar "AZMON_OSM_CFG_SCHEMA_VERSION" "$osm_config_schema_version" echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi fi @@ -195,12 +197,12 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then echo $pwd > /opt/microsoft/docker-cimprov/proxy_password - writeGlobalVar "MDSD_PROXY_MODE" "application" - writeGlobalVar "MDSD_PROXY_ADDRESS" "$proto$hostport" - writeGlobalVar "MDSD_PROXY_USERNAME" "$user" - writeGlobalVar "MDSD_PROXY_PASSWORD_FILE" "/opt/microsoft/docker-cimprov/proxy_password" + setGlobalEnvVar "MDSD_PROXY_MODE" "application" + setGlobalEnvVar "MDSD_PROXY_ADDRESS" "$proto$hostport" + setGlobalEnvVar "MDSD_PROXY_USERNAME" "$user" + setGlobalEnvVar "MDSD_PROXY_PASSWORD_FILE" "/opt/microsoft/docker-cimprov/proxy_password" #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD - writeGlobalVar "MDSD_ODS_COMPRESSION_LEVEL" "0" + setGlobalEnvVar "MDSD_ODS_COMPRESSION_LEVEL" "0" fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -259,10 +261,10 @@ elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then CLOUD_ENVIRONMENT="ussec" fi -writeGlobalVar "CLOUD_ENVIRONMENT" $CLOUD_ENVIRONMENT +setGlobalEnvVar "CLOUD_ENVIRONMENT" $CLOUD_ENVIRONMENT #consisten naming conventions with the windows -writeGlobalVar "DOMAIN" $domain -writeGlobalVar "WSID" $workspaceId +setGlobalEnvVar "DOMAIN" $domain +setGlobalEnvVar "WSID" $workspaceId # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) @@ -278,17 +280,16 @@ if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSI # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then - writeGlobalVar "APPLICATIONINSIGHTS_AUTH" $(echo $KEY) + setGlobalEnvVar "APPLICATIONINSIGHTS_AUTH" $(echo $KEY) echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. Disable telemetry and continue - writeGlobalVar "DISABLE_TELEMETRY" "true" + setGlobalEnvVar "DISABLE_TELEMETRY" "true" echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" fi fi - aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -writeGlobalVar "TELEMETRY_APPLICATIONINSIGHTS_KEY" "$aikey" +setGlobalEnvVar "TELEMETRY_APPLICATIONINSIGHTS_KEY" "$aikey" if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then @@ -403,13 +404,13 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus fi fi - +# If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it. if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then - writeGlobalVar MUTE_PROM_SIDECAR true + setGlobalEnvVar MUTE_PROM_SIDECAR true else - writeGlobalVar MUTE_PROM_SIDECAR false + setGlobalEnvVar MUTE_PROM_SIDECAR false fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request @@ -427,14 +428,14 @@ export NODE_NAME="" if [ "$cAdvisorIsSecure" = true ]; then echo "Wget request using port 10250 succeeded. Using 10250" - writeGlobalVar "IS_SECURE_CADVISOR_PORT" "true" - writeGlobalVar "CADVISOR_METRICS_URL" "https://$NODE_IP:10250/metrics" + setGlobalEnvVar "IS_SECURE_CADVISOR_PORT" "true" + setGlobalEnvVar "CADVISOR_METRICS_URL" "https://$NODE_IP:10250/metrics" echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Wget request using port 10250 failed. Using port 10255" - writeGlobalVar "IS_SECURE_CADVISOR_PORT" "false" - writeGlobalVar "CADVISOR_METRICS_URL" "http://$NODE_IP:10255/metrics" + setGlobalEnvVar "IS_SECURE_CADVISOR_PORT" "false" + setGlobalEnvVar "CADVISOR_METRICS_URL" "http://$NODE_IP:10255/metrics" echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -455,27 +456,26 @@ if [ ! -z "$podWithValidContainerId" ]; then if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else - #TODO: This is never written to bashrc or /opt/env_vars, why? - export NODE_NAME=$nodeName + setGlobalEnvVar NODE_NAME $nodeName fi else echo "-e error either /pods API request failed or no running pods" fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -writeGlobalVar "CONTAINER_RUNTIME" "$CONTAINER_RUNTIME" +setGlobalEnvVar "CONTAINER_RUNTIME" "$CONTAINER_RUNTIME" -writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC" "kubelet_runtime_operations_total" -writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" "kubelet_runtime_operations_errors_total" +setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC" "kubelet_runtime_operations_total" +setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" "kubelet_runtime_operations_errors_total" # default to docker metrics if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_runtime_operations" - writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_runtime_operations_errors" + setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_runtime_operations" + setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_runtime_operations_errors" else - writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_docker_operations" - writeGlobalVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_docker_operations_errors" + setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_docker_operations" + setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_docker_operations_errors" fi echo "set caps for ruby process to read container env from proc" @@ -493,12 +493,12 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" -writeGlobalVar "DOCKER_CIMPROV_VERSION" "$DOCKER_CIMPROV_VERSION" +setGlobalEnvVar "DOCKER_CIMPROV_VERSION" "$DOCKER_CIMPROV_VERSION" #skip imds lookup since not used either legacy or aad msi auth path -writeGlobalVar "SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" "true" +setGlobalEnvVar "SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" "true" # this used by mdsd to determine cloud specific LA endpoints -writeGlobalVar "OMS_TLD" "$domain" +setGlobalEnvVar "OMS_TLD" "$domain" cat /etc/mdsd.d/envmdsd | while read line; do echo $line >> /opt/env_vars done @@ -510,28 +510,28 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then echo "*** activating oneagent in aad auth msi mode ***" # msi auth specific args MDSD_AAD_MSI_AUTH_ARGS="-a -A" - writeGlobalVar "AAD_MSI_AUTH_MODE" "true" + setGlobalEnvVar "AAD_MSI_AUTH_MODE" "true" # this used by mdsd to determine the cloud specific AMCS endpoints - writeGlobalVar "customEnvironment" "$CLOUD_ENVIRONMENT" - writeGlobalVar "ENABLE_MCS" "true" - writeGlobalVar "MONITORING_USE_GENEVA_CONFIG_SERVICE" "false" - writeGlobalVar "MDSD_USE_LOCAL_PERSISTENCY" "false" + setGlobalEnvVar "customEnvironment" "$CLOUD_ENVIRONMENT" + setGlobalEnvVar "ENABLE_MCS" "true" + setGlobalEnvVar "MONITORING_USE_GENEVA_CONFIG_SERVICE" "false" + setGlobalEnvVar "MDSD_USE_LOCAL_PERSISTENCY" "false" else echo "*** activating oneagent in legacy auth mode ***" - writeGlobalVar "AAD_MSI_AUTH_MODE" "false" + setGlobalEnvVar "AAD_MSI_AUTH_MODE" "false" #use the file path as its secure than env - writeGlobalVar "CIWORKSPACE_id" "$(cat /etc/omsagent-secret/WSID)" - writeGlobalVar "CIWORKSPACE_keyFile" "/etc/omsagent-secret/KEY" + setGlobalEnvVar "CIWORKSPACE_id" "$(cat /etc/omsagent-secret/WSID)" + setGlobalEnvVar "CIWORKSPACE_keyFile" "/etc/omsagent-secret/KEY" echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" fi -writeGlobalVar "MDSD_FLUENT_SOCKET_PORT" "29230" +setGlobalEnvVar "MDSD_FLUENT_SOCKET_PORT" "29230" dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." - writeGlobalVar "MDSD_ROLE_PREFIX" "/var/run/mdsd-${CONTAINER_TYPE}/default" + setGlobalEnvVar "MDSD_ROLE_PREFIX" "/var/run/mdsd-${CONTAINER_TYPE}/default" mkdir /var/run/mdsd-${CONTAINER_TYPE} # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & @@ -604,7 +604,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else - echo "not starting fluent-bit in prometheus sidecar (monitor_kubernetes_pods == false)" + echo "not starting fluent-bit in prometheus sidecar (no metrics to scrape)" fi else echo "starting fluent-bit and setting telegraf conf file for daemonset" @@ -639,11 +639,11 @@ else telemetry_cluster_type="AKS" fi -writeGlobalVar "TELEMETRY_AKS_RESOURCE_ID" "$telemetry_aks_resource_id" -writeGlobalVar "TELEMETRY_AKS_REGION" "$telemetry_aks_region" -writeGlobalVar "TELEMETRY_CLUSTER_NAME" "$telemetry_cluster_name" -writeGlobalVar "TELEMETRY_ACS_RESOURCE_NAME" "$telemetry_acs_resource_name" -writeGlobalVar "TELEMETRY_CLUSTER_TYPE" "$telemetry_cluster_type" +setGlobalEnvVar "TELEMETRY_AKS_RESOURCE_ID" "$telemetry_aks_resource_id" +setGlobalEnvVar "TELEMETRY_AKS_REGION" "$telemetry_aks_region" +setGlobalEnvVar "TELEMETRY_CLUSTER_NAME" "$telemetry_cluster_name" +setGlobalEnvVar "TELEMETRY_ACS_RESOURCE_NAME" "$telemetry_acs_resource_name" +setGlobalEnvVar "TELEMETRY_CLUSTER_TYPE" "$telemetry_cluster_type" #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -654,11 +654,11 @@ echo "nodename: $nodename" echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile -writeGlobalVar "HOST_MOUNT_PREFIX" "/hostfs" -writeGlobalVar "HOST_PROC" "/hostfs/proc" -writeGlobalVar "HOST_SYS" "/hostfs/sys" -writeGlobalVar "HOST_ETC" "/hostfs/etc" -writeGlobalVar "HOST_VAR" "/hostfs/var" +setGlobalEnvVar "HOST_MOUNT_PREFIX" "/hostfs" +setGlobalEnvVar "HOST_PROC" "/hostfs/proc" +setGlobalEnvVar "HOST_SYS" "/hostfs/sys" +setGlobalEnvVar "HOST_ETC" "/hostfs/etc" +setGlobalEnvVar "HOST_VAR" "/hostfs/var" if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then if [ ! -e "/etc/config/kube.conf" ]; then @@ -683,7 +683,7 @@ if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "telegraf version: $(/opt/telegraf --version)" dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' else - echo "not starting telegraf (monitor_kubernetes_pods != true)" + echo "not starting telegraf (no metrics to scrape)" fi #dpkg -l | grep telegraf | awk '{print $2 " " $3}' From b5af70424b6e8cfcd804088545a7cca18d78258d Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 5 Jan 2022 16:30:09 -0800 Subject: [PATCH 03/21] switching plans, will replace export && source ~/.bashrc in main.sh incrementally --- kubernetes/linux/main.sh | 207 ++++++++++++++++++++++++++------------- 1 file changed, 137 insertions(+), 70 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 66fa64f6a..c8f0a177b 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -108,9 +108,13 @@ fi if [ -z $AKS_RESOURCE_ID ]; then echo "not setting customResourceId" else - setGlobalEnvVar "customResourceId" "$AKS_RESOURCE_ID" + export customResourceId=$AKS_RESOURCE_ID + echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc + source ~/.bashrc echo "customResourceId:$customResourceId" - setGlobalEnvVar "customRegion" "$AKS_REGION" + export customRegion=$AKS_REGION + echo "export customRegion=$AKS_REGION" >> ~/.bashrc + source ~/.bashrc echo "customRegion:$customRegion" fi @@ -123,7 +127,9 @@ if [ -e "/etc/config/settings/schema-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_schema_version="$(echo $config_schema_version| cut -c1-10)" - setGlobalEnvVar "AZMON_AGENT_CFG_SCHEMA_VERSION" "$config_schema_version" + export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version + echo "export AZMON_AGENT_CFG_SCHEMA_VERSION=$config_schema_version" >> ~/.bashrc + source ~/.bashrc echo "AZMON_AGENT_CFG_SCHEMA_VERSION:$AZMON_AGENT_CFG_SCHEMA_VERSION" fi @@ -136,7 +142,9 @@ if [ -e "/etc/config/settings/config-version" ] && [ -s "/etc/config/settings/ #take first 10 characters config_file_version="$(echo $config_file_version| cut -c1-10)" - setGlobalEnvVar "AZMON_AGENT_CFG_FILE_VERSION" $config_file_version + export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version + echo "export AZMON_AGENT_CFG_FILE_VERSION=$config_file_version" >> ~/.bashrc + source ~/.bashrc echo "AZMON_AGENT_CFG_FILE_VERSION:$AZMON_AGENT_CFG_FILE_VERSION" fi @@ -151,7 +159,9 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus #take first 10 characters osm_config_schema_version="$(echo $osm_config_schema_version| cut -c1-10)" - setGlobalEnvVar "AZMON_OSM_CFG_SCHEMA_VERSION" "$osm_config_schema_version" + export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version + echo "export AZMON_OSM_CFG_SCHEMA_VERSION=$osm_config_schema_version" >> ~/.bashrc + source ~/.bashrc echo "AZMON_OSM_CFG_SCHEMA_VERSION:$AZMON_OSM_CFG_SCHEMA_VERSION" fi fi @@ -197,12 +207,18 @@ if [ -e "/etc/omsagent-secret/WSID" ]; then echo $pwd > /opt/microsoft/docker-cimprov/proxy_password - setGlobalEnvVar "MDSD_PROXY_MODE" "application" - setGlobalEnvVar "MDSD_PROXY_ADDRESS" "$proto$hostport" - setGlobalEnvVar "MDSD_PROXY_USERNAME" "$user" - setGlobalEnvVar "MDSD_PROXY_PASSWORD_FILE" "/opt/microsoft/docker-cimprov/proxy_password" + export MDSD_PROXY_MODE=application + echo "export MDSD_PROXY_MODE=$MDSD_PROXY_MODE" >> ~/.bashrc + export MDSD_PROXY_ADDRESS=$proto$hostport + echo "export MDSD_PROXY_ADDRESS=$MDSD_PROXY_ADDRESS" >> ~/.bashrc + export MDSD_PROXY_USERNAME=$user + echo "export MDSD_PROXY_USERNAME=$MDSD_PROXY_USERNAME" >> ~/.bashrc + export MDSD_PROXY_PASSWORD_FILE=/opt/microsoft/docker-cimprov/proxy_password + echo "export MDSD_PROXY_PASSWORD_FILE=$MDSD_PROXY_PASSWORD_FILE" >> ~/.bashrc + #TODO: Compression + proxy creates a deserialization error in ODS. This needs a fix in MDSD - setGlobalEnvVar "MDSD_ODS_COMPRESSION_LEVEL" "0" + export MDSD_ODS_COMPRESSION_LEVEL=0 + echo "export MDSD_ODS_COMPRESSION_LEVEL=$MDSD_ODS_COMPRESSION_LEVEL" >> ~/.bashrc fi if [ ! -z "$PROXY_ENDPOINT" ]; then @@ -261,10 +277,14 @@ elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then CLOUD_ENVIRONMENT="ussec" fi -setGlobalEnvVar "CLOUD_ENVIRONMENT" $CLOUD_ENVIRONMENT +export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT +echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc + #consisten naming conventions with the windows -setGlobalEnvVar "DOMAIN" $domain -setGlobalEnvVar "WSID" $workspaceId +export DOMAIN=$domain +echo "export DOMAIN=$DOMAIN" >> ~/.bashrc +export WSID=$workspaceId +echo "export WSID=$WSID" >> ~/.bashrc # Check if the instrumentation key needs to be fetched from a storage account (as in airgapped clouds) if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSIGHTS_AUTH_URL has length >=1) @@ -280,24 +300,30 @@ if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSI # validate that the retrieved data is an instrumentation key if [[ $KEY =~ ^[A-Za-z0-9=]+$ ]]; then - setGlobalEnvVar "APPLICATIONINSIGHTS_AUTH" $(echo $KEY) + export APPLICATIONINSIGHTS_AUTH=$(echo $KEY) + echo "export APPLICATIONINSIGHTS_AUTH=$APPLICATIONINSIGHTS_AUTH" >> ~/.bashrc echo "Using cloud-specific instrumentation key" else # no ikey can be retrieved. Disable telemetry and continue - setGlobalEnvVar "DISABLE_TELEMETRY" "true" + export DISABLE_TELEMETRY=true + echo "export DISABLE_TELEMETRY=true" >> ~/.bashrc echo "Could not get cloud-specific instrumentation key (network error?). Disabling telemetry" fi fi + + aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -setGlobalEnvVar "TELEMETRY_APPLICATIONINSIGHTS_KEY" "$aikey" +export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +source ~/.bashrc if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then #Parse the configmap to set the right environment variables. /usr/bin/ruby2.6 tomlparser.rb cat config_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source config_env_var fi @@ -308,15 +334,15 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-agent-config.rb cat agent_config_env_var | while read line; do - echo $line >> /opt/env_vars - done + echo $line >> ~/.bashrc + done source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. /usr/bin/ruby2.6 tomlparser-npm-config.rb cat integration_npm_config_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source integration_npm_config_env_var fi @@ -333,23 +359,23 @@ fi if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then cat defaultpromenvvariables-sidecar | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source defaultpromenvvariables-sidecar else cat defaultpromenvvariables | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source defaultpromenvvariables fi else cat defaultpromenvvariables-rs | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source defaultpromenvvariables-rs fi -#Sourcing telemetry environment variable file if it exists +#Sourcing environment variable file if it exists. This file has telemetry and whether kubernetes pods are monitored if [ -e "prom_config_env_var" ]; then cat prom_config_env_var | while read line; do echo $line >> /opt/env_vars @@ -365,7 +391,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then #Sourcing config environment variable file if it exists if [ -e "side_car_fbit_config_env_var" ]; then cat side_car_fbit_config_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source side_car_fbit_config_env_var fi @@ -378,7 +404,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-mdm-metrics-config.rb cat config_mdm_metrics_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source config_mdm_metrics_env_var @@ -386,7 +412,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then /usr/bin/ruby2.6 tomlparser-metric-collection-config.rb cat config_metric_collection_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source config_metric_collection_env_var fi @@ -428,14 +454,18 @@ export NODE_NAME="" if [ "$cAdvisorIsSecure" = true ]; then echo "Wget request using port 10250 succeeded. Using 10250" - setGlobalEnvVar "IS_SECURE_CADVISOR_PORT" "true" - setGlobalEnvVar "CADVISOR_METRICS_URL" "https://$NODE_IP:10250/metrics" + export IS_SECURE_CADVISOR_PORT=true + echo "export IS_SECURE_CADVISOR_PORT=true" >> ~/.bashrc + export CADVISOR_METRICS_URL="https://$NODE_IP:10250/metrics" + echo "export CADVISOR_METRICS_URL=https://$NODE_IP:10250/metrics" >> ~/.bashrc echo "Making curl request to cadvisor endpoint /pods with port 10250 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s -k -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" https://$NODE_IP:10250/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') else echo "Wget request using port 10250 failed. Using port 10255" - setGlobalEnvVar "IS_SECURE_CADVISOR_PORT" "false" - setGlobalEnvVar "CADVISOR_METRICS_URL" "http://$NODE_IP:10255/metrics" + export IS_SECURE_CADVISOR_PORT=false + echo "export IS_SECURE_CADVISOR_PORT=false" >> ~/.bashrc + export CADVISOR_METRICS_URL="http://$NODE_IP:10255/metrics" + echo "export CADVISOR_METRICS_URL=http://$NODE_IP:10255/metrics" >> ~/.bashrc echo "Making curl request to cadvisor endpoint with port 10255 to get the configured container runtime on kubelet" podWithValidContainerId=$(curl -s http://$NODE_IP:10255/pods | jq -R 'fromjson? | [ .items[] | select( any(.status.phase; contains("Running")) ) ] | .[0]') fi @@ -456,30 +486,36 @@ if [ ! -z "$podWithValidContainerId" ]; then if [ -z "$nodeName" -o "$nodeName" == null ]; then echo "-e error nodeName in /pods API response is empty" else - setGlobalEnvVar NODE_NAME $nodeName + export NODE_NAME=$nodeName fi else echo "-e error either /pods API request failed or no running pods" fi echo "configured container runtime on kubelet is : "$CONTAINER_RUNTIME -setGlobalEnvVar "CONTAINER_RUNTIME" "$CONTAINER_RUNTIME" +echo "export CONTAINER_RUNTIME="$CONTAINER_RUNTIME >> ~/.bashrc -setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC" "kubelet_runtime_operations_total" -setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC" "kubelet_runtime_operations_errors_total" +export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="kubelet_runtime_operations_total" +echo "export KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_TOTAL_METRIC >> ~/.bashrc +export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="kubelet_runtime_operations_errors_total" +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_TOTAL_METRIC >> ~/.bashrc # default to docker metrics +export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_docker_operations" +export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_errors" + if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 - setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_runtime_operations" - setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_runtime_operations_errors" -else - setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_METRIC" "kubelet_docker_operations" - setGlobalEnvVar "KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC" "kubelet_docker_operations_errors" + export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" sudo setcap cap_sys_ptrace,cap_dac_read_search+ep /usr/bin/ruby2.6 +echo "export KUBELET_RUNTIME_OPERATIONS_METRIC="$KUBELET_RUNTIME_OPERATIONS_METRIC >> ~/.bashrc +echo "export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="$KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC >> ~/.bashrc + +source ~/.bashrc echo $NODE_NAME > /var/opt/microsoft/docker-cimprov/state/containerhostname #check if file was written successfully. @@ -493,14 +529,17 @@ dpkg -l | grep docker-cimprov | awk '{print $2 " " $3}' DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" -setGlobalEnvVar "DOCKER_CIMPROV_VERSION" "$DOCKER_CIMPROV_VERSION" +export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION +echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc #skip imds lookup since not used either legacy or aad msi auth path -setGlobalEnvVar "SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" "true" +export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc # this used by mdsd to determine cloud specific LA endpoints -setGlobalEnvVar "OMS_TLD" "$domain" +export OMS_TLD=$domain +echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" @@ -510,28 +549,46 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then echo "*** activating oneagent in aad auth msi mode ***" # msi auth specific args MDSD_AAD_MSI_AUTH_ARGS="-a -A" - setGlobalEnvVar "AAD_MSI_AUTH_MODE" "true" + export AAD_MSI_AUTH_MODE=true + echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc # this used by mdsd to determine the cloud specific AMCS endpoints - setGlobalEnvVar "customEnvironment" "$CLOUD_ENVIRONMENT" - setGlobalEnvVar "ENABLE_MCS" "true" - setGlobalEnvVar "MONITORING_USE_GENEVA_CONFIG_SERVICE" "false" - setGlobalEnvVar "MDSD_USE_LOCAL_PERSISTENCY" "false" + export customEnvironment=$CLOUD_ENVIRONMENT + echo "export customEnvironment=$customEnvironment" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc else - echo "*** activating oneagent in legacy auth mode ***" - setGlobalEnvVar "AAD_MSI_AUTH_MODE" "false" - #use the file path as its secure than env - setGlobalEnvVar "CIWORKSPACE_id" "$(cat /etc/omsagent-secret/WSID)" - setGlobalEnvVar "CIWORKSPACE_keyFile" "/etc/omsagent-secret/KEY" - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + echo "*** activating oneagent in legacy auth mode ***" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc fi -setGlobalEnvVar "MDSD_FLUENT_SOCKET_PORT" "29230" +source ~/.bashrc dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." - setGlobalEnvVar "MDSD_ROLE_PREFIX" "/var/run/mdsd-${CONTAINER_TYPE}/default" + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc mkdir /var/run/mdsd-${CONTAINER_TYPE} # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & @@ -639,11 +696,16 @@ else telemetry_cluster_type="AKS" fi -setGlobalEnvVar "TELEMETRY_AKS_RESOURCE_ID" "$telemetry_aks_resource_id" -setGlobalEnvVar "TELEMETRY_AKS_REGION" "$telemetry_aks_region" -setGlobalEnvVar "TELEMETRY_CLUSTER_NAME" "$telemetry_cluster_name" -setGlobalEnvVar "TELEMETRY_ACS_RESOURCE_NAME" "$telemetry_acs_resource_name" -setGlobalEnvVar "TELEMETRY_CLUSTER_TYPE" "$telemetry_cluster_type" +export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id +echo "export TELEMETRY_AKS_RESOURCE_ID=$telemetry_aks_resource_id" >> ~/.bashrc +export TELEMETRY_AKS_REGION=$telemetry_aks_region +echo "export TELEMETRY_AKS_REGION=$telemetry_aks_region" >> ~/.bashrc +export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name +echo "export TELEMETRY_CLUSTER_NAME=$telemetry_cluster_name" >> ~/.bashrc +export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name +echo "export TELEMETRY_ACS_RESOURCE_NAME=$telemetry_acs_resource_name" >> ~/.bashrc +export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type +echo "export TELEMETRY_CLUSTER_TYPE=$telemetry_cluster_type" >> ~/.bashrc #if [ ! -e "/etc/config/kube.conf" ]; then # nodename=$(cat /hostfs/etc/hostname) @@ -654,15 +716,20 @@ echo "nodename: $nodename" echo "replacing nodename in telegraf config" sed -i -e "s/placeholder_hostname/$nodename/g" $telegrafConfFile -setGlobalEnvVar "HOST_MOUNT_PREFIX" "/hostfs" -setGlobalEnvVar "HOST_PROC" "/hostfs/proc" -setGlobalEnvVar "HOST_SYS" "/hostfs/sys" -setGlobalEnvVar "HOST_ETC" "/hostfs/etc" -setGlobalEnvVar "HOST_VAR" "/hostfs/var" +export HOST_MOUNT_PREFIX=/hostfs +echo "export HOST_MOUNT_PREFIX=/hostfs" >> ~/.bashrc +export HOST_PROC=/hostfs/proc +echo "export HOST_PROC=/hostfs/proc" >> ~/.bashrc +export HOST_SYS=/hostfs/sys +echo "export HOST_SYS=/hostfs/sys" >> ~/.bashrc +export HOST_ETC=/hostfs/etc +echo "export HOST_ETC=/hostfs/etc" >> ~/.bashrc +export HOST_VAR=/hostfs/var +echo "export HOST_VAR=/hostfs/var" >> ~/.bashrc -if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then - if [ ! -e "/etc/config/kube.conf" ]; then - if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then +if [ ! -e "/etc/config/kube.conf" ]; then + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." waitforlisteneronTCPport 25229 30 else From 53a46fe391934cea7db2772e4889d1c76cb9808a Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 5 Jan 2022 16:35:06 -0800 Subject: [PATCH 04/21] fixed a mis-matched if/fi and indentation --- kubernetes/linux/main.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c8f0a177b..f47456973 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -335,7 +335,7 @@ if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then cat agent_config_env_var | while read line; do echo $line >> ~/.bashrc - done + done source agent_config_env_var #Parse the configmap to set the right environment variables for network policy manager (npm) integration. @@ -733,17 +733,20 @@ if [ ! -e "/etc/config/kube.conf" ]; then echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." waitforlisteneronTCPport 25229 30 else - echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." - waitforlisteneronTCPport 25226 30 - echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." - waitforlisteneronTCPport 25228 30 + echo "no metrics to scrape, not checking for listener on tcp #25229" fi else echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." waitforlisteneronTCPport 25226 30 + echo "checking for listener on tcp #25228 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25228 30 fi +else + echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." + waitforlisteneronTCPport 25226 30 fi + #start telegraf if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then /opt/telegraf --config $telegrafConfFile & From ba4b65ebb437b5a07c69a73c11daf76a0bd95882 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Wed, 5 Jan 2022 17:35:29 -0800 Subject: [PATCH 05/21] fixing some prints in main.sh --- kubernetes/linux/main.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index f47456973..d5b17b360 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -546,7 +546,7 @@ MDSD_AAD_MSI_AUTH_ARGS="" # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH export AAD_MSI_AUTH_MODE=false if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then - echo "*** activating oneagent in aad auth msi mode ***" + echo "*** setting up oneagent in aad auth msi mode ***" # msi auth specific args MDSD_AAD_MSI_AUTH_ARGS="-a -A" export AAD_MSI_AUTH_MODE=true @@ -563,7 +563,7 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then export MDSD_USE_LOCAL_PERSISTENCY="false" echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc else - echo "*** activating oneagent in legacy auth mode ***" + echo "*** setting up oneagent in legacy auth mode ***" CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" #use the file path as its secure than env CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" @@ -592,6 +592,8 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then mkdir /var/run/mdsd-${CONTAINER_TYPE} # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + else + echo "not starting mdsd (no metrics to scrape)" fi else echo "starting mdsd mode in main container..." @@ -656,10 +658,10 @@ fi #telegraf & fluentbit requirements if [ ! -e "/etc/config/kube.conf" ]; then if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & - telegrafConfFile="/etc/opt/microsoft/docker-cimprov/telegraf-prom-side-car.conf" else echo "not starting fluent-bit in prometheus sidecar (no metrics to scrape)" fi From 6efb508cc4daffde31e63f0bf03f4555c2865df9 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 18 Jan 2022 14:40:48 -0800 Subject: [PATCH 06/21] removing changes to container-azm-ms-agentconfig.yaml --- kubernetes/container-azm-ms-agentconfig.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index c3b010287..328acb201 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -34,14 +34,14 @@ data: # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] - # In the absense of this configmap, default value for enrich_container_logs is false + # In the absense of this configmap, default value for enrich_container_logs is false enabled = false - # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image + # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] - # In the absense of this configmap, default value for collect_all_kube_events is false + # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false - # When this is enabled (enabled = true), all kube events including normal events will be collected + # When this is enabled (enabled = true), all kube events including normal events will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -106,10 +106,10 @@ data: metric_collection_settings: |- # Metrics collection settings for metrics sent to Log Analytics and MDM [metric_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected enabled = false - # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization From b2773b8a27819f956858e8aea0c7a7725458d765 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 18 Jan 2022 14:46:00 -0800 Subject: [PATCH 07/21] re-removing special characters from container-azm-ms-agentconfig.yaml --- kubernetes/container-azm-ms-agentconfig.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 328acb201..c3b010287 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -34,14 +34,14 @@ data: # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] - # In the absense of this configmap, default value for enrich_container_logs is false + # In the absense of this configmap, default value for enrich_container_logs is false enabled = false - # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image + # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] - # In the absense of this configmap, default value for collect_all_kube_events is false + # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false - # When this is enabled (enabled = true), all kube events including normal events will be collected + # When this is enabled (enabled = true), all kube events including normal events will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -106,10 +106,10 @@ data: metric_collection_settings: |- # Metrics collection settings for metrics sent to Log Analytics and MDM [metric_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected enabled = false - # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization From 776b8539ebc6cd42fad4af9feb57ebbaa55780c5 Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 18 Jan 2022 17:39:13 -0800 Subject: [PATCH 08/21] changed a comment --- .../scripts/tomlparser-prom-customconfig.rb | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb index 33a3d611e..1f3ba6759 100644 --- a/build/common/installer/scripts/tomlparser-prom-customconfig.rb +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -323,7 +323,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) File.open(file_name, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" - #Set environment variables for telemetry in the sidecar container + #Set environment variables in the sidecar container if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) file = File.open("prom_config_env_var", "w") if !file.nil? diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index c3b010287..328acb201 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -34,14 +34,14 @@ data: # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] - # In the absense of this configmap, default value for enrich_container_logs is false + # In the absense of this configmap, default value for enrich_container_logs is false enabled = false - # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image + # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] - # In the absense of this configmap, default value for collect_all_kube_events is false + # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false - # When this is enabled (enabled = true), all kube events including normal events will be collected + # When this is enabled (enabled = true), all kube events including normal events will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -106,10 +106,10 @@ data: metric_collection_settings: |- # Metrics collection settings for metrics sent to Log Analytics and MDM [metric_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected enabled = false - # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization From 64a3291852f64b011403acf81d5bdd5937c7ef9c Mon Sep 17 00:00:00 2001 From: David Michelman Date: Tue, 18 Jan 2022 17:43:16 -0800 Subject: [PATCH 09/21] making comment more specific --- build/common/installer/scripts/tomlparser-prom-customconfig.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb index 1f3ba6759..15cf2d3c2 100644 --- a/build/common/installer/scripts/tomlparser-prom-customconfig.rb +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -323,7 +323,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) File.open(file_name, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" - #Set environment variables in the sidecar container + #Set environment variables for configuration and telemetry in the sidecar container if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) file = File.open("prom_config_env_var", "w") if !file.nil? From be7dc15c3ddb75533c95c38e4d52a09c8e5d7a94 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Mon, 16 May 2022 12:01:09 -0700 Subject: [PATCH 10/21] Taking only necessary changes --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- kubernetes/linux/main.sh | 22 +- kubernetes/omsagent.yaml | 370 ++++++++++--------- 3 files changed, 202 insertions(+), 192 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 328acb201..ce3ce03b6 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -69,7 +69,7 @@ data: # set this to `https` & most likely set the tls config. # - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. # - prometheus.io/port: If port is not 9102 use this annotation - monitor_kubernetes_pods = false + monitor_kubernetes_pods = true ## Restricts Kubernetes monitoring to namespaces for pods that have annotations set and are scraped using the monitor_kubernetes_pods setting. ## This will take effect when monitor_kubernetes_pods is set to true diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index d5b17b360..d869853ce 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -3,11 +3,11 @@ # please use this instead of adding env vars to bashrc directly # usage: setGlobalEnvVar ENABLE_SIDECAR_SCRAPING true -setGlobalEnvVar() { - export "$1"="$2" - echo "export \"$1\"=\"$2\"" >> /opt/env_vars -} -echo "source /opt/env_vars" >> ~/.bashrc +# setGlobalEnvVar() { +# export "$1"="$2" +# echo "export \"$1\"=\"$2\"" >> /opt/env_vars +# } +# echo "source /opt/env_vars" >> ~/.bashrc waitforlisteneronTCPport() { local sleepdurationsecs=1 @@ -378,7 +378,7 @@ fi #Sourcing environment variable file if it exists. This file has telemetry and whether kubernetes pods are monitored if [ -e "prom_config_env_var" ]; then cat prom_config_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source prom_config_env_var fi @@ -424,7 +424,7 @@ if [[ ( ( ! -e "/etc/config/kube.conf" ) && ( "${CONTAINER_TYPE}" == "Prometheus if [ -e "integration_osm_config_env_var" ]; then cat integration_osm_config_env_var | while read line; do - echo $line >> /opt/env_vars + echo $line >> ~/.bashrc done source integration_osm_config_env_var fi @@ -434,9 +434,13 @@ fi if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then - setGlobalEnvVar MUTE_PROM_SIDECAR true + export "MUTE_PROM_SIDECAR"="true" + echo "export \"MUTE_PROM_SIDECAR\"=\"true\"" >> ~/.bashrc + # setGlobalEnvVar MUTE_PROM_SIDECAR true else - setGlobalEnvVar MUTE_PROM_SIDECAR false + export "MUTE_PROM_SIDECAR"="false" + echo "export \"MUTE_PROM_SIDECAR\"=\"false\"" >> ~/.bashrc + # setGlobalEnvVar MUTE_PROM_SIDECAR false fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index a1a843196..d4a1a6a95 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -27,9 +27,11 @@ rules: - apiGroups: ["apps", "extensions", "autoscaling"] resources: ["replicasets", "deployments", "horizontalpodautoscalers"] verbs: ["list"] - - apiGroups: ["azmon.container.insights"] - resources: ["healthstates"] - verbs: ["get", "create", "patch"] + # Uncomment below lines for MSI Auth Mode testing + # - apiGroups: [""] + # resources: ["secrets"] + # resourceNames: [ "omsagent-aad-msi-token" ] + # verbs: ["get", "watch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- @@ -51,14 +53,6 @@ apiVersion: v1 data: kube.conf: |- # Fluentd config file for OMS Docker - cluster components (kubeAPI) - #fluent forward plugin - - type forward - port "#{ENV['HEALTHMODEL_REPLICASET_SERVICE_SERVICE_PORT']}" - bind 0.0.0.0 - chunk_size_limit 4m - - #Kubernetes pod inventory type kubepodinventory @@ -66,7 +60,6 @@ data: run_interval 60 log_level debug - #Kubernetes Persistent Volume inventory type kubepvinventory @@ -74,7 +67,6 @@ data: run_interval 60 log_level debug - #Kubernetes events type kubeevents @@ -82,7 +74,6 @@ data: run_interval 60 log_level debug - #Kubernetes Nodes type kubenodeinventory @@ -90,15 +81,6 @@ data: run_interval 60 log_level debug - - #Kubernetes health - - type kubehealth - tag kubehealth.ReplicaSet - run_interval 60 - log_level debug - - #cadvisor perf- Windows nodes type wincadvisorperf @@ -106,7 +88,6 @@ data: run_interval 60 log_level debug - #Kubernetes object state - deployments type kubestatedeployments @@ -114,7 +95,6 @@ data: run_interval 60 log_level debug - #Kubernetes object state - HPA type kubestatehpa @@ -122,24 +102,16 @@ data: run_interval 60 log_level debug - type filter_inventory2mdm log_level info - #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info - - #health model aggregation filter - - type filter_health_model_builder - - type out_oms log_level debug @@ -154,7 +126,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -169,7 +140,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -184,7 +154,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -199,7 +168,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -214,7 +182,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -228,7 +195,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_oms log_level debug @@ -243,7 +209,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_mdm log_level debug @@ -259,7 +224,6 @@ data: max_retry_wait 5m retry_mdm_post_wait_minutes 30 - type out_oms log_level debug @@ -274,7 +238,6 @@ data: retry_wait 5s max_retry_wait 5m - type out_mdm log_level debug @@ -290,22 +253,6 @@ data: max_retry_wait 5m retry_mdm_post_wait_minutes 30 - - - type out_oms - log_level debug - num_threads 5 - buffer_chunk_limit 4m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_kubehealth*.buffer - buffer_queue_limit 20 - buffer_queue_full_action drop_oldest_chunk - flush_interval 20s - retry_limit 10 - retry_wait 5s - max_retry_wait 5m - - type out_oms log_level debug @@ -320,7 +267,6 @@ data: retry_wait 5s max_retry_wait 5m - metadata: name: omsagent-rs-config namespace: kube-system @@ -333,9 +279,19 @@ metadata: type: Opaque data: #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") - WSID: "VALUE_WSID" - KEY: "VALUE_KEY" + WSID: "ZjZmZjU5MmQtNGQ3OS00MDJhLWIxZTEtMzMwNTY4ZmYwNmE4Cg==" + KEY: "MFRwU05OdnE2ZEtpalE3UUpDWE9abWxKMHRwdlZhNUlNMm5TVlRVUXRQZkxnQ0tZREpmVXNpTU9wK2xWa29RdEkySExzWGV5L1lNc1psNG1EUnR5Qnc9PQo=" --- +# Uncomment below lines for MSI Auth Mode testing +# apiVersion: v1 +# kind: Secret +# metadata: +# name: omsagent-aad-msi-token +# namespace: kube-system +# type: Opaque +# data: +# token: "VALUE_MSI_TOKEN" +# --- apiVersion: apps/v1 kind: DaemonSet metadata: @@ -357,7 +313,7 @@ spec: component: oms-agent tier: node annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -367,8 +323,43 @@ spec: - name: ndots value: "3" containers: + # Uncomment below lines for MSI Auth Mode testing + # - name: addon-token-adapter + # command: + # - /addon-token-adapter + # args: + # - --secret-namespace=kube-system + # - --secret-name=omsagent-aad-msi-token + # - --token-server-listening-port=8888 + # - --health-server-listening-port=9999 + # # Make sure this matching with version in AKS RP side + # image: mcr.microsoft.com/aks/msi/addon-token-adapter:master.220318.3 + # imagePullPolicy: IfNotPresent + # env: + # - name: AZMON_COLLECT_ENV + # value: "false" + # livenessProbe: + # httpGet: + # path: /healthz + # port: 9999 + # initialDelaySeconds: 10 + # periodSeconds: 60 + # resources: + # limits: + # cpu: 500m + # memory: 500Mi + # requests: + # cpu: 100m + # memory: 100Mi + # securityContext: + # capabilities: + # drop: + # - ALL + # add: + # - NET_ADMIN + # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" imagePullPolicy: IfNotPresent resources: limits: @@ -386,9 +377,9 @@ spec: value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID_VALUE" + value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" - name: AKS_REGION - value: "VALUE_AKS_RESOURCE_REGION_VALUE" + value: "eastus" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" @@ -403,11 +394,12 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" + value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS value: "koreacentral,norwayeast,eastus2" - - name: USING_AAD_MSI_AUTH - value: "false" + # Uncomment below lines for MSI Auth Mode testing + # - name: USING_AAD_MSI_AUTH + # value: "true" securityContext: privileged: true ports: @@ -453,65 +445,66 @@ spec: periodSeconds: 60 timeoutSeconds: 15 #Only in sidecar scraping mode - # - name: omsagent-prometheus - # image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" - # imagePullPolicy: IfNotPresent - # resources: - # limits: - # cpu: 500m - # memory: 1Gi - # requests: - # cpu: 75m - # memory: 225Mi - # env: - # # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - # - name: AKS_CLUSTER_NAME - # value: "VALUE_AKS_CLUSTER_NAME" - # - name: AKS_RESOURCE_ID - # value: "VALUE_AKS_RESOURCE_ID_VALUE" - # - name: AKS_REGION - # value: "VALUE_AKS_RESOURCE_REGION_VALUE" - # - name: AKS_NODE_RESOURCE_GROUP - # value: "VALUE_AKS_NODE_RESOURCE_GROUP" - # #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters - # #- name: ACS_RESOURCE_NAME - # # value: "my_acs_cluster_name" - # - name: CONTAINER_TYPE - # value: "PrometheusSidecar" - # - name: CONTROLLER_TYPE - # value: "DaemonSet" - # - name: NODE_IP - # valueFrom: - # fieldRef: - # fieldPath: status.hostIP - # # Update this with the user assigned msi client id for omsagent - # - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - # value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" - # - name: USING_AAD_MSI_AUTH - # value: "false" - # securityContext: - # privileged: true - # volumeMounts: - # - mountPath: /etc/kubernetes/host - # name: azure-json-path - # - mountPath: /etc/omsagent-secret - # name: omsagent-secret - # readOnly: true - # - mountPath: /etc/config/settings - # name: settings-vol-config - # readOnly: true - # - mountPath: /etc/config/osm-settings - # name: osm-settings-vol-config - # readOnly: true - # livenessProbe: - # exec: - # command: - # - /bin/bash - # - -c - # - /opt/livenessprobe.sh - # initialDelaySeconds: 60 - # periodSeconds: 60 - # timeoutSeconds: 15 + - name: omsagent-prometheus + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: 500m + memory: 1Gi + requests: + cpu: 75m + memory: 225Mi + env: + # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these + - name: AKS_CLUSTER_NAME + value: "VALUE_AKS_CLUSTER_NAME" + - name: AKS_RESOURCE_ID + value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" + - name: AKS_REGION + value: "eastus" + - name: AKS_NODE_RESOURCE_GROUP + value: "VALUE_AKS_NODE_RESOURCE_GROUP" + #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters + #- name: ACS_RESOURCE_NAME + # value: "my_acs_cluster_name" + - name: CONTAINER_TYPE + value: "PrometheusSidecar" + - name: CONTROLLER_TYPE + value: "DaemonSet" + - name: NODE_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP + # Update this with the user assigned msi client id for omsagent + - name: USER_ASSIGNED_IDENTITY_CLIENT_ID + value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" + # Uncomment below lines for MSI Auth Mode testing + # - name: USING_AAD_MSI_AUTH + # value: "true" + securityContext: + privileged: true + volumeMounts: + - mountPath: /etc/kubernetes/host + name: azure-json-path + - mountPath: /etc/omsagent-secret + name: omsagent-secret + readOnly: true + - mountPath: /etc/config/settings + name: settings-vol-config + readOnly: true + - mountPath: /etc/config/osm-settings + name: osm-settings-vol-config + readOnly: true + livenessProbe: + exec: + command: + - /bin/bash + - -c + - /opt/livenessprobe.sh + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 15 affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -596,14 +589,49 @@ spec: labels: rsName: "omsagent-rs" annotations: - agentVersion: "1.10.0.1" + agentVersion: "azure-mdsd-1.17.0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: serviceAccountName: omsagent containers: + # Uncomment below lines for MSI Auth Mode testing + # - name: addon-token-adapter + # command: + # - /addon-token-adapter + # args: + # - --secret-namespace=kube-system + # - --secret-name=omsagent-aad-msi-token + # - --token-server-listening-port=8888 + # - --health-server-listening-port=9999 + # # Make sure this matching with version in AKS RP side + # image: mcr.microsoft.com/aks/msi/addon-token-adapter:master.220318.3 + # imagePullPolicy: IfNotPresent + # env: + # - name: AZMON_COLLECT_ENV + # value: "false" + # livenessProbe: + # httpGet: + # path: /healthz + # port: 9999 + # initialDelaySeconds: 10 + # periodSeconds: 60 + # resources: + # limits: + # cpu: 500m + # memory: 500Mi + # requests: + # cpu: 100m + # memory: 100Mi + # securityContext: + # capabilities: + # drop: + # - ALL + # add: + # - NET_ADMIN + # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" imagePullPolicy: IfNotPresent resources: limits: @@ -614,9 +642,9 @@ spec: memory: 250Mi env: - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID_VALUE" + value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" - name: AKS_REGION - value: "VALUE_AKS_RESOURCE_REGION_VALUE" + value: "eastus" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" @@ -631,12 +659,13 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" + value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" # Add the below environment variable to true only in sidecar enabled regions, else set it to false - name: SIDECAR_SCRAPING_ENABLED - value: "false" - - name: USING_AAD_MSI_AUTH - value: "false" + value: "true" + # Uncomment below lines for MSI Auth Mode testing + # - name: USING_AAD_MSI_AUTH + # value: "true" securityContext: privileged: true ports: @@ -644,9 +673,6 @@ spec: protocol: TCP - containerPort: 25224 protocol: UDP - - containerPort: 25227 - protocol: TCP - name: in-rs-tcp volumeMounts: - mountPath: /var/run/host name: docker-sock @@ -765,7 +791,7 @@ spec: component: oms-agent-win tier: node-win annotations: - agentVersion: "1.10.0.1" + agentVersion: "0.0.0-0" dockerProviderVersion: "16.0.0-0" schema-versions: "v1" spec: @@ -776,18 +802,24 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod10132021" + image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" imagePullPolicy: IfNotPresent resources: limits: - cpu: 200m + cpu: 500m memory: 600Mi env: + - name: FBIT_SERVICE_FLUSH_INTERVAL + value: "15" + - name: FBIT_TAIL_BUFFER_CHUNK_SIZE + value: "1" + - name: FBIT_TAIL_BUFFER_MAX_SIZE + value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID - value: "VALUE_AKS_RESOURCE_ID_VALUE" + value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/tests" - name: AKS_REGION - value: "VALUE_AKS_RESOURCE_REGION_VALUE" + value: "eastus" #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" - name: CONTROLLER_TYPE @@ -805,13 +837,16 @@ spec: fieldRef: fieldPath: status.hostIP - name: SIDECAR_SCRAPING_ENABLED - value: "false" + value: "true" # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" + value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" # Add this only for clouds that require cert bootstrapping # - name: REQUIRES_CERT_BOOTSTRAP # value: "true" + # Uncomment below lines for MSI Auth Mode testing + # - name: USING_AAD_MSI_AUTH + # value: "true" volumeMounts: - mountPath: C:\ProgramData\docker\containers name: docker-windows-containers @@ -834,6 +869,10 @@ spec: - mountPath: C:\etc\kubernetes\host name: azure-json-path readOnly: true + # Uncomment below lines for MSI Auth Mode testing + # - mountPath: C:\etc\IMDS-access-token + # name: imds-token + # readOnly: true livenessProbe: exec: command: @@ -891,40 +930,7 @@ spec: secret: secretName: omsagent-adx-secret optional: true ---- -kind: Service -apiVersion: v1 -metadata: - name: healthmodel-replicaset-service - namespace: kube-system -spec: - selector: - rsName: "omsagent-rs" - ports: - - protocol: TCP - port: 25227 - targetPort: in-rs-tcp ---- -# this is for versions >=1.19, for versions <1.19 we continue to use v1beta1 -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: healthstates.azmon.container.insights - namespace: kube-system -spec: - group: azmon.container.insights - versions: - - name: v1 - served: true - storage: true - schema: - openAPIV3Schema: - type: object - properties: - state: - type: string - scope: Namespaced - names: - plural: healthstates - kind: HealthState - \ No newline at end of file + # Uncomment below lines for MSI Auth Mode testing + # - name: imds-token + # secret: + # secretName: omsagent-aad-msi-token From f5d7f13383e8cae824d116f8db6bf702c4a47e81 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 17 May 2022 15:06:36 -0700 Subject: [PATCH 11/21] Muting all the processes if no prometheus monitoring --- build/linux/installer/scripts/livenessprobe.sh | 3 +-- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- kubernetes/linux/main.sh | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index e74da46b3..d8b1d500b 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -1,5 +1,4 @@ -#!/bin/bash -source /opt/env_vars +#!/bin/bash if [ -s "inotifyoutput.txt" ] then diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index ce3ce03b6..328acb201 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -69,7 +69,7 @@ data: # set this to `https` & most likely set the tls config. # - prometheus.io/path: If the metrics path is not /metrics, define it with this annotation. # - prometheus.io/port: If port is not 9102 use this annotation - monitor_kubernetes_pods = true + monitor_kubernetes_pods = false ## Restricts Kubernetes monitoring to namespaces for pods that have annotations set and are scraped using the monitor_kubernetes_pods setting. ## This will take effect when monitor_kubernetes_pods is set to true diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index bfeae6779..065b977aa 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -476,11 +476,11 @@ if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then export "MUTE_PROM_SIDECAR"="true" - echo "export \"MUTE_PROM_SIDECAR\"=\"true\"" >> ~/.bashrc + echo "export MUTE_PROM_SIDECAR=true" >> ~/.bashrc # setGlobalEnvVar MUTE_PROM_SIDECAR true else export "MUTE_PROM_SIDECAR"="false" - echo "export \"MUTE_PROM_SIDECAR\"=\"false\"" >> ~/.bashrc + echo "export MUTE_PROM_SIDECAR=false" >> ~/.bashrc # setGlobalEnvVar MUTE_PROM_SIDECAR false fi From 6f92bb38ddc78385016fcc2fc91cbbc307740d2e Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 17 May 2022 15:10:02 -0700 Subject: [PATCH 12/21] Changes to omsagent.yaml --- kubernetes/omsagent.yaml | 57 +++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index d4a1a6a95..a8d193a31 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -60,6 +60,7 @@ data: run_interval 60 log_level debug + #Kubernetes Persistent Volume inventory type kubepvinventory @@ -67,6 +68,7 @@ data: run_interval 60 log_level debug + #Kubernetes events type kubeevents @@ -74,6 +76,7 @@ data: run_interval 60 log_level debug + #Kubernetes Nodes type kubenodeinventory @@ -81,6 +84,7 @@ data: run_interval 60 log_level debug + #cadvisor perf- Windows nodes type wincadvisorperf @@ -88,6 +92,7 @@ data: run_interval 60 log_level debug + #Kubernetes object state - deployments type kubestatedeployments @@ -95,6 +100,7 @@ data: run_interval 60 log_level debug + #Kubernetes object state - HPA type kubestatehpa @@ -102,16 +108,19 @@ data: run_interval 60 log_level debug + type filter_inventory2mdm log_level info + #custom_metrics_mdm filter plugin for perf data from windows nodes type filter_cadvisor2mdm metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info + type out_oms log_level debug @@ -126,6 +135,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -140,6 +150,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -154,6 +165,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -168,6 +180,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -182,6 +195,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -195,6 +209,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_oms log_level debug @@ -209,6 +224,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_mdm log_level debug @@ -224,6 +240,7 @@ data: max_retry_wait 5m retry_mdm_post_wait_minutes 30 + type out_oms log_level debug @@ -238,6 +255,7 @@ data: retry_wait 5s max_retry_wait 5m + type out_mdm log_level debug @@ -253,6 +271,7 @@ data: max_retry_wait 5m retry_mdm_post_wait_minutes 30 + type out_oms log_level debug @@ -267,6 +286,7 @@ data: retry_wait 5s max_retry_wait 5m + metadata: name: omsagent-rs-config namespace: kube-system @@ -279,8 +299,8 @@ metadata: type: Opaque data: #BASE64 ENCODED (Both WSID & KEY) INSIDE DOUBLE QUOTE ("") - WSID: "ZjZmZjU5MmQtNGQ3OS00MDJhLWIxZTEtMzMwNTY4ZmYwNmE4Cg==" - KEY: "MFRwU05OdnE2ZEtpalE3UUpDWE9abWxKMHRwdlZhNUlNMm5TVlRVUXRQZkxnQ0tZREpmVXNpTU9wK2xWa29RdEkySExzWGV5L1lNc1psNG1EUnR5Qnc9PQo=" + WSID: "VALUE_WSID" + KEY: "VALUE_KEY" --- # Uncomment below lines for MSI Auth Mode testing # apiVersion: v1 @@ -359,7 +379,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03172022" imagePullPolicy: IfNotPresent resources: limits: @@ -377,9 +397,9 @@ spec: value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID - value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" + value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION - value: "eastus" + value: "VALUE_AKS_RESOURCE_REGION_VALUE" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" @@ -394,7 +414,7 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" - name: AZMON_CONTAINERLOGS_ONEAGENT_REGIONS value: "koreacentral,norwayeast,eastus2" # Uncomment below lines for MSI Auth Mode testing @@ -446,7 +466,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03172022" imagePullPolicy: IfNotPresent resources: limits: @@ -460,9 +480,9 @@ spec: - name: AKS_CLUSTER_NAME value: "VALUE_AKS_CLUSTER_NAME" - name: AKS_RESOURCE_ID - value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" + value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION - value: "eastus" + value: "VALUE_AKS_RESOURCE_REGION_VALUE" - name: AKS_NODE_RESOURCE_GROUP value: "VALUE_AKS_NODE_RESOURCE_GROUP" #Uncomment below two lines for ACS clusters and set the cluster names manually. Also comment out the above two lines for ACS clusters @@ -478,7 +498,7 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # Uncomment below lines for MSI Auth Mode testing # - name: USING_AAD_MSI_AUTH # value: "true" @@ -501,6 +521,7 @@ spec: command: - /bin/bash - -c + - -i - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 @@ -631,7 +652,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod03172022" imagePullPolicy: IfNotPresent resources: limits: @@ -642,9 +663,9 @@ spec: memory: 250Mi env: - name: AKS_RESOURCE_ID - value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/test" + value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION - value: "eastus" + value: "VALUE_AKS_RESOURCE_REGION_VALUE" # this used for e2e test and setting this just emits some additional log statements which used for the e2e tests - name: ISTEST value: "true" @@ -659,7 +680,7 @@ spec: fieldPath: status.hostIP # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # Add the below environment variable to true only in sidecar enabled regions, else set it to false - name: SIDECAR_SCRAPING_ENABLED value: "true" @@ -802,7 +823,7 @@ spec: value: "3" containers: - name: omsagent-win - image: "mcr.microsoft.com/azuremonitor/containerinsights/cidev:prom_mem_1" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod03172022" imagePullPolicy: IfNotPresent resources: limits: @@ -817,9 +838,9 @@ spec: value: "1" # azure devops pipeline uses AKS_RESOURCE_ID and AKS_REGION hence ensure to uncomment these - name: AKS_RESOURCE_ID - value: "/subscriptions/81895b44-c46d-44c3-9368-42f0386af28e/resourceGroups/marinerv2/providers/Microsoft.ContainerService/managedClusters/tests" + value: "VALUE_AKS_RESOURCE_ID_VALUE" - name: AKS_REGION - value: "eastus" + value: "VALUE_AKS_RESOURCE_REGION_VALUE" #- name: ACS_RESOURCE_NAME # value: "my_acs_cluster_name" - name: CONTROLLER_TYPE @@ -840,7 +861,7 @@ spec: value: "true" # Update this with the user assigned msi client id for omsagent - name: USER_ASSIGNED_IDENTITY_CLIENT_ID - value: "2d157884-9578-44d1-b4ec-4211a1c9ed67" + value: "VALUE_USER_ASSIGNED_IDENTITY_CLIENT_ID_VALUE" # Add this only for clouds that require cert bootstrapping # - name: REQUIRES_CERT_BOOTSTRAP # value: "true" From 250e57ad4c627b7e632036805bdbba5b64a63283 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 17 May 2022 16:19:52 -0700 Subject: [PATCH 13/21] Removing unncessary changes --- build/linux/installer/datafiles/base_container.data | 1 - kubernetes/linux/main.sh | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 1c15e3ce6..7dcbde31f 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -55,7 +55,6 @@ MAINTAINER: 'Microsoft Corporation' /opt/tomlparser-npm-config.rb; build/linux/installer/scripts/tomlparser-npm-config.rb; 755; root; root /opt/tomlparser-osm-config.rb; build/linux/installer/scripts/tomlparser-osm-config.rb; 755; root; root /opt/test.json; build/linux/installer/conf/test.json; 644; root; root -/opt/env_vars; build/linux/installer/conf/env_vars; 644; root; root /etc/fluent/plugin/lib/application_insights/version.rb; source/plugins/ruby/lib/application_insights/version.rb; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 065b977aa..561143541 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,14 +1,5 @@ #!/bin/bash - -# please use this instead of adding env vars to bashrc directly -# usage: setGlobalEnvVar ENABLE_SIDECAR_SCRAPING true -# setGlobalEnvVar() { -# export "$1"="$2" -# echo "export \"$1\"=\"$2\"" >> /opt/env_vars -# } -# echo "source /opt/env_vars" >> ~/.bashrc - waitforlisteneronTCPport() { local sleepdurationsecs=1 local totalsleptsecs=0 @@ -477,11 +468,9 @@ if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then export "MUTE_PROM_SIDECAR"="true" echo "export MUTE_PROM_SIDECAR=true" >> ~/.bashrc - # setGlobalEnvVar MUTE_PROM_SIDECAR true else export "MUTE_PROM_SIDECAR"="false" echo "export MUTE_PROM_SIDECAR=false" >> ~/.bashrc - # setGlobalEnvVar MUTE_PROM_SIDECAR false fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request From 79f1f5d73a1801269873cad263a6072bdd1c96e2 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Thu, 19 May 2022 11:03:38 -0700 Subject: [PATCH 14/21] Addig /opt/env_vars --- build/linux/installer/scripts/livenessprobe.sh | 1 + kubernetes/linux/main.sh | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index d8b1d500b..98e7a1019 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -1,4 +1,5 @@ #!/bin/bash +source /opt/env_vars if [ -s "inotifyoutput.txt" ] then diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 561143541..c2ec87b6a 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -1,5 +1,13 @@ #!/bin/bash +# please use this instead of adding env vars to bashrc directly +# usage: setGlobalEnvVar ENABLE_SIDECAR_SCRAPING true +setGlobalEnvVar() { + export "$1"="$2" + echo "export \"$1\"=\"$2\"" >> /opt/env_vars +} +echo "source /opt/env_vars" >> ~/.bashrc + waitforlisteneronTCPport() { local sleepdurationsecs=1 local totalsleptsecs=0 @@ -466,11 +474,13 @@ fi if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then - export "MUTE_PROM_SIDECAR"="true" - echo "export MUTE_PROM_SIDECAR=true" >> ~/.bashrc + # export MUTE_PROM_SIDECAR="true" + # echo "export MUTE_PROM_SIDECAR=true" >> ~/.bashrc + setGlobalEnvVar MUTE_PROM_SIDECAR true else - export "MUTE_PROM_SIDECAR"="false" - echo "export MUTE_PROM_SIDECAR=false" >> ~/.bashrc + # export MUTE_PROM_SIDECAR="false" + # echo "export MUTE_PROM_SIDECAR=false" >> ~/.bashrc + setGlobalEnvVar MUTE_PROM_SIDECAR false fi #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request From 38c777a41b0d19043e4e0044a7db7bec63c5cb92 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Mon, 23 May 2022 14:12:52 -0700 Subject: [PATCH 15/21] Removing unnecessary files --- build/linux/installer/conf/env_vars | 0 kubernetes/linux/main.sh | 4 ---- kubernetes/omsagent.yaml | 1 - 3 files changed, 5 deletions(-) delete mode 100644 build/linux/installer/conf/env_vars diff --git a/build/linux/installer/conf/env_vars b/build/linux/installer/conf/env_vars deleted file mode 100644 index e69de29bb..000000000 diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index c2ec87b6a..7e5cb0c88 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -474,12 +474,8 @@ fi if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then - # export MUTE_PROM_SIDECAR="true" - # echo "export MUTE_PROM_SIDECAR=true" >> ~/.bashrc setGlobalEnvVar MUTE_PROM_SIDECAR true else - # export MUTE_PROM_SIDECAR="false" - # echo "export MUTE_PROM_SIDECAR=false" >> ~/.bashrc setGlobalEnvVar MUTE_PROM_SIDECAR false fi diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index d5cad8494..9e40e0a9c 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -521,7 +521,6 @@ spec: command: - /bin/bash - -c - - -i - /opt/livenessprobe.sh initialDelaySeconds: 60 periodSeconds: 60 From 0d347cc0f30fe098b2b96522f9f4740b1e517981 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Mon, 23 May 2022 14:23:29 -0700 Subject: [PATCH 16/21] Irrelevant change --- kubernetes/omsagent.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 9e40e0a9c..e5f0b566b 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -954,3 +954,4 @@ spec: # - name: imds-token # secret: # secretName: omsagent-aad-msi-token + From afdd4d1ee53ecff041688b2e337f916690db6b41 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Mon, 23 May 2022 14:46:09 -0700 Subject: [PATCH 17/21] Irrelevant change --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e5f0b566b..4e021e1b8 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -954,4 +954,4 @@ spec: # - name: imds-token # secret: # secretName: omsagent-aad-msi-token - + From 21948abd4e6044f3d0a5be3f22c73b0115f43094 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 24 May 2022 11:58:51 -0700 Subject: [PATCH 18/21] Resolving comments - Adding MUTE_PROM_SIDECAR in logs and also found a missing name change --- kubernetes/linux/main.sh | 8 ++++---- source/plugins/go/src/telemetry.go | 2 +- sudo | 0 3 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 sudo diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 7e5cb0c88..4f303bbf5 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -645,7 +645,7 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & else - echo "not starting mdsd (no metrics to scrape)" + echo "not starting mdsd (no metrics to scrape since MUTE_PROM_SIDECAR is true)" fi else echo "starting mdsd mode in main container..." @@ -682,7 +682,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then fi echo "****************End Telegraf Run in Test Mode**************************" else - echo "****************Skipping Telegraf Run in Test Mode**************************" + echo "****************Skipping Telegraf Run in Test Mode since MUTE_PROM_SIDECAR is true**************************" fi else if [ -e "/opt/telegraf-test.conf" ]; then @@ -715,7 +715,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then echo "starting fluent-bit and setting telegraf conf file for prometheus sidecar" /opt/td-agent-bit/bin/td-agent-bit -c /etc/opt/microsoft/docker-cimprov/td-agent-bit-prom-side-car.conf -e /opt/td-agent-bit/bin/out_oms.so & else - echo "not starting fluent-bit in prometheus sidecar (no metrics to scrape)" + echo "not starting fluent-bit in prometheus sidecar (no metrics to scrape since MUTE_PROM_SIDECAR is true)" fi else echo "starting fluent-bit and setting telegraf conf file for daemonset" @@ -787,7 +787,7 @@ if [ ! -e "/etc/config/kube.conf" ]; then echo "checking for listener on tcp #25229 and waiting for 30 secs if not.." waitforlisteneronTCPport 25229 30 else - echo "no metrics to scrape, not checking for listener on tcp #25229" + echo "no metrics to scrape since MUTE_PROM_SIDECAR is true, not checking for listener on tcp #25229" fi else echo "checking for listener on tcp #25226 and waiting for 30 secs if not.." diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index 3fdc465a1..a0615b2f9 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -362,7 +362,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { // Getting the namespace count, monitor kubernetes pods values and namespace count once at start because it wont change unless the configmap is applied and the container is restarted OSMNamespaceCount = 0 - osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") + osmNsCount := os.Getenv("OSM_CONFIGURATION_NAMESPACES_COUNT") if osmNsCount != "" { OSMNamespaceCount, err = strconv.Atoi(osmNsCount) if err != nil { diff --git a/sudo b/sudo new file mode 100644 index 000000000..e69de29bb From 66cf8c059e2428cf4eac1e8c80a4d9ec4cefdf05 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 24 May 2022 17:21:12 -0700 Subject: [PATCH 19/21] Delete unnecessary file --- sudo | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 sudo diff --git a/sudo b/sudo deleted file mode 100644 index e69de29bb..000000000 From c85341e58368d34635ef04eefb41b5dffbc105b1 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Tue, 31 May 2022 15:38:32 -0700 Subject: [PATCH 20/21] Adding MUTE_PROM_SIDECAR telemetry to main.sh Removing unnecessary variable name changes Adding additional logs --- .../scripts/tomlparser-prom-customconfig.rb | 8 ++++---- build/linux/installer/scripts/livenessprobe.sh | 2 +- .../installer/scripts/tomlparser-osm-config.rb | 2 +- kubernetes/linux/main.sh | 16 ++++++++++------ source/plugins/go/src/telemetry.go | 4 ++-- source/plugins/ruby/in_kube_nodes.rb | 2 +- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/build/common/installer/scripts/tomlparser-prom-customconfig.rb b/build/common/installer/scripts/tomlparser-prom-customconfig.rb index 15cf2d3c2..642eadc14 100644 --- a/build/common/installer/scripts/tomlparser-prom-customconfig.rb +++ b/build/common/installer/scripts/tomlparser-prom-customconfig.rb @@ -223,7 +223,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) File.open(file_name, "w") { |file| file.puts new_contents } puts "config::Successfully substituted the placeholders in telegraf conf file for replicaset" #Set environment variables for telemetry - file = File.open("prom_config_env_var", "w") + file = File.open("telemetry_prom_config_env_var", "w") if !file.nil? file.write("export TELEMETRY_RS_PROM_INTERVAL=\"#{interval}\"\n") #Setting array lengths as environment variables for telemetry purposes @@ -325,10 +325,10 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully substituted the placeholders in telegraf conf file for custom prometheus scraping" #Set environment variables for configuration and telemetry in the sidecar container if (!@containerType.nil? && @containerType.casecmp(@promSideCar) == 0) - file = File.open("prom_config_env_var", "w") + file = File.open("telemetry_prom_config_env_var", "w") if !file.nil? #Setting array lengths as environment variables for telemetry purposes - file.write("export CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") + file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS=\"#{monitorKubernetesPods}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH=\"#{monitorKubernetesPodsNamespacesLength}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_LABEL_SELECTOR_LENGTH=\"#{kubernetesLabelSelectorsLength}\"\n") file.write("export TELEMETRY_CUSTOM_PROM_FIELD_SELECTOR_LENGTH=\"#{kubernetesFieldSelectorsLength}\"\n") @@ -381,7 +381,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Successfully substituted the placeholders in telegraf conf file for daemonset" #Set environment variables for telemetry - file = File.open("prom_config_env_var", "w") + file = File.open("telemetry_prom_config_env_var", "w") if !file.nil? file.write("export TELEMETRY_DS_PROM_INTERVAL=\"#{interval}\"\n") #Setting array lengths as environment variables for telemetry purposes diff --git a/build/linux/installer/scripts/livenessprobe.sh b/build/linux/installer/scripts/livenessprobe.sh index 98e7a1019..3d74810d3 100644 --- a/build/linux/installer/scripts/livenessprobe.sh +++ b/build/linux/installer/scripts/livenessprobe.sh @@ -21,7 +21,7 @@ fi # if this is the prometheus sidecar and there are no prometheus metrics to scrape then the rest of the liveness probe doesn't apply if [[ "${CONTAINER_TYPE}" == "PrometheusSidecar" && "${MUTE_PROM_SIDECAR}" == "true" ]]; then - exit 0 + exit 0 fi #test to exit non zero value if mdsd is not running diff --git a/build/linux/installer/scripts/tomlparser-osm-config.rb b/build/linux/installer/scripts/tomlparser-osm-config.rb index 676ff8832..096064db8 100644 --- a/build/linux/installer/scripts/tomlparser-osm-config.rb +++ b/build/linux/installer/scripts/tomlparser-osm-config.rb @@ -151,7 +151,7 @@ def replaceOsmTelegrafConfigPlaceHolders telemetryFile = File.open("integration_osm_config_env_var", "w") if !telemetryFile.nil? - telemetryFile.write("export OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") + telemetryFile.write("export TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT=#{@osmMetricNamespaces.length}\n") # Close file after writing all environment variables telemetryFile.close else diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 4f303bbf5..3e25fc3a4 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -416,11 +416,11 @@ else fi #Sourcing environment variable file if it exists. This file has telemetry and whether kubernetes pods are monitored -if [ -e "prom_config_env_var" ]; then - cat prom_config_env_var | while read line; do +if [ -e "telemetry_prom_config_env_var" ]; then + cat telemetry_prom_config_env_var | while read line; do echo $line >> ~/.bashrc done - source prom_config_env_var + source telemetry_prom_config_env_var fi #Parse sidecar agent settings for custom configuration @@ -472,13 +472,15 @@ fi # If the prometheus sidecar isn't doing anything then there's no need to run mdsd and telegraf in it. if [[ ( "${CONTAINER_TYPE}" == "PrometheusSidecar" ) && - ( "${CUSTOM_PROM_MONITOR_PODS}" == "false" ) && - ( "${OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then + ( "${TELEMETRY_CUSTOM_PROM_MONITOR_PODS}" == "false" ) && + ( "${TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT}" -eq 0 ) ]]; then setGlobalEnvVar MUTE_PROM_SIDECAR true else setGlobalEnvVar MUTE_PROM_SIDECAR false fi +echo "MUTE_PROM_SIDECAR = $MUTE_PROM_SIDECAR" + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use secure port: 10250 @@ -807,7 +809,7 @@ if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "telegraf version: $(/opt/telegraf --version)" dpkg -l | grep td-agent-bit | awk '{print $2 " " $3}' else - echo "not starting telegraf (no metrics to scrape)" + echo "not starting telegraf (no metrics to scrape since MUTE_PROM_SIDECAR is true)" fi #dpkg -l | grep telegraf | awk '{print $2 " " $3}' @@ -823,6 +825,8 @@ service rsyslog status if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then checkAgentOnboardingStatus $AAD_MSI_AUTH_MODE 30 +else + echo "not checking onboarding status (no metrics to scrape since MUTE_PROM_SIDECAR is true)" fi shutdown() { diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index a0615b2f9..b4f8ab89d 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -362,7 +362,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { // Getting the namespace count, monitor kubernetes pods values and namespace count once at start because it wont change unless the configmap is applied and the container is restarted OSMNamespaceCount = 0 - osmNsCount := os.Getenv("OSM_CONFIGURATION_NAMESPACES_COUNT") + osmNsCount := os.Getenv("TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT") if osmNsCount != "" { OSMNamespaceCount, err = strconv.Atoi(osmNsCount) if err != nil { @@ -370,7 +370,7 @@ func InitializeTelemetryClient(agentVersion string) (int, error) { } } - PromMonitorPods = os.Getenv("CUSTOM_PROM_MONITOR_PODS") + PromMonitorPods = os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS") PromMonitorPodsNamespaceLength = 0 promMonPodsNamespaceLength := os.Getenv("TELEMETRY_CUSTOM_PROM_MONITOR_PODS_NS_LENGTH") diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 2a9df24a6..5a52a089b 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -47,7 +47,7 @@ def initialize(kubernetesApiClient = nil, @@rsPromMonitorPodsLabelSelectorLength = @env["TELEMETRY_RS_PROM_LABEL_SELECTOR_LENGTH"] @@rsPromMonitorPodsFieldSelectorLength = @env["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = @env["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] - @@osmNamespaceCount = @env["OSM_CONFIGURATION_NAMESPACES_COUNT"] + @@osmNamespaceCount = @env["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" From c00d7bea4cd1537408b47406c495f02815bd8715 Mon Sep 17 00:00:00 2001 From: Janvi Jatakia Date: Wed, 1 Jun 2022 08:45:50 -0700 Subject: [PATCH 21/21] Adding new found vulnerability --- .trivyignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.trivyignore b/.trivyignore index 3a8089422..f8c029116 100644 --- a/.trivyignore +++ b/.trivyignore @@ -13,4 +13,7 @@ CVE-2021-43809 CVE-2021-41816 CVE-2021-41819 CVE-2021-31799 -CVE-2021-28965 \ No newline at end of file +CVE-2021-28965 + +#dpkg vulnerability in ubuntu +CVE-2022-1664 \ No newline at end of file