From cacf15aee6910f29ef5e2d32b2bf57bc557c90dd Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 13 Jun 2021 22:58:38 -0700 Subject: [PATCH 01/28] changes related to aad msi auth feature --- .../installer/datafiles/base_container.data | 3 + kubernetes/linux/main.sh | 139 ++++-- kubernetes/linux/setup.sh | 2 +- kubernetes/omsagent.yaml | 7 + kubernetes/windows/main.ps1 | 24 + .../ci-extension-dcr-streams.md | 186 ++++++++ scripts/dcr-onboarding/ci-extension-dcr.json | 59 +++ source/plugins/go/src/extension/extension.go | 101 +++++ source/plugins/go/src/extension/interfaces.go | 34 ++ .../plugins/go/src/extension/socket_writer.go | 85 ++++ .../plugins/go/src/ingestion_token_utils.go | 429 ++++++++++++++++++ .../go/src/ingestion_token_utils_test.go | 54 +++ source/plugins/go/src/oms.go | 74 ++- source/plugins/go/src/utils.go | 43 +- .../ruby/ApplicationInsightsUtility.rb | 9 +- source/plugins/ruby/constants.rb | 23 + .../ruby/filter_health_model_builder.rb | 10 +- source/plugins/ruby/in_cadvisor_perf.rb | 14 +- source/plugins/ruby/in_containerinventory.rb | 10 +- source/plugins/ruby/in_kube_events.rb | 8 + source/plugins/ruby/in_kube_nodes.rb | 22 +- source/plugins/ruby/in_kube_podinventory.rb | 24 + source/plugins/ruby/in_kube_pvinventory.rb | 7 + .../plugins/ruby/in_kubestate_deployments.rb | 7 + source/plugins/ruby/in_kubestate_hpa.rb | 10 +- source/plugins/ruby/in_win_cadvisor_perf.rb | 14 +- source/plugins/ruby/out_mdm.rb | 13 +- source/plugins/utils/extension.rb | 77 ++++ source/plugins/utils/extension_utils.rb | 27 ++ 29 files changed, 1443 insertions(+), 72 deletions(-) create mode 100644 scripts/dcr-onboarding/ci-extension-dcr-streams.md create mode 100644 scripts/dcr-onboarding/ci-extension-dcr.json create mode 100644 source/plugins/go/src/extension/extension.go create mode 100644 source/plugins/go/src/extension/interfaces.go create mode 100644 source/plugins/go/src/extension/socket_writer.go create mode 100644 source/plugins/go/src/ingestion_token_utils.go create mode 100644 source/plugins/go/src/ingestion_token_utils_test.go create mode 100644 source/plugins/utils/extension.rb create mode 100644 source/plugins/utils/extension_utils.rb diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index de8ccbba0..5d7301aa3 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -146,6 +146,9 @@ MAINTAINER: 'Microsoft Corporation' /etc/fluent/plugin/omslog.rb; source/plugins/utils/omslog.rb; 644; root; root /etc/fluent/plugin/oms_common.rb; source/plugins/utils/oms_common.rb; 644; root; root +/etc/fluent/plugin/extension.rb; source/plugins/utils/extension.rb; 644; root; root +/etc/fluent/plugin/extension_utils.rb; source/plugins/utils/extension_utils.rb; 644; root; root + /etc/fluent/kube.conf; build/linux/installer/conf/kube.conf; 644; root; root /etc/fluent/container.conf; build/linux/installer/conf/container.conf; 644; root; root diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index b9e338fa9..7950f30fd 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -58,6 +58,10 @@ else echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc source ~/.bashrc echo "customResourceId:$customResourceId" + export customRegion=$AKS_REGION + echo "export customRegion=$AKS_REGION" >> ~/.bashrc + source ~/.bashrc + echo "customRegion:$customRegion" fi #set agent config schema version @@ -430,48 +434,105 @@ DOCKER_CIMPROV_VERSION=$(dpkg -l | grep docker-cimprov | awk '{print $3}') echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc -echo "*** activating oneagent in legacy auth mode ***" -CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" -#use the file path as its secure than env -CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" -cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc -done -source /etc/mdsd.d/envmdsd -echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" -export CIWORKSPACE_id=$CIWORKSPACE_id -echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc -export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile -echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc -export OMS_TLD=$domain -echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc -export MDSD_FLUENT_SOCKET_PORT="29230" -echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - -#skip imds lookup since not used in legacy auth path -export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc - -source ~/.bashrc -dpkg -l | grep mdsd | awk '{print $2 " " $3}' +# check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH +export AAD_MSI_AUTH_MODE=false +if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then + echo "*** activating oneagent in aad auth msi mode ***" + export AAD_MSI_AUTH_MODE=true + echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc + + cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc + done + source /etc/mdsd.d/envmdsd + + + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + export MCS_ENDPOINT="handler.control.monitor.azure.com" + echo "export MCS_ENDPOINT=$MCS_ENDPOINT" >> ~/.bashrc + export AZURE_ENDPOINT="https://monitor.azure.com/" + echo "export AZURE_ENDPOINT=$AZURE_ENDPOINT" >> ~/.bashrc + export ADD_REGION_TO_MCS_ENDPOINT="true" + echo "export ADD_REGION_TO_MCS_ENDPOINT=$ADD_REGION_TO_MCS_ENDPOINT" >> ~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc + #skip imds metadata lookup since we dont use + #legacy auth misnomer here + export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" + echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc -if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & -else - echo "starting mdsd in legacy auth mode in main container..." - # add -T 0xFFFF for full traces - mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + + dpkg -l | grep mdsd | awk '{print $2 " " $3}' + + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in aad auth msi mode in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd -a -A -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + else + echo "starting mdsd in aad auth msi mode in main container..." + # add -T 0xFFFF for full traces + mdsd -a -A -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + fi + +else + echo "*** activating oneagent in legacy auth mode ***" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc + done + source /etc/mdsd.d/envmdsd + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc + export OMS_TLD=$domain + echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + + #skip imds lookup since not used in legacy auth path + export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" + echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc + + source ~/.bashrc + + dpkg -l | grep mdsd | awk '{print $2 " " $3}' + + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + else + echo "starting mdsd in legacy auth mode in main container..." + # add -T 0xFFFF for full traces + mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & + fi fi # no dependency on fluentd for prometheus side car container diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 17cfb3f77..0276ed7af 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -10,7 +10,7 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ update-locale LANG=en_US.UTF-8 #install oneagent - Official bits (05/17/2021) -wget https://github.com/microsoft/Docker-Provider/releases/download/05172021-oneagent/azure-mdsd_1.10.1-build.master.213_x86_64.deb +wget https://github.com/microsoft/Docker-Provider/raw/gangams/ci-aad-auth-msi/oneagent-dev/azure-mdsd_1.10.1-build.dev_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 617c81f38..0a8b39426 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -811,6 +811,9 @@ spec: - mountPath: C:\etc\config\adx name: omsagent-adx-secret readOnly: true + - mountPath: C:\etc\IMDS-access-token + name: imds-token + readOnly: true # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds # - mountPath: C:\ca # name: ca-certs @@ -871,6 +874,10 @@ spec: secret: secretName: omsagent-adx-secret optional: true + - name: imds-token + secret: + secretName: omsagent-aad-msi-token + optional: true --- kind: Service apiVersion: v1 diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index bc053b0d6..c28554da9 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -43,6 +43,7 @@ function Start-FileSystemWatcher { function Set-EnvironmentVariables { $domain = "opinsights.azure.com" + $mcs_endpoint = "monitor.azure.com" $cloud_environment = "public" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging @@ -53,11 +54,24 @@ function Set-EnvironmentVariables { # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") + + # Set MCS Endpoint + [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Process") + [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Machine") # Set CLOUD_ENVIRONMENT [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") + + # Set Region + [System.Environment]::SetEnvironmentVariable("customRegion", $env:AKS_REGION, "Process") + [System.Environment]::SetEnvironmentVariable("customRegion", $env:AKS_REGION, "Machine") + + # Set resource ID + [System.Environment]::SetEnvironmentVariable("customResourceId", $env:AKS_RESOURCE_ID, "Process") + [System.Environment]::SetEnvironmentVariable("customResourceId", $env:AKS_RESOURCE_ID, "Machine") + $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { # TODO: Change to omsagent-secret before merging @@ -228,6 +242,16 @@ function Set-EnvironmentVariables { else { Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" } + # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH environment variable + $isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH", "process") + if (![string]::IsNullOrEmpty($isAADMSIAuth) -and ($isAADMSIAuth -eq "true")) { + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", "true", "Process") + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", "true", "Machine") + Write-Host "Using AAD MSI auth" + } + else { + Write-Host "Using LA Legacy Auth" + } # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb diff --git a/scripts/dcr-onboarding/ci-extension-dcr-streams.md b/scripts/dcr-onboarding/ci-extension-dcr-streams.md new file mode 100644 index 000000000..cbac41838 --- /dev/null +++ b/scripts/dcr-onboarding/ci-extension-dcr-streams.md @@ -0,0 +1,186 @@ +# 1 - ContainerLogV2 +> Note- Please note, this table uses NG schema +``` +stream-id: Microsoft-ContainerLogV2 +data-type: CONTAINERINSIGHTS_CONTAINERLOGV2 +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerLogV2 +alias-stream-id: Microsoft-ContainerLogV2 +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 2 - InsightsMetrics +``` +stream-id: Microsoft-InsightsMetrics +data-type: INSIGHTS_METRICS_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: InsightsMetrics +alias-stream-id: Microsoft-InsightsMetrics +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 3 - ContainerInventory + +``` +stream-id: Microsoft-ContainerInventory +data-type: CONTAINER_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerInventory +alias-stream-id: Microsoft-ContainerInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 4 - ContainerLog + +``` +stream-id: Microsoft-ContainerLog +data-type: CONTAINER_LOG_BLOB +intelligence-pack: Containers +solutions: ContainerInsights +platform: Any +la-table-name: ContainerLog +alias-stream-id: Microsoft-ContainerLog +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 5 - ContainerNodeInventory + +``` +stream-id: Microsoft-ContainerNodeInventory +data-type: CONTAINER_NODE_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: ContainerNodeInventory +alias-stream-id: Microsoft-ContainerNodeInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 6 - KubePodInventory +``` +stream-id: Microsoft-KubePodInventory +data-type: KUBE_POD_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubePodInventory +alias-stream-id: Microsoft-KubePodInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 7 - KubeNodeInventory +``` +stream-id: Microsoft-KubeNodeInventory +data-type: KUBE_NODE_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeNodeInventory +alias-stream-id: Microsoft-KubeNodeInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 8 - KubePVInventory +``` +stream-id: Microsoft-KubePVInventory +data-type: KUBE_PV_INVENTORY_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubePVInventory +alias-stream-id: Microsoft-KubePVInventory +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 9 - KubeEvents +``` +stream-id: Microsoft-KubeEvents +data-type: KUBE_EVENTS_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeEvents +alias-stream-id: Microsoft-KubeEvents +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 10 - KubeServices +``` +stream-id: Microsoft-KubeServices +data-type: KUBE_SERVICES_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeServices +alias-stream-id: Microsoft-KubeServices +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 11 - KubeMonAgentEvents +``` +stream-id: Microsoft-KubeMonAgentEvents +data-type: KUBE_MON_AGENT_EVENTS_BLOB +intelligence-pack: Containers +solutions: ContainerInsights +platform: Any +la-table-name: KubeMonAgentEvents +alias-stream-id: Microsoft-KubeMonAgentEvents +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 12 - KubeHealth +``` +stream-id: Microsoft-KubeHealth +data-type: KUBE_HEALTH_BLOB +intelligence-pack: ContainerInsights +solutions: ContainerInsights +platform: Any +la-table-name: KubeHealth +alias-stream-id: Microsoft-KubeHealth +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` + +# 13 - Perf +``` +> Note - This stream already exists +stream-id: Microsoft-Perf +data-type: LINUX_PERF_BLOB +intelligence-pack: LogManagement +solutions: ContainerInsights +platform: Any +la-table-name: LogManagement +alias-stream-id: Microsoft-Perf +contact-alias: OMScontainers@microsoft.com +stage: to review +tags: agent +``` diff --git a/scripts/dcr-onboarding/ci-extension-dcr.json b/scripts/dcr-onboarding/ci-extension-dcr.json new file mode 100644 index 000000000..03eba1400 --- /dev/null +++ b/scripts/dcr-onboarding/ci-extension-dcr.json @@ -0,0 +1,59 @@ +{ + "location": "", + "properties": { + "dataSources": { + "extensions": [ + { + "name": "ContainerInsightsExtension", + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + + ], + "extensionName": "ContainerInsights" + } + ] + }, + "destinations": { + "logAnalytics": [ + { + "workspaceResourceId": "/subscriptions//resourcegroups//providers/microsoft.operationalinsights/workspaces/", + "name": "ciworkspace" + } + ] + }, + "dataFlows": [ + { + "streams": [ + "Microsoft-Perf", + "Microsoft-ContainerInventory", + "Microsoft-ContainerLog", + "Microsoft-ContainerLogV2", + "Microsoft-ContainerNodeInventory", + "Microsoft-KubeEvents", + "Microsoft-KubeHealth", + "Microsoft-KubeMonAgentEvents", + "Microsoft-KubeNodeInventory", + "Microsoft-KubePodInventory", + "Microsoft-KubePVInventory", + "Microsoft-KubeServices", + "Microsoft-InsightsMetrics" + ], + "destinations": [ + "ciworkspace" + ] + } + ] + } +} \ No newline at end of file diff --git a/source/plugins/go/src/extension/extension.go b/source/plugins/go/src/extension/extension.go new file mode 100644 index 000000000..c68140ded --- /dev/null +++ b/source/plugins/go/src/extension/extension.go @@ -0,0 +1,101 @@ +package extension + +import ( + "encoding/json" + "fmt" + "log" + "sync" + "strings" + uuid "github.com/google/uuid" + "github.com/ugorji/go/codec" +) + +type Extension struct { + datatypeStreamIdMap map[string]string +} + +var singleton *Extension +var once sync.Once +var extensionconfiglock sync.Mutex +var logger *log.Logger +var containerType string + +func GetInstance(flbLogger *log.Logger, containerType string) *Extension { + once.Do(func() { + singleton = &Extension{make(map[string]string)} + flbLogger.Println("Extension Instance created") + }) + logger = flbLogger + containerType = containerType + return singleton +} + +func (e *Extension) GetOutputStreamId(datatype string) string { + extensionconfiglock.Lock() + defer extensionconfiglock.Unlock() + if len(e.datatypeStreamIdMap) > 0 && e.datatypeStreamIdMap[datatype] != "" { + message := fmt.Sprintf("OutputstreamId: %s for the datatype: %s", e.datatypeStreamIdMap[datatype], datatype) + logger.Printf(message) + return e.datatypeStreamIdMap[datatype] + } + var err error + e.datatypeStreamIdMap, err = getDataTypeToStreamIdMapping() + if err != nil { + message := fmt.Sprintf("Error getting datatype to streamid mapping: %s", err.Error()) + logger.Printf(message) + } + return e.datatypeStreamIdMap[datatype] +} + +func getDataTypeToStreamIdMapping() (map[string]string, error) { + logger.Printf("extensionconfig::getDataTypeToStreamIdMapping:: getting extension config from fluent socket - start") + guid := uuid.New() + datatypeOutputStreamMap := make(map[string]string) + + taggedData := map[string]interface{}{"Request": "AgentTaggedData", "RequestId": guid.String(), "Tag": "ContainerInsights", "Version": "1"} + jsonBytes, err := json.Marshal(taggedData) + + var data []byte + enc := codec.NewEncoderBytes(&data, new(codec.MsgpackHandle)) + if err := enc.Encode(string(jsonBytes)); err != nil { + return datatypeOutputStreamMap, err + } + + fs := &FluentSocketWriter{ } + fs.sockAddress = "/var/run/mdsd/default_fluent.socket" + if containerType != "" && strings.Compare(strings.ToLower(containerType), "prometheussidecar") == 0 { + fs.sockAddress = fmt.Sprintf("/var/run/mdsd-%s/default_fluent.socket", containerType) + } + responseBytes, err := fs.WriteAndRead(data) + defer fs.disConnect() + logger.Printf("Info::mdsd::Making call to FluentSocket: %s to write and read the config data", fs.sockAddress) + if err != nil { + return datatypeOutputStreamMap, err + } + response := string(responseBytes) + + var responseObjet AgentTaggedDataResponse + err = json.Unmarshal([]byte(response), &responseObjet) + if err != nil { + logger.Printf("Error::mdsd::Failed to unmarshal config data. Error message: %s", string(err.Error())) + return datatypeOutputStreamMap, err + } + + var extensionData TaggedData + json.Unmarshal([]byte(responseObjet.TaggedData), &extensionData) + + extensionConfigs := extensionData.ExtensionConfigs + logger.Printf("Info::mdsd::build the datatype and streamid map -- start") + for _, extensionConfig := range extensionConfigs { + outputStreams := extensionConfig.OutputStreams + for dataType, outputStreamID := range outputStreams { + logger.Printf("Info::mdsd::datatype: %s, outputstreamId: %s", dataType, outputStreamID) + datatypeOutputStreamMap[dataType] = outputStreamID.(string) + } + } + logger.Printf("Info::mdsd::build the datatype and streamid map -- end") + + logger.Printf("extensionconfig::getDataTypeToStreamIdMapping:: getting extension config from fluent socket-end") + + return datatypeOutputStreamMap, nil +} diff --git a/source/plugins/go/src/extension/interfaces.go b/source/plugins/go/src/extension/interfaces.go new file mode 100644 index 000000000..c70ef17b8 --- /dev/null +++ b/source/plugins/go/src/extension/interfaces.go @@ -0,0 +1,34 @@ +package extension + +// AgentTaggedDataResponse struct for response from AgentTaggedData request +type AgentTaggedDataResponse struct { + Request string `json:"Request"` + RequestID string `json:"RequestId"` + Version string `json:"Version"` + Success bool `json:"Success"` + Description string `json:"Description"` + TaggedData string `json:"TaggedData"` +} + +// TaggedData structure for respone +type TaggedData struct { + SchemaVersion int `json:"schemaVersion"` + Version int `json:"version"` + ExtensionName string `json:"extensionName"` + ExtensionConfigs []ExtensionConfig `json:"extensionConfigurations"` + OutputStreamDefinitions map[string]StreamDefinition `json:"outputStreamDefinitions"` +} + +// StreamDefinition structure for named pipes +type StreamDefinition struct { + NamedPipe string `json:"namedPipe"` +} + +// ExtensionConfig structure for extension definition in DCR +type ExtensionConfig struct { + ID string `json:"id"` + OriginIds []string `json:"originIds"` + ExtensionSettings map[string]interface{} `json:"extensionSettings"` + InputStreams map[string]interface{} `json:"inputStreams"` + OutputStreams map[string]interface{} `json:"outputStreams"` +} diff --git a/source/plugins/go/src/extension/socket_writer.go b/source/plugins/go/src/extension/socket_writer.go new file mode 100644 index 000000000..1b16b319c --- /dev/null +++ b/source/plugins/go/src/extension/socket_writer.go @@ -0,0 +1,85 @@ +package extension + +import ( + "net" +) + +//MaxRetries for trying to write data to the socket +const MaxRetries = 5 + +//ReadBufferSize for reading data from sockets +//Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios +const ReadBufferSize = 20480 + +//FluentSocketWriter writes data to AMA's default fluent socket +type FluentSocketWriter struct { + socket net.Conn + sockAddress string +} + +func (fs *FluentSocketWriter) connect() error { + c, err := net.Dial("unix", fs.sockAddress) + if err != nil { + return err + } + fs.socket = c + return nil +} + +func (fs *FluentSocketWriter) disConnect() error { + if (fs.socket != nil) { + fs.socket.Close() + fs.socket = nil + } + return nil +} + +func (fs *FluentSocketWriter) writeWithRetries(data []byte) (int, error) { + var ( + err error + n int + ) + for i := 0; i < MaxRetries; i++ { + n, err = fs.socket.Write(data) + if err == nil { + return n, nil + } + } + if err, ok := err.(net.Error); !ok || !err.Temporary() { + // so that connect() is called next time if write fails + // this happens when mdsd is restarted + _ = fs.socket.Close() // no need to log the socket closing error + fs.socket = nil + } + return 0, err +} + +func (fs *FluentSocketWriter) read() ([]byte, error) { + buf := make([]byte, ReadBufferSize) + n, err := fs.socket.Read(buf) + if err != nil { + return nil, err + } + return buf[:n], nil + +} + +func (fs *FluentSocketWriter) Write(payload []byte) (int, error) { + if fs.socket == nil { + // previous write failed with permanent error and socket was closed. + if err := fs.connect(); err != nil { + return 0, err + } + } + + return fs.writeWithRetries(payload) +} + +//WriteAndRead writes data to the socket and sends the response back +func (fs *FluentSocketWriter) WriteAndRead(payload []byte) ([]byte, error) { + _, err := fs.Write(payload) + if err != nil { + return nil, err + } + return fs.read() +} diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go new file mode 100644 index 000000000..4dffd7974 --- /dev/null +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -0,0 +1,429 @@ +package main + +import ( + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "os" + "regexp" + "strconv" + "strings" + "time" +) + +const IMDSTokenPathForWindows = "c:/etc/imds-access-token/token" // only used in windows + +var IMDSToken string +var IMDSTokenExpiration int64 + +var ConfigurationId string +var ChannelId string + +var IngestionAuthToken string +var IngestionAuthTokenExpiration int64 + +type IMDSResponse struct { + AccessToken string `json:"access_token"` + ClientID string `json:"client_id"` + ExpiresIn string `json:"expires_in"` + ExpiresOn string `json:"expires_on"` + ExtExpiresIn string `json:"ext_expires_in"` + NotBefore string `json:"not_before"` + Resource string `json:"resource"` + TokenType string `json:"token_type"` +} + +type AgentConfiguration struct { + Configurations []struct { + Configurationid string `json:"configurationId"` + Etag string `json:"eTag"` + Op string `json:"op"` + Content struct { + Datasources []struct { + Configuration struct { + Extensionname string `json:"extensionName"` + } `json:"configuration"` + ID string `json:"id"` + Kind string `json:"kind"` + Streams []struct { + Stream string `json:"stream"` + Solution string `json:"solution"` + Extensionoutputstream string `json:"extensionOutputStream"` + } `json:"streams"` + Sendtochannels []string `json:"sendToChannels"` + } `json:"dataSources"` + Channels []struct { + Endpoint string `json:"endpoint"` + ID string `json:"id"` + Protocol string `json:"protocol"` + } `json:"channels"` + Extensionconfigurations struct { + Containerinsights []struct { + ID string `json:"id"` + Originids []string `json:"originIds"` + //TODO: make this a map so that if more types are added in the future it won't break. Also test if removing a type breaks the json deserialization + Outputstreams struct { + LinuxPerfBlob string `json:"LINUX_PERF_BLOB"` + ContainerInventoryBlob string `json:"CONTAINER_INVENTORY_BLOB"` + ContainerLogBlob string `json:"CONTAINER_LOG_BLOB"` + ContainerinsightsContainerlogv2 string `json:"CONTAINERINSIGHTS_CONTAINERLOGV2"` + ContainerNodeInventoryBlob string `json:"CONTAINER_NODE_INVENTORY_BLOB"` + KubeEventsBlob string `json:"KUBE_EVENTS_BLOB"` + KubeHealthBlob string `json:"KUBE_HEALTH_BLOB"` + KubeMonAgentEventsBlob string `json:"KUBE_MON_AGENT_EVENTS_BLOB"` + KubeNodeInventoryBlob string `json:"KUBE_NODE_INVENTORY_BLOB"` + KubePodInventoryBlob string `json:"KUBE_POD_INVENTORY_BLOB"` + KubePvInventoryBlob string `json:"KUBE_PV_INVENTORY_BLOB"` + KubeServicesBlob string `json:"KUBE_SERVICES_BLOB"` + InsightsMetricsBlob string `json:"INSIGHTS_METRICS_BLOB"` + } `json:"outputStreams"` + } `json:"ContainerInsights"` + } `json:"extensionConfigurations"` + } `json:"content"` + } `json:"configurations"` +} + +type IngestionTokenResponse struct { + Configurationid string `json:"configurationId"` + Ingestionauthtoken string `json:"ingestionAuthToken"` +} + +func getIngestionToken() (authToken string, err error) { + // check if the ingestion token has not been fetched yet or is expiring soon. If so then re-fetch it first. + // TODO: re-fetch token in a new thread if it is about to expire (don't block the main thread) + + if IMDSToken == "" || IMDSTokenExpiration <= time.Now().Unix()+30 { // refresh the token 30 seconds before it expires + IMDSToken, IMDSTokenExpiration, err = getAccessTokenFromIMDS(true) //TODO: read from an env variable? Or maybe from OS? + if err != nil { + message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) + Log(message) + SendException(message) + return "", err + } + } + + // ignore agent configuration expiring, the configuration and channel IDs will never change (without creating an agent restart) + if ConfigurationId == "" || ChannelId == "" { + ConfigurationId, ChannelId, _, err = getAgentConfiguration(IMDSToken) + if err != nil { + message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) + Log(message) + SendException(message) + return "", err + } + } + + if IngestionAuthToken == "" || IngestionAuthTokenExpiration <= time.Now().Unix()+10 { + IngestionAuthToken, IngestionAuthTokenExpiration, err = getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) + if err != nil { + message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) + Log(message) + SendException(message) + return "", err + } + } + + return IngestionAuthToken, nil +} + +func getAccessTokenFromIMDS(fromfile bool) (string, int64, error) { + Log("Info getAccessTokenFromIMDS: start") + MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + imdsAccessToken := "" + + var responseBytes []byte + var err error + + if fromfile { + // only used in windows + responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) + if err != nil { + Log("getAccessTokenFromIMDS: Could not read IMDS token from file", err) + return imdsAccessToken, 0, err + } + } else { + var msi_endpoint *url.URL + msi_endpoint, err := url.Parse("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://" + MCS_ENDPOINT + "/") + if err != nil { + Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error()) + return imdsAccessToken, 0, err + } + req, err := http.NewRequest("GET", msi_endpoint.String(), nil) + if err != nil { + Log("getAccessTokenFromIMDS: Error creating HTTP request: %s", err.Error()) + return imdsAccessToken, 0, err + } + req.Header.Add("Metadata", "true") + + // Call managed services for Azure resources token endpoint + var resp *http.Response = nil + for i := 0; i < 3; i++ { + // client := &http.Client{} + resp, err = HTTPClient.Do(req) + if err != nil { + message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s", err.Error()) + Log(message) + SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. + continue + } + //TODO: is this the best place to defer closing the response body? + defer resp.Body.Close() + + Log("getAccessTokenFromIMDS: IMDS Response Status: %d", resp.StatusCode) + if resp.StatusCode != 200 { + message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code : %d", resp.StatusCode) + Log(message) + SendException(message) + continue + } + break // call succeeded, don't retry any more + } + if resp == nil { + Log("getAccessTokenFromIMDS: IMDS Request ran out of retries") + return imdsAccessToken, 0, err + } + + // Pull out response body + responseBytes, err = ioutil.ReadAll(resp.Body) + if err != nil { + Log("getAccessTokenFromIMDS: Error reading response body : %s", err.Error()) + return imdsAccessToken, 0, err + } + } + + // Unmarshall response body into struct + var imdsResponse IMDSResponse + err = json.Unmarshal(responseBytes, &imdsResponse) + if err != nil { + Log("getAccessTokenFromIMDS: Error unmarshalling the response: %s", err.Error()) + return imdsAccessToken, 0, err + } + imdsAccessToken = imdsResponse.AccessToken + + expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) + if err != nil { + Log("Error reading expiration time from response header body: %s", err.Error()) + return imdsAccessToken, 0, err + } + + Log("IMDS Token obtained: %s", imdsAccessToken) + + Log("Info getAccessTokenFromIMDS: end") + + return imdsAccessToken, expiration, nil +} + +func getAgentConfiguration(imdsAccessToken string) (configurationId string, channelId string, expiration int64, err error) { + Log("Info getAgentConfiguration: start") + configurationId = "" + channelId = "" + apiVersion := "2020-08-01-preview" + var amcs_endpoint *url.URL + resourceId := os.Getenv("customResourceId") + resourceRegion := os.Getenv("customRegion") + MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations?platform=windows&api-version=%s", resourceRegion, resourceId, apiVersion) + amcs_endpoint, err = url.Parse(amcs_endpoint_string) + + var bearer = "Bearer " + imdsAccessToken + // Create a new request using http + req, err := http.NewRequest("GET", amcs_endpoint.String(), nil) + if err != nil { + message := fmt.Sprintf("Error creating HTTP request for AMCS endpoint: %s", err.Error()) + Log(message) + return configurationId, channelId, 0, err + } + + // add authorization header to the req + req.Header.Add("Authorization", bearer) + + var resp *http.Response = nil + + for i := 0; i < 3; i++ { + // Call managed services for Azure resources token endpoint + // client := &http.Client{} + resp, err = HTTPClient.Do(req) + if err != nil { + message := fmt.Sprintf("Error calling amcs endpoint: %s", err.Error()) + Log(message) + SendException(message) + continue + } + + //TODO: is this the best place to defer closing the response body? + defer resp.Body.Close() + + Log("getAgentConfiguration Response Status: %d", resp.StatusCode) + if resp.StatusCode != 200 { + message := fmt.Sprintf("getAgentConfiguration Request failed with an error code : %d", resp.StatusCode) + Log(message) + SendException(message) + continue + } + break // call succeeded, don't retry any more + } + if resp == nil { + Log("getAgentConfiguration Request ran out of retries") + return configurationId, channelId, 0, err + } + + // get content timeout + expiration, err = getExpirationFromAmcsResponse(resp.Header) + if err != nil { + Log("getAgentConfiguration failed to parse auth token timeout") + return configurationId, channelId, 0, err + } + Log(fmt.Sprintf("getAgentConfiguration: DCR expires at %d seconds (unix timestamp)", expiration)) + + // Pull out response body + //TODO: does responseBytes need to be closed? + responseBytes, err := ioutil.ReadAll(resp.Body) + if err != nil { + Log("Error reading response body from AMCS API call : %s", err.Error()) + return configurationId, channelId, expiration, err + } + + // Unmarshall response body into struct + var agentConfiguration AgentConfiguration + err = json.Unmarshal(responseBytes, &agentConfiguration) + if err != nil { + Log("Error unmarshalling the response: %s", err.Error()) + return configurationId, channelId, expiration, err + } + + if len(agentConfiguration.Configurations) == 0 { + Log("Received empty agentConfiguration.Configurations array") + return configurationId, channelId, expiration, err + } + + if len(agentConfiguration.Configurations[0].Content.Channels) == 0 { + Log("Received empty agentConfiguration.Configurations[0].Content.Channels") + return configurationId, channelId, expiration, err + } + + configurationId = agentConfiguration.Configurations[0].Configurationid + channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID + + Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) + + Log("Info getAgentConfiguration: end") + + return configurationId, channelId, expiration, nil +} + +func getIngestionAuthToken(imdsAccessToken string, configurationId string, channelId string) (ingestionAuthToken string, expiration int64, err error) { + Log("Info getIngestionAuthToken: start") + ingestionAuthToken = "" + var amcs_endpoint *url.URL + resourceId := os.Getenv("customResourceId") + resourceRegion := os.Getenv("customRegion") + MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + apiVersion := "2020-04-01-preview" + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=windows&api-version=%s", resourceRegion, resourceId, configurationId, channelId, apiVersion) + amcs_endpoint, err = url.Parse(amcs_endpoint_string) + + var bearer = "Bearer " + imdsAccessToken + + // Create a new request using http + req, err := http.NewRequest("GET", amcs_endpoint.String(), nil) + if err != nil { + Log("Error creating HTTP request for AMCS endpoint: %s", err.Error()) + return ingestionAuthToken, 0, err + } + + // add authorization header to the req + req.Header.Add("Authorization", bearer) + + var resp *http.Response = nil + + for i := 0; i < 3; i++ { + // Call managed services for Azure resources token endpoint + resp, err = HTTPClient.Do(req) + if err != nil { + message := fmt.Sprintf("Error calling amcs endpoint for ingestion auth token: %s", err.Error()) + Log(message) + SendException(message) + resp = nil + continue + } + + //TODO: is this the best place to defer closing the response body? + defer resp.Body.Close() + + Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) + if resp.StatusCode != 200 { + message := fmt.Sprintf("getIngestionAuthToken Request failed with an error code : %d", resp.StatusCode) + Log(message) + SendException(message) + resp = nil + continue + } + break + } + + if resp == nil { + Log("ran out of retries calling AMCS for ingestion token") + return ingestionAuthToken, 0, err + } + + expiration, err = getExpirationFromAmcsResponse(resp.Header) + if err != nil { + Log("getIngestionAuthToken failed to parse auth token expiration time") + return ingestionAuthToken, 0, err + } + Log("getIngestionAuthToken: token times out at time %d (unix timestamp)", expiration) + + // Pull out response body + responseBytes, err := ioutil.ReadAll(resp.Body) + if err != nil { + Log("Error reading response body from AMCS Ingestion API call : %s", err.Error()) + return ingestionAuthToken, expiration, err + } + + // Unmarshall response body into struct + var ingestionTokenResponse IngestionTokenResponse + err = json.Unmarshal(responseBytes, &ingestionTokenResponse) + if err != nil { + Log("Error unmarshalling the response: %s", err.Error()) + return ingestionAuthToken, expiration, err + } + + ingestionAuthToken = ingestionTokenResponse.Ingestionauthtoken + + // Log("ingestionAuthToken obtained: %s", ingestionAuthToken) + + Log("Info getIngestionAuthToken: end") + return ingestionAuthToken, expiration, nil +} + +var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`) + +func getExpirationFromAmcsResponse(header http.Header) (bestBeforeTime int64, err error) { + // Get a timeout from a AMCS HTTP response header + timeout := 0 + + cacheControlHeader, valueInMap := header["Cache-Control"] + if !valueInMap { + return 0, errors.New("Cache-Control not in passed header") + } + + for _, entry := range cacheControlHeader { + match := cacheControlHeaderRegex.FindStringSubmatch(entry) + if len(match) == 2 { + timeout, err = strconv.Atoi(match[1]) + if err != nil { + Log("getIngestionAuthToken: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) + return 0, err + } + + bestBeforeTime = time.Now().Unix() + int64(timeout) + + return bestBeforeTime, nil + } + } + + return 0, errors.New("didn't find timeout in header") +} diff --git a/source/plugins/go/src/ingestion_token_utils_test.go b/source/plugins/go/src/ingestion_token_utils_test.go new file mode 100644 index 000000000..f987a2408 --- /dev/null +++ b/source/plugins/go/src/ingestion_token_utils_test.go @@ -0,0 +1,54 @@ +package main + +import ( + "net/http" + "testing" + "time" +) + +// TODO: add unit tests for retry code. + +type headerElementTestType = struct { + in http.Header + want int64 + wantError bool +} + +func TestGetTimeoutFromAmcsResponse(t *testing.T) { + + var cases []headerElementTestType + + cases = append(cases, headerElementTestType{http.Header{ + "Request-Context": {"appId=cid-v1:*******-*********-*********"}, + "Api-Supported-Versions": {"2019-11-02-preview", "2020-04-01-preview", "2020-08-01-preview"}, + "Date": {"Thu, 06 May 2021 21:17:42 GMT"}, + "Cache-Control": {"public", "max-age=3600"}, + "Content-Type": {"application/json; charset=utf-8"}, + "Vary": {"Accept-Encoding"}, + "Server": {"Microsoft-HTTPAPI/2.0"}, + }, 3600, false}) + + cases = append(cases, headerElementTestType{http.Header{ + "Request-Context": {"appId=cid-v1:*******-*********-*********"}, + "Api-Supported-Versions": {"2019-11-02-preview", "2020-04-01-preview", "2020-08-01-preview"}, + "Date": {"Thu, 06 May 2021 21:17:42 GMT"}, + "Cache-Control": {"public", "max-age=300"}, + "Content-Type": {"application/json; charset=utf-8"}, + "Vary": {"Accept-Encoding"}, + "Server": {"Microsoft-HTTPAPI/2.0"}, + }, 300, false}) + + cases = append(cases, headerElementTestType{http.Header{ + "Request-Context": {"appId=cid-v1:*******-*********-*********"}, + }, 0, true}) + + for _, c := range cases { + got, err := getExpirationFromAmcsResponse(c.in) + if (err != nil) != c.wantError { + t.Errorf("getTimeoutFromAmcsResponse() did not return correct error, expected: %v, got %s", c.wantError, err) + } else if err == nil && (got < c.want+time.Now().Unix()-1 || got > c.want+time.Now().Unix()) { + // Adding a range of 1 second should be enough to keep this test from being flaky, but it might have to be re-thought. + t.Errorf("getTimeoutFromAmcsResponse() (%v) == %d, want %d", c.in, got, c.want+time.Now().Unix()) + } + } +} diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 25f364c55..814f411ba 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -22,6 +22,7 @@ import ( "github.com/tinylib/msgp/msgp" lumberjack "gopkg.in/natefinch/lumberjack.v2" + "Docker-Provider/source/plugins/go/src/extension" "github.com/Azure/azure-kusto-go/kusto/ingest" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -106,6 +107,11 @@ const ContainerLogsV1Route = "v1" //container logs schema (v2=ContainerLogsV2 table in LA, anything else ContainerLogs table in LA. This is applicable only if Container logs route is NOT ADX) const ContainerLogV2SchemaVersion = "v2" +//env variable for AAD MSI Auth mode +const AADMSIAuthMode = "AAD_MSI_AUTH_MODE" + +// Tag prefix of mdsd output streamid for AMA in MSI auth mode +const MdsdOutputStreamIdTagPrefix = "dcr-" //env variable to container type const ContainerTypeEnv = "CONTAINER_TYPE" @@ -168,7 +174,9 @@ var ( // flag to check if its Windows OS IsWindows bool // container type - ContainerType string + ContainerType string + // flag to check whether LA AAD MSI Auth Enabled or not + IsAADMSIAuthMode bool ) var ( @@ -702,7 +710,11 @@ func flushKubeMonAgentEventRecords() { } } } - if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if (IsWindows == false && len(msgPackEntries) > 0) { //for linux, mdsd route + if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdKubeMonAgentEventsTagName, MdsdOutputStreamIdTagPrefix) == false { + Log("Info::mdsd::obtaining output stream id for data type: %s", KubeMonAgentEventDataType) + MdsdKubeMonAgentEventsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(KubeMonAgentEventDataType) + } Log("Info::mdsd:: using mdsdsource name for KubeMonAgentEvents: %s", MdsdKubeMonAgentEventsTagName) msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdKubeMonAgentEventsTagName, msgPackEntries) if MdsdKubeMonMsgpUnixSocketClient == nil { @@ -760,6 +772,17 @@ func flushKubeMonAgentEventRecords() { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + ingestionAuthToken, err := getIngestionToken() + if err != nil { + Log("failed to get ODS Ingestion Auth Token (even after retries)") + message := fmt.Sprintf("Error string: %s \n", err.Error()) + Log(message) + } + // add authorization header to the req + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) @@ -904,7 +927,11 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int } } } - if (len(msgPackEntries) > 0) { + if (len(msgPackEntries) > 0) { + if IsAADMSIAuthMode == true && (strings.HasPrefix(MdsdInsightsMetricsTagName, MdsdOutputStreamIdTagPrefix) == false) { + Log("Info::mdsd::obtaining output stream id for InsightsMetricsDataType since Log Analytics AAD MSI Auth Enabled") + MdsdInsightsMetricsTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(InsightsMetricsDataType) + } msgpBytes := convertMsgPackEntriesToMsgpBytes(MdsdInsightsMetricsTagName, msgPackEntries) if MdsdInsightsMetricsMsgpUnixSocketClient == nil { Log("Error::mdsd::mdsd connection does not exist. re-connecting ...") @@ -979,6 +1006,17 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int if ResourceCentric == true { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + ingestionAuthToken, err := getIngestionToken() + if err != nil { + Log("failed to get ODS Ingestion Auth Token (even after retries)") + message := fmt.Sprintf("Error string: %s \n", err.Error()) + Log(message) + return output.FLB_RETRY + } + // add authorization header to the req + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } start := time.Now() resp, err := HTTPClient.Do(req) @@ -1184,6 +1222,16 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if len(msgPackEntries) > 0 && ContainerLogsRouteV2 == true { //flush to mdsd + if IsAADMSIAuthMode == true && strings.HasPrefix(MdsdContainerLogTagName, MdsdOutputStreamIdTagPrefix) == false { + Log("Info::mdsd::obtaining output stream id") + if ContainerLogSchemaV2 == true { + MdsdContainerLogTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(ContainerLogV2DataType) + } else { + MdsdContainerLogTagName = extension.GetInstance(FLBLogger, ContainerType).GetOutputStreamId(ContainerLogDataType) + } + Log("Info::mdsd:: using mdsdsource name: %s", MdsdContainerLogTagName) + } + fluentForward := MsgPackForward{ Tag: MdsdContainerLogTagName, Entries: msgPackEntries, @@ -1343,6 +1391,19 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { req.Header.Set("x-ms-AzureResourceId", ResourceID) } + if IsAADMSIAuthMode == true { + ingestionAuthToken, err := getIngestionToken() + if err != nil { + Log("failed to get ODS Ingestion Auth Token (even after retries)") + message := fmt.Sprintf("Error string: %s \n", err.Error()) + Log(message) + return output.FLB_RETRY + } + + // add authorization header to the req + req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) + } + resp, err := HTTPClient.Do(req) elapsed = time.Since(start) @@ -1439,8 +1500,7 @@ func GetContainerIDK8sNamespacePodNameFromFileName(filename string) (string, str } // InitializePlugin reads and populates plugin configuration -func InitializePlugin(pluginConfPath string, agentVersion string) { - +func InitializePlugin(pluginConfPath string, agentVersion string) { go func() { isTest := os.Getenv("ISTEST") if strings.Compare(strings.ToLower(strings.TrimSpace(isTest)), "true") == 0 { @@ -1541,6 +1601,10 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } Log("OMSEndpoint %s", OMSEndpoint) + IsAADMSIAuthMode = false + if strings.Compare(strings.ToLower(os.Getenv(AADMSIAuthMode)), "true") == 0 { + Log("AAD MSI Auth Mode Configured") + } ResourceID = os.Getenv(envAKSResourceID) if len(ResourceID) > 0 { diff --git a/source/plugins/go/src/utils.go b/source/plugins/go/src/utils.go index 3fe5c6d0e..02d30607e 100644 --- a/source/plugins/go/src/utils.go +++ b/source/plugins/go/src/utils.go @@ -63,27 +63,32 @@ func ReadConfiguration(filename string) (map[string]string, error) { // CreateHTTPClient used to create the client for sending post requests to OMSEndpoint func CreateHTTPClient() { - certFilePath := PluginConfiguration["cert_file_path"] - keyFilePath := PluginConfiguration["key_file_path"] - if IsWindows == false { - certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) - keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) - } - cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) - if err != nil { - message := fmt.Sprintf("Error when loading cert %s", err.Error()) - SendException(message) - time.Sleep(30 * time.Second) - Log(message) - log.Fatalf("Error when loading cert %s", err.Error()) - } + var transport *http.Transport + if IsAADMSIAuthMode { + transport = &http.Transport{} + } else { + certFilePath := PluginConfiguration["cert_file_path"] + keyFilePath := PluginConfiguration["key_file_path"] + if IsWindows == false { + certFilePath = fmt.Sprintf(certFilePath, WorkspaceID) + keyFilePath = fmt.Sprintf(keyFilePath, WorkspaceID) + } + cert, err := tls.LoadX509KeyPair(certFilePath, keyFilePath) + if err != nil { + message := fmt.Sprintf("Error when loading cert %s", err.Error()) + SendException(message) + time.Sleep(30 * time.Second) + Log(message) + log.Fatalf("Error when loading cert %s", err.Error()) + } - tlsConfig := &tls.Config{ - Certificates: []tls.Certificate{cert}, - } + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{cert}, + } - tlsConfig.BuildNameToCertificate() - transport := &http.Transport{TLSClientConfig: tlsConfig} + tlsConfig.BuildNameToCertificate() + transport = &http.Transport{TLSClientConfig: tlsConfig} + } // set the proxy if the proxy configured if ProxyEndpoint != "" { proxyEndpointUrl, err := url.Parse(ProxyEndpoint) diff --git a/source/plugins/ruby/ApplicationInsightsUtility.rb b/source/plugins/ruby/ApplicationInsightsUtility.rb index 31f9503cd..eaa1d903d 100644 --- a/source/plugins/ruby/ApplicationInsightsUtility.rb +++ b/source/plugins/ruby/ApplicationInsightsUtility.rb @@ -21,6 +21,8 @@ class ApplicationInsightsUtility @@EnvApplicationInsightsEndpoint = "APPLICATIONINSIGHTS_ENDPOINT" @@EnvControllerType = "CONTROLLER_TYPE" @@EnvContainerRuntime = "CONTAINER_RUNTIME" + @@EnvAADMSIAuthMode = "AAD_MSI_AUTH_MODE" + @@isWindows = false @@hostName = (OMS::Common.get_hostname) @@os_type = ENV["OS_TYPE"] @@ -82,7 +84,12 @@ def initializeUtility() isProxyConfigured = false $log.info("proxy is not configured") end - + aadAuthMSIMode = ENV[@@EnvAADMSIAuthMode] + if !aadAuthMSIMode.nil? && !aadAuthMSIMode.empty? && aadAuthMSIMode.downcase == "true".downcase + @@CustomProperties["aadAuthMSIMode"] = "true" + else + @@CustomProperties["aadAuthMSIMode"] = "false" + end #Check if telemetry is turned off telemetryOffSwitch = ENV["DISABLE_TELEMETRY"] if telemetryOffSwitch && !telemetryOffSwitch.nil? && !telemetryOffSwitch.empty? && telemetryOffSwitch.downcase == "true".downcase diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index c037c99f6..0971a9b96 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -103,5 +103,28 @@ class Constants #Pod Statuses POD_STATUS_TERMINATING = "Terminating" + # Data type ids + CONTAINER_INVENTORY_DATA_TYPE = "CONTAINER_INVENTORY_BLOB" + CONTAINER_NODE_INVENTORY_DATA_TYPE = "CONTAINER_NODE_INVENTORY_BLOB" + PERF_DATA_TYPE = "LINUX_PERF_BLOB" + INSIGHTS_METRICS_DATA_TYPE = "INSIGHTS_METRICS_BLOB" + KUBE_SERVICES_DATA_TYPE = "KUBE_SERVICES_BLOB" + KUBE_POD_INVENTORY_DATA_TYPE = "KUBE_POD_INVENTORY_BLOB" + KUBE_NODE_INVENTORY_DATA_TYPE = "KUBE_NODE_INVENTORY_BLOB" + KUBE_PV_INVENTORY_DATA_TYPE = "KUBE_PV_INVENTORY_BLOB" + KUBE_EVENTS_DATA_TYPE = "KUBE_EVENTS_BLOB" + KUBE_MON_AGENT_EVENTS_DATA_TYPE = "KUBE_MON_AGENT_EVENTS_BLOB" + KUBE_HEALTH_DATA_TYPE = "KUBE_HEALTH_BLOB" + CONTAINERLOGV2_DATA_TYPE = "CONTAINERINSIGHTS_CONTAINERLOGV2" + CONTAINERLOG_DATA_TYPE = "CONTAINER_LOG_BLOB" + + #ContainerInsights Extension (AMCS) + CI_EXTENSION_NAME = "ContainerInsights" + CI_EXTENSION_VERSION = "1" + #Current CI extension config size is ~5KB and going with 20KB to handle any future scenarios + CI_EXTENSION_CONFIG_MAX_BYTES = 20480 + ONEAGENT_FLUENT_SOCKET_NAME = "/var/run/mdsd/default_fluent.socket" + #Tag prefix for output stream + EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX = "dcr-" end diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index d491f17c2..bd42260e2 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -4,7 +4,8 @@ require 'fluent/plugin/filter' -module Fluent::Plugin +module Fluent::Plugin + require_relative 'extension_utils' require 'logger' require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } @@ -91,6 +92,13 @@ def filter_stream(tag, es) begin new_es = Fluent::MultiEventStream.new time = Time.now + if ExtensionUtils.isAADMSIAuthMode() + $log.info("filter_health_model_builder::enumerate: AAD AUTH MSI MODE") + if !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @rewrite_tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_HEALTH_DATA_TYPE) + end + $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") + end if tag.start_with?("kubehealth.DaemonSet.Node") node_records = [] diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index b3f9bd08b..da42e4560 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -20,7 +20,8 @@ def initialize require_relative "CAdvisorMetricsAPIClient" require_relative "oms_common" require_relative "omslog" - require_relative "constants" + require_relative "constants" + require_relative "extension_utils" end config_param :run_interval, :time, :default => 60 @@ -68,6 +69,17 @@ def enumerate() eventStream.add(time, record) if record end + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream router.emit_stream(@containerhealthtag, eventStream) if eventStream diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index eebf422d6..6de1b0bb8 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -17,7 +17,8 @@ def initialize require_relative "ApplicationInsightsUtility" require_relative "omslog" require_relative "CAdvisorMetricsAPIClient" - require_relative "kubernetes_container_inventory" + require_relative "kubernetes_container_inventory" + require_relative "extension_utils" end config_param :run_interval, :time, :default => 60 @@ -57,6 +58,13 @@ def enumerate eventStream = Fluent::MultiEventStream.new hostName = "" $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + $log.info("in_container_inventory::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end begin containerRuntimeEnv = ENV["CONTAINER_RUNTIME"] $log.info("in_container_inventory::enumerate : container runtime : #{containerRuntimeEnv}") diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index 6f65dab92..f2d35be7b 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -18,6 +18,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -86,6 +87,13 @@ def enumerate newEventQueryState = [] @eventsCount = 0 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_events::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_EVENTS_DATA_TYPE) + end + $log.info("in_kube_events::enumerate: using kubeevents tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index ffc11de55..6018d5f34 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -35,7 +35,8 @@ def initialize require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" - require_relative "omslog" + require_relative "omslog" + require_relative "extension_utils" @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" @@ -111,6 +112,25 @@ def enumerate @nodeInventoryE2EProcessingLatencyMs = 0 nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") + if !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @ContainerNodeInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) + end + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 5598602cd..f3a2718e7 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -27,6 +27,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for updating defaults # this configurable via configmap @@ -108,6 +109,29 @@ def enumerate(podList = nil) serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 podInventoryStartTime = (Time.now.to_f * 1000).to_i + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) + end + if !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) + end + if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 40eebac8a..6a95ccd4f 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -20,6 +20,7 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" + require_relative "extension_utils" # Response size is around 1500 bytes per PV @PV_CHUNK_SIZE = "5000" @@ -62,6 +63,12 @@ def enumerate @pvTypeToCountHash = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kube_pvinventory::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_PV_INVENTORY_DATA_TYPE) + end + end continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index 182c3ffc1..dd875862e 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -22,6 +22,7 @@ def initialize require_relative "omslog" require_relative "ApplicationInsightsUtility" require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -83,6 +84,12 @@ def enumerate #set the running total for this batch to 0 @deploymentsRunningTotal = 0 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kubestate_deployments::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index 8f60bfb72..b1e112f45 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -18,7 +18,8 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "ApplicationInsightsUtility" - require_relative "constants" + require_relative "constants" + require_relative "extension_utils" # refer tomlparser-agent-config for defaults # this configurable via configmap @@ -79,6 +80,13 @@ def enumerate @hpaCount = 0 + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_kubestate_hpa::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_kubestate_hpa::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 9ab2474b1..77e84bc2b 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -20,7 +20,8 @@ def initialize require_relative "oms_common" require_relative "omslog" require_relative "constants" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + require_relative "extension_utils" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -58,6 +59,17 @@ def enumerate() timeDifference = (DateTime.now.to_time.to_i - @@winNodeQueryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 @@istestvar = ENV["ISTEST"] + if ExtensionUtils.isAADMSIAuthMode() + $log.info("in_win_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) + end + if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) + end + $log.info("in_win_cadvisor_perf::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_win_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + end #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 8e80fb753..247bc01b2 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -21,6 +21,9 @@ def initialize require_relative "proxy_utils" @@token_resource_url = "https://monitoring.azure.com/" + # AAD auth supported only in public cloud and handle other clouds when enabled + # this is unified new token audience for LA AAD MSI auth & metrics + @@token_resource_audience = "https://monitor.azure.com/" @@grant_type = "client_credentials" @@azure_json_path = "/etc/kubernetes/host/azure.json" @@post_request_url_template = "https://%{aks_region}.monitoring.azure.com%{aks_resource_id}/metrics" @@ -28,6 +31,8 @@ def initialize # msiEndpoint is the well known endpoint for getting MSI authentications tokens @@msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&client_id=%{user_assigned_client_id}&resource=%{resource}" + # IMDS msiEndpoint for AAD MSI Auth is the proxy endpoint whcih serves the MSI auth tokens with resource claim + @@imds_msi_endpoint_template = "http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=%{resource}" @@user_assigned_client_id = ENV["USER_ASSIGNED_IDENTITY_CLIENT_ID"] @@plugin_name = "AKSCustomMetricsMDM" @@ -124,7 +129,13 @@ def start @parsed_token_uri = URI.parse(aad_token_url) else @useMsi = true - msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } + if !@@user_assigned_client_id.nil? && !@@user_assigned_client_id.empty? + msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } + else + # in case of msi auth user_assigned_client_id will be empty + @log.info "using aad msi auth" + msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience } + end @parsed_token_uri = URI.parse(msi_endpoint) end diff --git a/source/plugins/utils/extension.rb b/source/plugins/utils/extension.rb new file mode 100644 index 000000000..1b1844149 --- /dev/null +++ b/source/plugins/utils/extension.rb @@ -0,0 +1,77 @@ +require "socket" +require "msgpack" +require "securerandom" +require "singleton" +require_relative "omslog" +require_relative "constants" +require_relative "ApplicationInsightsUtility" + + +class Extension + include Singleton + + def initialize + @cache = {} + @cache_lock = Mutex.new + $log.info("Extension::initialize complete") + end + + def get_output_stream_id(datatypeId) + @cache_lock.synchronize { + if @cache.has_key?(datatypeId) + return @cache[datatypeId] + else + @cache = get_config() + return @cache[datatypeId] + end + } + end + + private + def get_config() + extConfig = Hash.new + $log.info("Extension::get_config start ...") + begin + clientSocket = UNIXSocket.open(Constants::ONEAGENT_FLUENT_SOCKET_NAME) + requestId = SecureRandom.uuid.to_s + requestBodyJSON = { "Request" => "AgentTaggedData", "RequestId" => requestId, "Tag" => Constants::CI_EXTENSION_NAME, "Version" => Constants::CI_EXTENSION_VERSION }.to_json + $log.info("sending request with request body: #{requestBodyJSON}") + requestBodyMsgPack = requestBodyJSON.to_msgpack + clientSocket.write(requestBodyMsgPack) + clientSocket.flush + $log.info("reading the response from fluent socket: #{Constants::ONEAGENT_FLUENT_SOCKET_NAME}") + resp = clientSocket.recv(Constants::CI_EXTENSION_CONFIG_MAX_BYTES) + if !resp.nil? && !resp.empty? + $log.info("successfully read the extension config from fluentsocket and number of bytes read is #{resp.length}") + respJSON = JSON.parse(resp) + taggedData = respJSON["TaggedData"] + if !taggedData.nil? && !taggedData.empty? + taggedAgentData = JSON.parse(taggedData) + extensionConfigurations = taggedAgentData["extensionConfigurations"] + if !extensionConfigurations.nil? && !extensionConfigurations.empty? + extensionConfigurations.each do |extensionConfig| + outputStreams = extensionConfig["outputStreams"] + if !outputStreams.nil? && !outputStreams.empty? + outputStreams.each do |datatypeId, streamId| + $log.info("datatypeId: #{datatypeId}, streamId: #{streamId}") + extConfig[datatypeId] = streamId + end + else + $log.info("received outputStreams is either nil or empty") + end + end + else + $log.info("received extensionConfigurations from fluentsocket is either nil or empty") + end + end + end + rescue => errorStr + $log.warn("Extension::get_config failed: #{errorStr}") + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + ensure + clientSocket.close unless clientSocket.nil? + end + $log.info("Extension::get_config complete ...") + return extConfig + end +end diff --git a/source/plugins/utils/extension_utils.rb b/source/plugins/utils/extension_utils.rb new file mode 100644 index 000000000..5d439c6b2 --- /dev/null +++ b/source/plugins/utils/extension_utils.rb @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "extension" + +class ExtensionUtils + class << self + def getOutputStreamId(dataType) + outputStreamId = "" + begin + if !dataType.nil? && !dataType.empty? + outputStreamId = Extension.instance.get_output_stream_id(dataType) + $log.info("ExtensionUtils::getOutputStreamId: got streamid: #{outputStreamId} for datatype: #{dataType}") + else + $log.warn("ExtensionUtils::getOutputStreamId: dataType shouldnt be nil or empty") + end + rescue => errorStr + $log.warn("ExtensionUtils::getOutputStreamId: failed with an exception: #{errorStr}") + end + return outputStreamId + end + def isAADMSIAuthMode() + return !ENV["AAD_MSI_AUTH_MODE"].nil? && !ENV["AAD_MSI_AUTH_MODE"].empty? && ENV["AAD_MSI_AUTH_MODE"].downcase == "true" + end + end +end From 679038ba1164814be03f999d4225d63898f3a693 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 13 Jun 2021 23:09:52 -0700 Subject: [PATCH 02/28] use existing envvars --- kubernetes/windows/main.ps1 | 11 +---------- source/plugins/go/src/ingestion_token_utils.go | 8 ++++---- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index c28554da9..d0cd2368e 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -61,16 +61,7 @@ function Set-EnvironmentVariables { # Set CLOUD_ENVIRONMENT [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") - [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") - - - # Set Region - [System.Environment]::SetEnvironmentVariable("customRegion", $env:AKS_REGION, "Process") - [System.Environment]::SetEnvironmentVariable("customRegion", $env:AKS_REGION, "Machine") - - # Set resource ID - [System.Environment]::SetEnvironmentVariable("customResourceId", $env:AKS_RESOURCE_ID, "Process") - [System.Environment]::SetEnvironmentVariable("customResourceId", $env:AKS_RESOURCE_ID, "Machine") + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 4dffd7974..8b1794e04 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -222,8 +222,8 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan channelId = "" apiVersion := "2020-08-01-preview" var amcs_endpoint *url.URL - resourceId := os.Getenv("customResourceId") - resourceRegion := os.Getenv("customRegion") + resourceId := os.Getenv("AKS_RESOURCE_ID") + resourceRegion := os.Getenv("AKS_REGION") MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations?platform=windows&api-version=%s", resourceRegion, resourceId, apiVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) @@ -318,8 +318,8 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann Log("Info getIngestionAuthToken: start") ingestionAuthToken = "" var amcs_endpoint *url.URL - resourceId := os.Getenv("customResourceId") - resourceRegion := os.Getenv("customRegion") + resourceId := os.Getenv("AKS_RESOURCE_ID") + resourceRegion := os.Getenv("AKS_REGION") MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") apiVersion := "2020-04-01-preview" amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=windows&api-version=%s", resourceRegion, resourceId, configurationId, channelId, apiVersion) From b0730e8036879b8997b7944470f9b1c929474b5c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 14 Jun 2021 13:28:20 -0700 Subject: [PATCH 03/28] fix imds token expiry interval --- source/plugins/go/src/ingestion_token_utils.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 8b1794e04..36b858722 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -95,7 +95,7 @@ func getIngestionToken() (authToken string, err error) { // check if the ingestion token has not been fetched yet or is expiring soon. If so then re-fetch it first. // TODO: re-fetch token in a new thread if it is about to expire (don't block the main thread) - if IMDSToken == "" || IMDSTokenExpiration <= time.Now().Unix()+30 { // refresh the token 30 seconds before it expires + if IMDSToken == "" || IMDSTokenExpiration <= time.Now().Unix()+ 30*60 { // refresh the token 30 minutes before it expires IMDSToken, IMDSTokenExpiration, err = getAccessTokenFromIMDS(true) //TODO: read from an env variable? Or maybe from OS? if err != nil { message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) @@ -116,7 +116,7 @@ func getIngestionToken() (authToken string, err error) { } } - if IngestionAuthToken == "" || IngestionAuthTokenExpiration <= time.Now().Unix()+10 { + if IngestionAuthToken == "" || IngestionAuthTokenExpiration <= time.Now().Unix()+30*60 { IngestionAuthToken, IngestionAuthTokenExpiration, err = getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) if err != nil { message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) From e2ad0a393baff962f7db0ed38f7f05c43fd443b7 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 00:07:29 -0700 Subject: [PATCH 04/28] refactor the windows agent ingestion token code --- .../plugins/go/src/ingestion_token_utils.go | 255 +++++++----------- .../go/src/ingestion_token_utils_test.go | 54 ---- source/plugins/go/src/oms.go | 50 ++-- 3 files changed, 126 insertions(+), 233 deletions(-) delete mode 100644 source/plugins/go/src/ingestion_token_utils_test.go diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 36b858722..8d65540e7 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -91,109 +91,18 @@ type IngestionTokenResponse struct { Ingestionauthtoken string `json:"ingestionAuthToken"` } -func getIngestionToken() (authToken string, err error) { - // check if the ingestion token has not been fetched yet or is expiring soon. If so then re-fetch it first. - // TODO: re-fetch token in a new thread if it is about to expire (don't block the main thread) - - if IMDSToken == "" || IMDSTokenExpiration <= time.Now().Unix()+ 30*60 { // refresh the token 30 minutes before it expires - IMDSToken, IMDSTokenExpiration, err = getAccessTokenFromIMDS(true) //TODO: read from an env variable? Or maybe from OS? - if err != nil { - message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) - Log(message) - SendException(message) - return "", err - } - } - - // ignore agent configuration expiring, the configuration and channel IDs will never change (without creating an agent restart) - if ConfigurationId == "" || ChannelId == "" { - ConfigurationId, ChannelId, _, err = getAgentConfiguration(IMDSToken) - if err != nil { - message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) - Log(message) - SendException(message) - return "", err - } - } - - if IngestionAuthToken == "" || IngestionAuthTokenExpiration <= time.Now().Unix()+30*60 { - IngestionAuthToken, IngestionAuthTokenExpiration, err = getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) - if err != nil { - message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) - Log(message) - SendException(message) - return "", err - } - } - - return IngestionAuthToken, nil -} - -func getAccessTokenFromIMDS(fromfile bool) (string, int64, error) { +func getAccessTokenFromIMDS() (string, int64, error) { Log("Info getAccessTokenFromIMDS: start") - MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") imdsAccessToken := "" - var responseBytes []byte var err error - if fromfile { - // only used in windows - responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) - if err != nil { - Log("getAccessTokenFromIMDS: Could not read IMDS token from file", err) - return imdsAccessToken, 0, err - } - } else { - var msi_endpoint *url.URL - msi_endpoint, err := url.Parse("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://" + MCS_ENDPOINT + "/") - if err != nil { - Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error()) - return imdsAccessToken, 0, err - } - req, err := http.NewRequest("GET", msi_endpoint.String(), nil) - if err != nil { - Log("getAccessTokenFromIMDS: Error creating HTTP request: %s", err.Error()) - return imdsAccessToken, 0, err - } - req.Header.Add("Metadata", "true") - - // Call managed services for Azure resources token endpoint - var resp *http.Response = nil - for i := 0; i < 3; i++ { - // client := &http.Client{} - resp, err = HTTPClient.Do(req) - if err != nil { - message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s", err.Error()) - Log(message) - SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. - continue - } - //TODO: is this the best place to defer closing the response body? - defer resp.Body.Close() - - Log("getAccessTokenFromIMDS: IMDS Response Status: %d", resp.StatusCode) - if resp.StatusCode != 200 { - message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code : %d", resp.StatusCode) - Log(message) - SendException(message) - continue - } - break // call succeeded, don't retry any more - } - if resp == nil { - Log("getAccessTokenFromIMDS: IMDS Request ran out of retries") - return imdsAccessToken, 0, err - } - - // Pull out response body - responseBytes, err = ioutil.ReadAll(resp.Body) - if err != nil { - Log("getAccessTokenFromIMDS: Error reading response body : %s", err.Error()) - return imdsAccessToken, 0, err - } + responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) + if err != nil { + Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) + return imdsAccessToken, 0, err } - + // Unmarshall response body into struct var imdsResponse IMDSResponse err = json.Unmarshal(responseBytes, &imdsResponse) @@ -205,21 +114,17 @@ func getAccessTokenFromIMDS(fromfile bool) (string, int64, error) { expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) if err != nil { - Log("Error reading expiration time from response header body: %s", err.Error()) + Log("Error reading expiration time from IMDS response: %s", err.Error()) return imdsAccessToken, 0, err } - - Log("IMDS Token obtained: %s", imdsAccessToken) - Log("Info getAccessTokenFromIMDS: end") - return imdsAccessToken, expiration, nil } -func getAgentConfiguration(imdsAccessToken string) (configurationId string, channelId string, expiration int64, err error) { +func getAgentConfiguration(imdsAccessToken string) (configurationId string, channelId string, err error) { Log("Info getAgentConfiguration: start") configurationId = "" - channelId = "" + channelId = "" apiVersion := "2020-08-01-preview" var amcs_endpoint *url.URL resourceId := os.Getenv("AKS_RESOURCE_ID") @@ -234,56 +139,40 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan if err != nil { message := fmt.Sprintf("Error creating HTTP request for AMCS endpoint: %s", err.Error()) Log(message) - return configurationId, channelId, 0, err + return configurationId, channelId, err } - - // add authorization header to the req - req.Header.Add("Authorization", bearer) + req.Header.Set("Authorization", bearer) var resp *http.Response = nil - for i := 0; i < 3; i++ { - // Call managed services for Azure resources token endpoint - // client := &http.Client{} + for i := 0; i < 3; i++ { resp, err = HTTPClient.Do(req) if err != nil { - message := fmt.Sprintf("Error calling amcs endpoint: %s", err.Error()) + message := fmt.Sprintf("Error calling AMCS endpoint: %s", err.Error()) Log(message) SendException(message) continue } - - //TODO: is this the best place to defer closing the response body? - defer resp.Body.Close() - + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) if resp.StatusCode != 200 { message := fmt.Sprintf("getAgentConfiguration Request failed with an error code : %d", resp.StatusCode) Log(message) SendException(message) continue - } + } break // call succeeded, don't retry any more } if resp == nil { Log("getAgentConfiguration Request ran out of retries") - return configurationId, channelId, 0, err - } - - // get content timeout - expiration, err = getExpirationFromAmcsResponse(resp.Header) - if err != nil { - Log("getAgentConfiguration failed to parse auth token timeout") - return configurationId, channelId, 0, err + return configurationId, channelId, err } - Log(fmt.Sprintf("getAgentConfiguration: DCR expires at %d seconds (unix timestamp)", expiration)) - - // Pull out response body - //TODO: does responseBytes need to be closed? responseBytes, err := ioutil.ReadAll(resp.Body) if err != nil { Log("Error reading response body from AMCS API call : %s", err.Error()) - return configurationId, channelId, expiration, err + return configurationId, channelId, err } // Unmarshall response body into struct @@ -291,32 +180,32 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan err = json.Unmarshal(responseBytes, &agentConfiguration) if err != nil { Log("Error unmarshalling the response: %s", err.Error()) - return configurationId, channelId, expiration, err + return configurationId, channelId, err } if len(agentConfiguration.Configurations) == 0 { Log("Received empty agentConfiguration.Configurations array") - return configurationId, channelId, expiration, err + return configurationId, channelId, err } if len(agentConfiguration.Configurations[0].Content.Channels) == 0 { Log("Received empty agentConfiguration.Configurations[0].Content.Channels") - return configurationId, channelId, expiration, err + return configurationId, channelId, err } configurationId = agentConfiguration.Configurations[0].Configurationid channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) - Log("Info getAgentConfiguration: end") - return configurationId, channelId, expiration, nil + return configurationId, channelId, nil } func getIngestionAuthToken(imdsAccessToken string, configurationId string, channelId string) (ingestionAuthToken string, expiration int64, err error) { Log("Info getIngestionAuthToken: start") ingestionAuthToken = "" + refreshInterval = 0 var amcs_endpoint *url.URL resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") @@ -349,9 +238,10 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann resp = nil continue } - - //TODO: is this the best place to defer closing the response body? - defer resp.Body.Close() + + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) if resp.StatusCode != 200 { @@ -368,19 +258,12 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann Log("ran out of retries calling AMCS for ingestion token") return ingestionAuthToken, 0, err } - - expiration, err = getExpirationFromAmcsResponse(resp.Header) - if err != nil { - Log("getIngestionAuthToken failed to parse auth token expiration time") - return ingestionAuthToken, 0, err - } - Log("getIngestionAuthToken: token times out at time %d (unix timestamp)", expiration) - + // Pull out response body responseBytes, err := ioutil.ReadAll(resp.Body) if err != nil { Log("Error reading response body from AMCS Ingestion API call : %s", err.Error()) - return ingestionAuthToken, expiration, err + return ingestionAuthToken, refreshInterval, err } // Unmarshall response body into struct @@ -388,23 +271,26 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann err = json.Unmarshal(responseBytes, &ingestionTokenResponse) if err != nil { Log("Error unmarshalling the response: %s", err.Error()) - return ingestionAuthToken, expiration, err + return ingestionAuthToken, refreshInterval, err } ingestionAuthToken = ingestionTokenResponse.Ingestionauthtoken - // Log("ingestionAuthToken obtained: %s", ingestionAuthToken) + refreshInterval, err = getTokenRefreshIntervalFromAmcsResponse(resp.Header) + if err != nil { + Log("getIngestionAuthToken failed to parse max-age response header") + return ingestionAuthToken, 0, err + } + Log("getIngestionAuthToken: refresh interval %d seconds", refreshInterval) Log("Info getIngestionAuthToken: end") - return ingestionAuthToken, expiration, nil + return ingestionAuthToken, refreshInterval, nil } var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`) -func getExpirationFromAmcsResponse(header http.Header) (bestBeforeTime int64, err error) { - // Get a timeout from a AMCS HTTP response header - timeout := 0 - +func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (bestBeforeTime int64, err error) { + refreshInterval := 0 cacheControlHeader, valueInMap := header["Cache-Control"] if !valueInMap { return 0, errors.New("Cache-Control not in passed header") @@ -413,17 +299,66 @@ func getExpirationFromAmcsResponse(header http.Header) (bestBeforeTime int64, er for _, entry := range cacheControlHeader { match := cacheControlHeaderRegex.FindStringSubmatch(entry) if len(match) == 2 { - timeout, err = strconv.Atoi(match[1]) + refreshInterval, err = strconv.Atoi(match[1]) if err != nil { Log("getIngestionAuthToken: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) return 0, err - } - - bestBeforeTime = time.Now().Unix() + int64(timeout) - - return bestBeforeTime, nil + } + return refreshInterval, nil } } return 0, errors.New("didn't find timeout in header") } + +func refreshIngestionAuthToken() { + for ; true; <-IngestionAuthTokenRefreshTicker.C { + var err error + if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // refresh the token 1 hr expiry + imdsToken, imdsTokenExpiry, err = getAccessTokenFromIMDS() + if err != nil { + message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) + Log(message) + SendException(message) + } else { + IMDSToken = imdsToken + IMDSTokenExpiration = imdsTokenExpiry + } + } + if IMDSToken == "" { + message := "IMDSToken is empty" + Log(message) + SendException(message) + continue + } + // ignore agent configuration expiring, the configuration and channel IDs will never change (without creating an agent restart) + if ConfigurationId == "" || ChannelId == "" { + ConfigurationId, ChannelId, err = getAgentConfiguration(IMDSToken) + if err != nil { + message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) + Log(message) + SendException(message) + continue + } + } + if IMDSToken == "" ConfigurationId == "" || ChannelId == "" { + message := "IMDSToken or ConfigurationId or ChannelId empty" + Log(message) + SendException(message) + continue + } + ingestionAuthToken, refreshIntervalInSeconds, err = getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) + if err != nil { + message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) + Log(message) + SendException(message) + continue + } + IngestionAuthTokenUpdateMutex.Lock() + ODSIngestionAuthToken = ingestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() + if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds { + ticker.Reset(time.Second * refreshIntervalInSeconds) + } + } +} diff --git a/source/plugins/go/src/ingestion_token_utils_test.go b/source/plugins/go/src/ingestion_token_utils_test.go deleted file mode 100644 index f987a2408..000000000 --- a/source/plugins/go/src/ingestion_token_utils_test.go +++ /dev/null @@ -1,54 +0,0 @@ -package main - -import ( - "net/http" - "testing" - "time" -) - -// TODO: add unit tests for retry code. - -type headerElementTestType = struct { - in http.Header - want int64 - wantError bool -} - -func TestGetTimeoutFromAmcsResponse(t *testing.T) { - - var cases []headerElementTestType - - cases = append(cases, headerElementTestType{http.Header{ - "Request-Context": {"appId=cid-v1:*******-*********-*********"}, - "Api-Supported-Versions": {"2019-11-02-preview", "2020-04-01-preview", "2020-08-01-preview"}, - "Date": {"Thu, 06 May 2021 21:17:42 GMT"}, - "Cache-Control": {"public", "max-age=3600"}, - "Content-Type": {"application/json; charset=utf-8"}, - "Vary": {"Accept-Encoding"}, - "Server": {"Microsoft-HTTPAPI/2.0"}, - }, 3600, false}) - - cases = append(cases, headerElementTestType{http.Header{ - "Request-Context": {"appId=cid-v1:*******-*********-*********"}, - "Api-Supported-Versions": {"2019-11-02-preview", "2020-04-01-preview", "2020-08-01-preview"}, - "Date": {"Thu, 06 May 2021 21:17:42 GMT"}, - "Cache-Control": {"public", "max-age=300"}, - "Content-Type": {"application/json; charset=utf-8"}, - "Vary": {"Accept-Encoding"}, - "Server": {"Microsoft-HTTPAPI/2.0"}, - }, 300, false}) - - cases = append(cases, headerElementTestType{http.Header{ - "Request-Context": {"appId=cid-v1:*******-*********-*********"}, - }, 0, true}) - - for _, c := range cases { - got, err := getExpirationFromAmcsResponse(c.in) - if (err != nil) != c.wantError { - t.Errorf("getTimeoutFromAmcsResponse() did not return correct error, expected: %v, got %s", c.wantError, err) - } else if err == nil && (got < c.want+time.Now().Unix()-1 || got > c.want+time.Now().Unix()) { - // Adding a range of 1 second should be enough to keep this test from being flaky, but it might have to be re-thought. - t.Errorf("getTimeoutFromAmcsResponse() (%v) == %d, want %d", c.in, got, c.want+time.Now().Unix()) - } - } -} diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 814f411ba..46ddda515 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -89,6 +89,7 @@ const IPName = "ContainerInsights" const defaultContainerInventoryRefreshInterval = 60 const kubeMonAgentConfigEventFlushInterval = 60 +const defaultIngestionAuthTokenRefreshIntervalSeconds = 3600 //Eventsource name in mdsd const MdsdContainerLogSourceName = "ContainerLogSource" @@ -202,6 +203,10 @@ var ( EventHashUpdateMutex = &sync.Mutex{} // parent context used by ADX uploader ParentContext = context.Background() + // IngestionAuthTokenUpdateMutex read and write mutex access to the container id set + IngestionAuthTokenUpdateMutex = &sync.Mutex{} + // ODSIngestionAuthToken for windows agent AAD MSI Auth + ODSIngestionAuthToken string ) var ( @@ -209,6 +214,8 @@ var ( ContainerImageNameRefreshTicker *time.Ticker // KubeMonAgentConfigEventsSendTicker to send config events every hour KubeMonAgentConfigEventsSendTicker *time.Ticker + // IngestionAuthTokenRefreshTicker to refresh ingestion token + IngestionAuthTokenRefreshTicker *time.Ticker ) var ( @@ -773,13 +780,12 @@ func flushKubeMonAgentEventRecords() { } if IsAADMSIAuthMode == true { - ingestionAuthToken, err := getIngestionToken() - if err != nil { - Log("failed to get ODS Ingestion Auth Token (even after retries)") - message := fmt.Sprintf("Error string: %s \n", err.Error()) - Log(message) - } - // add authorization header to the req + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken = ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.UnLock() + if ingestionAuthToken == "" { + Log("ODS Ingestion Auth Token is empty") + } req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } @@ -1007,10 +1013,11 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int req.Header.Set("x-ms-AzureResourceId", ResourceID) } if IsAADMSIAuthMode == true { - ingestionAuthToken, err := getIngestionToken() - if err != nil { - Log("failed to get ODS Ingestion Auth Token (even after retries)") - message := fmt.Sprintf("Error string: %s \n", err.Error()) + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken = ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.UnLock() + if ingestionAuthToken == "" { + message := "Ingestion Auth Token is empty" Log(message) return output.FLB_RETRY } @@ -1392,14 +1399,13 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { } if IsAADMSIAuthMode == true { - ingestionAuthToken, err := getIngestionToken() - if err != nil { - Log("failed to get ODS Ingestion Auth Token (even after retries)") - message := fmt.Sprintf("Error string: %s \n", err.Error()) - Log(message) + IngestionAuthTokenUpdateMutex.Lock() + ingestionAuthToken = ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.UnLock() + if ingestionAuthToken == "" { + Log("ODS Ingestion Auth Token is empty") return output.FLB_RETRY } - // add authorization header to the req req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } @@ -1776,5 +1782,11 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName -} \ No newline at end of file + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName + + if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { + Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) + go refreshIngestionAuthToken() + } +} From c2065535093a9d4c9f0ea59e5a8f040cd7ef889e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 08:03:54 -0700 Subject: [PATCH 05/28] code cleanup --- kubernetes/windows/main.ps1 | 22 ++++-- .../plugins/go/src/ingestion_token_utils.go | 79 ++++++++++++++++--- 2 files changed, 82 insertions(+), 19 deletions(-) diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index d0cd2368e..9205fdf6c 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -233,16 +233,22 @@ function Set-EnvironmentVariables { else { Write-Host "Failed to set environment variable HOSTNAME for target 'machine' since it is either null or empty" } + # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH environment variable $isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH", "process") - if (![string]::IsNullOrEmpty($isAADMSIAuth) -and ($isAADMSIAuth -eq "true")) { - [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", "true", "Process") - [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", "true", "Machine") - Write-Host "Using AAD MSI auth" - } - else { - Write-Host "Using LA Legacy Auth" - } + if (![string]::IsNullOrEmpty($isAADMSIAuth)) { + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Process") + [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Machine") + Write-Host "Successfully set environment variable AAD_MSI_AUTH_MODE - $($isAADMSIAuth) for target 'machine'..." + } + + # check if use token proxy endpoint set via USE_IMDS_TOKEN_PROXY_END_POINT environment variable + $useIMDSTokenProxyEndpoint = [System.Environment]::GetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", "process") + if (![string]::IsNullOrEmpty($useIMDSTokenProxyEndpoint)) { + [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Process") + [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Machine") + Write-Host "Successfully set environment variable USE_IMDS_TOKEN_PROXY_END_POINT - $($useIMDSTokenProxyEndpoint) for target 'machine'..." + } # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 8d65540e7..fa8db26dd 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -92,17 +92,72 @@ type IngestionTokenResponse struct { } func getAccessTokenFromIMDS() (string, int64, error) { - Log("Info getAccessTokenFromIMDS: start") + Log("Info getAccessTokenFromIMDS: start") + useIMDSTOkenProxyEndPoint := os.Getenv("USE_IMDS_TOKEN_PROXY_END_POINT") imdsAccessToken := "" var responseBytes []byte var err error - - responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) - if err != nil { - Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) - return imdsAccessToken, 0, err - } + if (useIMDSTOkenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTOkenProxyEndPoint), "true") == 0) { + Log("Info Reading IMDS Access Token from IMDS Token proxy endpoint") + MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + var msi_endpoint *url.URL + msi_endpoint, err := url.Parse("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://" + MCS_ENDPOINT + "/") + if err != nil { + Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error()) + return imdsAccessToken, 0, err + } + req, err := http.NewRequest("GET", msi_endpoint.String(), nil) + if err != nil { + Log("getAccessTokenFromIMDS: Error creating HTTP request: %s", err.Error()) + return imdsAccessToken, 0, err + } + req.Header.Add("Metadata", "true") + + // Call managed services for Azure resources token endpoint + var resp *http.Response = nil + for i := 0; i < 3; i++ { + // client := &http.Client{} + resp, err = HTTPClient.Do(req) + if err != nil { + message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s", err.Error()) + Log(message) + SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. + continue + } + //TODO: is this the best place to defer closing the response body? + defer resp.Body.Close() + + Log("getAccessTokenFromIMDS: IMDS Response Status: %d", resp.StatusCode) + if resp.StatusCode != 200 { + message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code : %d", resp.StatusCode) + Log(message) + SendException(message) + continue + } + break // call succeeded, don't retry any more + } + if resp == nil { + Log("getAccessTokenFromIMDS: IMDS Request ran out of retries") + return imdsAccessToken, 0, err + } + + // Pull out response body + responseBytes, err = ioutil.ReadAll(resp.Body) + if err != nil { + Log("getAccessTokenFromIMDS: Error reading response body : %s", err.Error()) + return imdsAccessToken, 0, err + } + + } else { + Log("Info Reading IMDS Access Token from file : %s", IMDSTokenPathForWindows) + responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) + if err != nil { + Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) + return imdsAccessToken, 0, err + } + } + // Unmarshall response body into struct var imdsResponse IMDSResponse err = json.Unmarshal(responseBytes, &imdsResponse) @@ -114,7 +169,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) if err != nil { - Log("Error reading expiration time from IMDS response: %s", err.Error()) + Log("Error parsing ExpiresOn field from IMDS response: %s", err.Error()) return imdsAccessToken, 0, err } Log("Info getAccessTokenFromIMDS: end") @@ -127,10 +182,11 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan channelId = "" apiVersion := "2020-08-01-preview" var amcs_endpoint *url.URL + osType := os.Getenv("OS_TYPE") resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations?platform=windows&api-version=%s", resourceRegion, resourceId, apiVersion) + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, resourceId, osType, apiVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) var bearer = "Bearer " + imdsAccessToken @@ -207,11 +263,12 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann ingestionAuthToken = "" refreshInterval = 0 var amcs_endpoint *url.URL + osType := os.Getenv("OS_TYPE") resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") apiVersion := "2020-04-01-preview" - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=windows&api-version=%s", resourceRegion, resourceId, configurationId, channelId, apiVersion) + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, resourceId, configurationId, channelId, osType, apiVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) var bearer = "Bearer " + imdsAccessToken @@ -353,7 +410,7 @@ func refreshIngestionAuthToken() { Log(message) SendException(message) continue - } + } IngestionAuthTokenUpdateMutex.Lock() ODSIngestionAuthToken = ingestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() From 5d1ee06f2440553dde5a1dec34bde22edbde98d9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 09:17:50 -0700 Subject: [PATCH 06/28] fix build errors --- .../plugins/go/src/ingestion_token_utils.go | 26 +++++++++++-------- source/plugins/go/src/oms.go | 12 ++++----- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index fa8db26dd..992426522 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -258,7 +258,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan return configurationId, channelId, nil } -func getIngestionAuthToken(imdsAccessToken string, configurationId string, channelId string) (ingestionAuthToken string, expiration int64, err error) { +func getIngestionAuthToken(imdsAccessToken string, configurationId string, channelId string) (ingestionAuthToken string, refreshInterval int64, err error) { Log("Info getIngestionAuthToken: start") ingestionAuthToken = "" refreshInterval = 0 @@ -346,8 +346,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`) -func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (bestBeforeTime int64, err error) { - refreshInterval := 0 +func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterval int64, err error) { cacheControlHeader, valueInMap := header["Cache-Control"] if !valueInMap { return 0, errors.New("Cache-Control not in passed header") @@ -356,11 +355,13 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (bestBeforeTime for _, entry := range cacheControlHeader { match := cacheControlHeaderRegex.FindStringSubmatch(entry) if len(match) == 2 { - refreshInterval, err = strconv.Atoi(match[1]) + interval := 0 + interval, err = strconv.Atoi(match[1]) if err != nil { Log("getIngestionAuthToken: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) return 0, err - } + } + refreshInterval = int64(interval) return refreshInterval, nil } } @@ -370,9 +371,8 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (bestBeforeTime func refreshIngestionAuthToken() { for ; true; <-IngestionAuthTokenRefreshTicker.C { - var err error if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // refresh the token 1 hr expiry - imdsToken, imdsTokenExpiry, err = getAccessTokenFromIMDS() + imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() if err != nil { message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) Log(message) @@ -388,8 +388,9 @@ func refreshIngestionAuthToken() { SendException(message) continue } + var err error // ignore agent configuration expiring, the configuration and channel IDs will never change (without creating an agent restart) - if ConfigurationId == "" || ChannelId == "" { + if ConfigurationId == "" || ChannelId == "" { ConfigurationId, ChannelId, err = getAgentConfiguration(IMDSToken) if err != nil { message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) @@ -398,13 +399,13 @@ func refreshIngestionAuthToken() { continue } } - if IMDSToken == "" ConfigurationId == "" || ChannelId == "" { + if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" { message := "IMDSToken or ConfigurationId or ChannelId empty" Log(message) SendException(message) continue } - ingestionAuthToken, refreshIntervalInSeconds, err = getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) + ingestionAuthToken, refreshIntervalInSeconds, err := getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) if err != nil { message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) Log(message) @@ -415,7 +416,10 @@ func refreshIngestionAuthToken() { ODSIngestionAuthToken = ingestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds { - ticker.Reset(time.Second * refreshIntervalInSeconds) + //TODO - use Reset when go version upgraded to 1.15 or up rather Stp() and NewTicker + //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) + IngestionAuthTokenRefreshTicker.Stop() + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) } } } diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 46ddda515..664060b76 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -781,8 +781,8 @@ func flushKubeMonAgentEventRecords() { if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() - ingestionAuthToken = ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.UnLock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { Log("ODS Ingestion Auth Token is empty") } @@ -1014,8 +1014,8 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int } if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() - ingestionAuthToken = ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.UnLock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { message := "Ingestion Auth Token is empty" Log(message) @@ -1400,8 +1400,8 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { if IsAADMSIAuthMode == true { IngestionAuthTokenUpdateMutex.Lock() - ingestionAuthToken = ODSIngestionAuthToken - IngestionAuthTokenUpdateMutex.UnLock() + ingestionAuthToken := ODSIngestionAuthToken + IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { Log("ODS Ingestion Auth Token is empty") return output.FLB_RETRY From 3586e3d7691bf63b825e1cf30eaa327fe1699983 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 11:34:17 -0700 Subject: [PATCH 07/28] code clean up --- .../plugins/go/src/ingestion_token_utils.go | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 992426522..6c35977ae 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -63,8 +63,7 @@ type AgentConfiguration struct { Extensionconfigurations struct { Containerinsights []struct { ID string `json:"id"` - Originids []string `json:"originIds"` - //TODO: make this a map so that if more types are added in the future it won't break. Also test if removing a type breaks the json deserialization + Originids []string `json:"originIds"` Outputstreams struct { LinuxPerfBlob string `json:"LINUX_PERF_BLOB"` ContainerInventoryBlob string `json:"CONTAINER_INVENTORY_BLOB"` @@ -116,8 +115,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { // Call managed services for Azure resources token endpoint var resp *http.Response = nil - for i := 0; i < 3; i++ { - // client := &http.Client{} + for i := 0; i < 3; i++ { resp, err = HTTPClient.Do(req) if err != nil { message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s", err.Error()) @@ -125,8 +123,10 @@ func getAccessTokenFromIMDS() (string, int64, error) { SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. continue } - //TODO: is this the best place to defer closing the response body? - defer resp.Body.Close() + + if resp != nil && resp.Body != nil { + defer resp.Body.Close() + } Log("getAccessTokenFromIMDS: IMDS Response Status: %d", resp.StatusCode) if resp.StatusCode != 200 { @@ -151,6 +151,10 @@ func getAccessTokenFromIMDS() (string, int64, error) { } else { Log("Info Reading IMDS Access Token from file : %s", IMDSTokenPathForWindows) + if _, err = os.Stat(IMDSTokenPathForWindows); os.IsNotExist(err) { + Log("getAccessTokenFromIMDS: IMDS token file doesnt exist: %s", err.Error()) + return imdsAccessToken, 0, err + } responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) if err != nil { Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) @@ -371,7 +375,7 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva func refreshIngestionAuthToken() { for ; true; <-IngestionAuthTokenRefreshTicker.C { - if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // refresh the token 1 hr expiry + if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // token valid 24 hrs and refresh token 1 hr before expiry imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() if err != nil { message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) @@ -416,7 +420,7 @@ func refreshIngestionAuthToken() { ODSIngestionAuthToken = ingestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds { - //TODO - use Reset when go version upgraded to 1.15 or up rather Stp() and NewTicker + //TODO - use Reset which is better when go version upgraded to 1.15 or up rather Stop() and NewTicker //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) IngestionAuthTokenRefreshTicker.Stop() IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) From bdb7037a0281c884861f89a694d148594f39d156 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 11:36:22 -0700 Subject: [PATCH 08/28] code clean up --- source/plugins/go/src/ingestion_token_utils.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 6c35977ae..d6d26dbcc 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -123,7 +123,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. continue } - + if resp != nil && resp.Body != nil { defer resp.Body.Close() } @@ -418,7 +418,7 @@ func refreshIngestionAuthToken() { } IngestionAuthTokenUpdateMutex.Lock() ODSIngestionAuthToken = ingestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() + IngestionAuthTokenUpdateMutex.Unlock() if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds { //TODO - use Reset which is better when go version upgraded to 1.15 or up rather Stop() and NewTicker //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) From ef73233a720450a1feb9e03d3e9684be8a227476 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 11:50:38 -0700 Subject: [PATCH 09/28] code clean up --- source/plugins/go/src/oms.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 664060b76..5fa8eb810 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -783,8 +783,8 @@ func flushKubeMonAgentEventRecords() { IngestionAuthTokenUpdateMutex.Lock() ingestionAuthToken := ODSIngestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() - if ingestionAuthToken == "" { - Log("ODS Ingestion Auth Token is empty") + if ingestionAuthToken == "" { + Log("Error::ODS Ingestion Auth Token is empty. Please check error log.") } req.Header.Set("Authorization", "Bearer "+ingestionAuthToken) } @@ -1017,7 +1017,7 @@ func PostTelegrafMetricsToLA(telegrafRecords []map[interface{}]interface{}) int ingestionAuthToken := ODSIngestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { - message := "Ingestion Auth Token is empty" + message := "Error::ODS Ingestion Auth Token is empty. Please check error log." Log(message) return output.FLB_RETRY } @@ -1403,7 +1403,7 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { ingestionAuthToken := ODSIngestionAuthToken IngestionAuthTokenUpdateMutex.Unlock() if ingestionAuthToken == "" { - Log("ODS Ingestion Auth Token is empty") + Log("Error::ODS Ingestion Auth Token is empty. Please check error log.") return output.FLB_RETRY } // add authorization header to the req From bed0042efaeff9ebb44c19afbd36f62f47b1b59d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 12:15:43 -0700 Subject: [PATCH 10/28] code clean up --- scripts/dcr-onboarding/ci-extension-dcr.json | 2 +- source/plugins/go/src/go.mod | 2 +- source/plugins/go/src/ingestion_token_utils.go | 2 +- source/plugins/go/src/oms.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/dcr-onboarding/ci-extension-dcr.json b/scripts/dcr-onboarding/ci-extension-dcr.json index 03eba1400..f3fbec79b 100644 --- a/scripts/dcr-onboarding/ci-extension-dcr.json +++ b/scripts/dcr-onboarding/ci-extension-dcr.json @@ -56,4 +56,4 @@ } ] } -} \ No newline at end of file +} diff --git a/source/plugins/go/src/go.mod b/source/plugins/go/src/go.mod index 3fd38a9bd..6fde1a32f 100644 --- a/source/plugins/go/src/go.mod +++ b/source/plugins/go/src/go.mod @@ -23,7 +23,7 @@ require ( github.com/philhofer/fwd v1.0.0 // indirect github.com/satori/go.uuid v1.2.1-0.20181028125025-b2ce2384e17b // indirect github.com/tinylib/msgp v1.1.2 - github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 // indirect + github.com/ugorji/go v1.1.2-0.20180813092308-00b869d2f4a5 golang.org/x/net v0.0.0-20200421231249-e086a090c8fd // indirect golang.org/x/time v0.0.0-20161028155119-f51c12702a4d // indirect gopkg.in/inf.v0 v0.9.0 // indirect diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index d6d26dbcc..013bc60a4 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -370,7 +370,7 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva } } - return 0, errors.New("didn't find timeout in header") + return 0, errors.New("didn't find max-age in response header") } func refreshIngestionAuthToken() { diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 5fa8eb810..9f4b18373 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -203,7 +203,7 @@ var ( EventHashUpdateMutex = &sync.Mutex{} // parent context used by ADX uploader ParentContext = context.Background() - // IngestionAuthTokenUpdateMutex read and write mutex access to the container id set + // IngestionAuthTokenUpdateMutex read and write mutex access for ODSIngestionAuthToken IngestionAuthTokenUpdateMutex = &sync.Mutex{} // ODSIngestionAuthToken for windows agent AAD MSI Auth ODSIngestionAuthToken string From e44ea9b39760e193a0648ec49277707aafae8a75 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 12:40:33 -0700 Subject: [PATCH 11/28] more refactoring --- kubernetes/linux/main.sh | 144 +++++++++++++++------------------------ 1 file changed, 56 insertions(+), 88 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 0e8eeea1d..e3c9532c5 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -450,104 +450,72 @@ echo "DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc +#skip imds lookup since not used either legacy or aad msi auth path +export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +cat /etc/mdsd.d/envmdsd | while read line; do + echo $line >> ~/.bashrc +done +source /etc/mdsd.d/envmdsd +MDSD_AAD_MSI_AUTH_ARGS="" # check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH export AAD_MSI_AUTH_MODE=false if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then - echo "*** activating oneagent in aad auth msi mode ***" + echo "*** activating oneagent in aad auth msi mode ***" + # msi auth specific args + MDSD_AAD_MSI_AUTH_ARGS="-a -A" export AAD_MSI_AUTH_MODE=true - echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - - cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - + echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="28230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - export MCS_ENDPOINT="handler.control.monitor.azure.com" - echo "export MCS_ENDPOINT=$MCS_ENDPOINT" >> ~/.bashrc - export AZURE_ENDPOINT="https://monitor.azure.com/" - echo "export AZURE_ENDPOINT=$AZURE_ENDPOINT" >> ~/.bashrc - export ADD_REGION_TO_MCS_ENDPOINT="true" - echo "export ADD_REGION_TO_MCS_ENDPOINT=$ADD_REGION_TO_MCS_ENDPOINT" >> ~/.bashrc - export ENABLE_MCS="true" - echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc - export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" - echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc - export MDSD_USE_LOCAL_PERSISTENCY="false" - echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc - #skip imds metadata lookup since we dont use - #legacy auth misnomer here - export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" - echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc - - source ~/.bashrc - - dpkg -l | grep mdsd | awk '{print $2 " " $3}' - - if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in aad auth msi mode in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc - source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd -a -A -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - else - echo "starting mdsd in aad auth msi mode in main container..." - # add -T 0xFFFF for full traces - mdsd -a -A -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - fi - + export MDSD_FLUENT_SOCKET_PORT="28230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + export MCS_ENDPOINT="handler.control.monitor.azure.com" + echo "export MCS_ENDPOINT=$MCS_ENDPOINT" >> ~/.bashrc + export AZURE_ENDPOINT="https://monitor.azure.com/" + echo "export AZURE_ENDPOINT=$AZURE_ENDPOINT" >> ~/.bashrc + export ADD_REGION_TO_MCS_ENDPOINT="true" + echo "export ADD_REGION_TO_MCS_ENDPOINT=$ADD_REGION_TO_MCS_ENDPOINT" >> ~/.bashrc + export ENABLE_MCS="true" + echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc + export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" + echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc + export MDSD_USE_LOCAL_PERSISTENCY="false" + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc else echo "*** activating oneagent in legacy auth mode ***" CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" - #use the file path as its secure than env - CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" - cat /etc/mdsd.d/envmdsd | while read line; do - echo $line >> ~/.bashrc - done - source /etc/mdsd.d/envmdsd - echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" - export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc - export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile - echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc - export OMS_TLD=$domain - echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc - export MDSD_FLUENT_SOCKET_PORT="29230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - - #skip imds lookup since not used in legacy auth path - export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" - echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc - - source ~/.bashrc + #use the file path as its secure than env + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" + export CIWORKSPACE_id=$CIWORKSPACE_id + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile + echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc + export OMS_TLD=$domain + echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="29230" + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc +fi +source ~/.bashrc - dpkg -l | grep mdsd | awk '{print $2 " " $3}' +dpkg -l | grep mdsd | awk '{print $2 " " $3}' - if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." - #use tenant name to avoid unix socket conflict and different ports for port conflict - #roleprefix to use container specific mdsd socket - export TENANT_NAME="${CONTAINER_TYPE}" - echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc - export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default - echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc - source ~/.bashrc - mkdir /var/run/mdsd-${CONTAINER_TYPE} - # add -T 0xFFFF for full traces - mdsd -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - else - echo "starting mdsd in legacy auth mode in main container..." - # add -T 0xFFFF for full traces - mdsd -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & - fi +if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." + #use tenant name to avoid unix socket conflict and different ports for port conflict + #roleprefix to use container specific mdsd socket + export TENANT_NAME="${CONTAINER_TYPE}" + echo "export TENANT_NAME=$TENANT_NAME" >> ~/.bashrc + export MDSD_ROLE_PREFIX=/var/run/mdsd-${CONTAINER_TYPE}/default + echo "export MDSD_ROLE_PREFIX=$MDSD_ROLE_PREFIX" >> ~/.bashrc + source ~/.bashrc + mkdir /var/run/mdsd-${CONTAINER_TYPE} + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & +else + echo "starting mdsd in legacy auth mode in main container..." + # add -T 0xFFFF for full traces + mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi # no dependency on fluentd for prometheus side car container From 69128b7cea95c15e7fa91134732f2517fbfe2bb4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 15:19:28 -0700 Subject: [PATCH 12/28] fix bug --- source/plugins/go/src/oms.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 9f4b18373..c6a6d7805 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1609,7 +1609,8 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { Log("OMSEndpoint %s", OMSEndpoint) IsAADMSIAuthMode = false if strings.Compare(strings.ToLower(os.Getenv(AADMSIAuthMode)), "true") == 0 { - Log("AAD MSI Auth Mode Configured") + IsAADMSIAuthMode = true + Log("AAD MSI Auth Mode Configured") } ResourceID = os.Getenv(envAKSResourceID) From b1ddedfa08ab1f42e3bd5b4ed3c50e077200ca0e Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 15:52:51 -0700 Subject: [PATCH 13/28] fix bug --- source/plugins/go/src/ingestion_token_utils.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 013bc60a4..5c06c82c9 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -155,6 +155,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { Log("getAccessTokenFromIMDS: IMDS token file doesnt exist: %s", err.Error()) return imdsAccessToken, 0, err } + //ensure this works and no issues accessing the file responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) if err != nil { Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) @@ -423,7 +424,7 @@ func refreshIngestionAuthToken() { //TODO - use Reset which is better when go version upgraded to 1.15 or up rather Stop() and NewTicker //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) IngestionAuthTokenRefreshTicker.Stop() - IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(refreshIntervalInSeconds)) } } } From d8df6b52d5c66eafddb173e9a48fed052b108ac1 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 17:06:15 -0700 Subject: [PATCH 14/28] add debug logs --- source/plugins/go/src/oms.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index c6a6d7805..934969217 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1784,7 +1784,8 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName - + Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) + Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(defaultIngestionAuthTokenRefreshIntervalSeconds)) From ffc5c207b6afd63f24fdb9d9c7a3e5723b5ff27f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 17:49:19 -0700 Subject: [PATCH 15/28] add nil checks --- source/plugins/go/src/ingestion_token_utils.go | 8 ++++++-- source/plugins/go/src/oms.go | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 5c06c82c9..2cc684a6d 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -254,8 +254,12 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan return configurationId, channelId, err } - configurationId = agentConfiguration.Configurations[0].Configurationid - channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID + if agentConfiguration.Configurations[0].Configurationid != nil && agentConfiguration.Configurations[0].Configurationid != "" { + configurationId = agentConfiguration.Configurations[0].Configurationid + } + if agentConfiguration.Configurations[0].Content.Channels[0].ID != nil && agentConfiguration.Configurations[0].Content.Channels[0].ID != "" { + channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID + } Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) Log("Info getAgentConfiguration: end") diff --git a/source/plugins/go/src/oms.go b/source/plugins/go/src/oms.go index 934969217..da432e3de 100644 --- a/source/plugins/go/src/oms.go +++ b/source/plugins/go/src/oms.go @@ -1783,8 +1783,7 @@ func InitializePlugin(pluginConfPath string, agentVersion string) { } MdsdInsightsMetricsTagName = MdsdInsightsMetricsSourceName - MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName - Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) + MdsdKubeMonAgentEventsTagName = MdsdKubeMonAgentEventsSourceName Log("ContainerLogsRouteADX: %v, IsWindows: %v, IsAADMSIAuthMode = %v \n", ContainerLogsRouteADX, IsWindows, IsAADMSIAuthMode) if !ContainerLogsRouteADX && IsWindows && IsAADMSIAuthMode { Log("defaultIngestionAuthTokenRefreshIntervalSeconds = %d \n", defaultIngestionAuthTokenRefreshIntervalSeconds) From 6faa97f4c21b63f6fe62c8eaa100858d3f5f38a4 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 15 Jun 2021 17:51:45 -0700 Subject: [PATCH 16/28] revert changes --- source/plugins/go/src/ingestion_token_utils.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 2cc684a6d..568322dfe 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -254,12 +254,8 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan return configurationId, channelId, err } - if agentConfiguration.Configurations[0].Configurationid != nil && agentConfiguration.Configurations[0].Configurationid != "" { - configurationId = agentConfiguration.Configurations[0].Configurationid - } - if agentConfiguration.Configurations[0].Content.Channels[0].ID != nil && agentConfiguration.Configurations[0].Content.Channels[0].ID != "" { - channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID - } + configurationId = agentConfiguration.Configurations[0].Configurationid + channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) Log("Info getAgentConfiguration: end") From ce9c483f0e58fa18965ebda9e72f4a57045fe4bf Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 16 Jun 2021 09:39:55 -0700 Subject: [PATCH 17/28] revert yaml change since this added in aks side --- kubernetes/omsagent.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index 0a8b39426..617c81f38 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -811,9 +811,6 @@ spec: - mountPath: C:\etc\config\adx name: omsagent-adx-secret readOnly: true - - mountPath: C:\etc\IMDS-access-token - name: imds-token - readOnly: true # Need to mount this only for airgapped clouds - Commenting this since it wont exist in non airgapped clouds # - mountPath: C:\ca # name: ca-certs @@ -874,10 +871,6 @@ spec: secret: secretName: omsagent-adx-secret optional: true - - name: imds-token - secret: - secretName: omsagent-aad-msi-token - optional: true --- kind: Service apiVersion: v1 From 46d1973bdefec11e63aaf0d174bc39f7dee26ab6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 17 Jun 2021 18:59:18 -0700 Subject: [PATCH 18/28] fix pr feedback --- kubernetes/linux/main.sh | 62 +++++------ .../plugins/go/src/ingestion_token_utils.go | 103 ++++++++++-------- 2 files changed, 86 insertions(+), 79 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index e3c9532c5..bb03a4365 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -12,7 +12,7 @@ waitforlisteneronTCPport() { echo "${FUNCNAME[0]} called with incorrect arguments<$1 , $2>. Required arguments <#port, #wait-time-in-seconds>" return -1 else - + if [[ $port =~ $numeric ]] && [[ $waittimesecs =~ $numeric ]]; then #local varlistener=$(netstat -lnt | awk '$6 == "LISTEN" && $4 ~ ":25228$"') while true @@ -57,11 +57,11 @@ else export customResourceId=$AKS_RESOURCE_ID echo "export customResourceId=$AKS_RESOURCE_ID" >> ~/.bashrc source ~/.bashrc - echo "customResourceId:$customResourceId" - export customRegion=$AKS_REGION + echo "customResourceId:$customResourceId" + export customRegion=$AKS_REGION echo "export customRegion=$AKS_REGION" >> ~/.bashrc source ~/.bashrc - echo "customRegion:$customRegion" + echo "customRegion:$customRegion" fi #set agent config schema version @@ -237,9 +237,9 @@ if [ ${#APPLICATIONINSIGHTS_AUTH_URL} -ge 1 ]; then # (check if APPLICATIONINSI fi -aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) -export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey -echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc +aikey=$(echo $APPLICATIONINSIGHTS_AUTH | base64 --decode) +export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey +echo "export TELEMETRY_APPLICATIONINSIGHTS_KEY=$aikey" >> ~/.bashrc source ~/.bashrc @@ -425,7 +425,7 @@ export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_docker_operations_error if [ "$CONTAINER_RUNTIME" != "docker" ]; then # these metrics are avialble only on k8s versions <1.18 and will get deprecated from 1.18 export KUBELET_RUNTIME_OPERATIONS_METRIC="kubelet_runtime_operations" - export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" + export KUBELET_RUNTIME_OPERATIONS_ERRORS_METRIC="kubelet_runtime_operations_errors" fi echo "set caps for ruby process to read container env from proc" @@ -452,21 +452,21 @@ echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" -echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do echo $line >> ~/.bashrc done -source /etc/mdsd.d/envmdsd +source /etc/mdsd.d/envmdsd MDSD_AAD_MSI_AUTH_ARGS="" -# check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH -export AAD_MSI_AUTH_MODE=false +# check if its AAD Auth MSI mode via USING_AAD_MSI_AUTH +export AAD_MSI_AUTH_MODE=false if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then - echo "*** activating oneagent in aad auth msi mode ***" + echo "*** activating oneagent in aad auth msi mode ***" # msi auth specific args - MDSD_AAD_MSI_AUTH_ARGS="-a -A" + MDSD_AAD_MSI_AUTH_ARGS="-a -A" export AAD_MSI_AUTH_MODE=true - echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - + echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc + export MDSD_FLUENT_SOCKET_PORT="28230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc export MCS_ENDPOINT="handler.control.monitor.azure.com" @@ -480,28 +480,28 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" echo "export MONITORING_USE_GENEVA_CONFIG_SERVICE=$MONITORING_USE_GENEVA_CONFIG_SERVICE" >> ~/.bashrc export MDSD_USE_LOCAL_PERSISTENCY="false" - echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc -else + echo "export MDSD_USE_LOCAL_PERSISTENCY=$MDSD_USE_LOCAL_PERSISTENCY" >> ~/.bashrc +else echo "*** activating oneagent in legacy auth mode ***" - CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" + CIWORKSPACE_id="$(cat /etc/omsagent-secret/WSID)" #use the file path as its secure than env - CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" + CIWORKSPACE_keyFile="/etc/omsagent-secret/KEY" echo "setting mdsd workspaceid & key for workspace:$CIWORKSPACE_id" export CIWORKSPACE_id=$CIWORKSPACE_id - echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc + echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc export OMS_TLD=$domain - echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc + echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc export MDSD_FLUENT_SOCKET_PORT="29230" - echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc + echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc fi source ~/.bashrc dpkg -l | grep mdsd | awk '{print $2 " " $3}' if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then - echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in legacy auth mode in sidecar container..." + echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." #use tenant name to avoid unix socket conflict and different ports for port conflict #roleprefix to use container specific mdsd socket export TENANT_NAME="${CONTAINER_TYPE}" @@ -512,22 +512,22 @@ if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then mkdir /var/run/mdsd-${CONTAINER_TYPE} # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -r ${MDSD_ROLE_PREFIX} -p 26130 -f 26230 -i 26330 -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & -else - echo "starting mdsd in legacy auth mode in main container..." +else + echo "starting mdsd mode in main container..." # add -T 0xFFFF for full traces mdsd ${MDSD_AAD_MSI_AUTH_ARGS} -e ${MDSD_LOG}/mdsd.err -w ${MDSD_LOG}/mdsd.warn -o ${MDSD_LOG}/mdsd.info -q ${MDSD_LOG}/mdsd.qos & fi -# no dependency on fluentd for prometheus side car container -if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then +# no dependency on fluentd for prometheus side car container +if [ "${CONTAINER_TYPE}" != "PrometheusSidecar" ]; then if [ ! -e "/etc/config/kube.conf" ]; then echo "*** starting fluentd v1 in daemonset" fluentd -c /etc/fluent/container.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & else echo "*** starting fluentd v1 in replicaset" fluentd -c /etc/fluent/kube.conf -o /var/opt/microsoft/docker-cimprov/log/fluentd.log --log-rotate-age 5 --log-rotate-size 20971520 & - fi -fi + fi +fi #If config parsing was successful, a copy of the conf file with replaced custom settings file is created if [ ! -e "/etc/config/kube.conf" ]; then @@ -661,7 +661,7 @@ echo "getting rsyslog status..." service rsyslog status shutdown() { - pkill -f mdsd + pkill -f mdsd } trap "shutdown" SIGTERM diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 568322dfe..1cbf97c2f 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -63,7 +63,7 @@ type AgentConfiguration struct { Extensionconfigurations struct { Containerinsights []struct { ID string `json:"id"` - Originids []string `json:"originIds"` + Originids []string `json:"originIds"` Outputstreams struct { LinuxPerfBlob string `json:"LINUX_PERF_BLOB"` ContainerInventoryBlob string `json:"CONTAINER_INVENTORY_BLOB"` @@ -91,13 +91,13 @@ type IngestionTokenResponse struct { } func getAccessTokenFromIMDS() (string, int64, error) { - Log("Info getAccessTokenFromIMDS: start") - useIMDSTOkenProxyEndPoint := os.Getenv("USE_IMDS_TOKEN_PROXY_END_POINT") + Log("Info getAccessTokenFromIMDS: start") + useIMDSTokenProxyEndPoint := os.Getenv("USE_IMDS_TOKEN_PROXY_END_POINT") imdsAccessToken := "" var responseBytes []byte var err error - - if (useIMDSTOkenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTOkenProxyEndPoint), "true") == 0) { + + if (useIMDSTokenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTokenProxyEndPoint), "true") == 0) { Log("Info Reading IMDS Access Token from IMDS Token proxy endpoint") MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") var msi_endpoint *url.URL @@ -113,22 +113,25 @@ func getAccessTokenFromIMDS() (string, int64, error) { } req.Header.Add("Metadata", "true") + //IMDS endpoint nonroutable endpoint and requests doesnt go through proxy hence using separate client + httpClient := &http.Client{} + // Call managed services for Azure resources token endpoint var resp *http.Response = nil - for i := 0; i < 3; i++ { - resp, err = HTTPClient.Do(req) + for retryCount := 0; retryCount < 3; retryCount++ { + resp, err = httpClient.Do(req) if err != nil { - message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s", err.Error()) + message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s, retryCount: %d", err.Error(), retryCount) Log(message) SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. continue } if resp != nil && resp.Body != nil { - defer resp.Body.Close() + defer resp.Body.Close() } - Log("getAccessTokenFromIMDS: IMDS Response Status: %d", resp.StatusCode) + Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount) if resp.StatusCode != 200 { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code : %d", resp.StatusCode) Log(message) @@ -155,12 +158,16 @@ func getAccessTokenFromIMDS() (string, int64, error) { Log("getAccessTokenFromIMDS: IMDS token file doesnt exist: %s", err.Error()) return imdsAccessToken, 0, err } - //ensure this works and no issues accessing the file - responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) - if err != nil { - Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s", err.Error()) - return imdsAccessToken, 0, err - } + //adding retries incase if we ended up reading the token file while the token file being written + for retryCount := 0; retryCount < 3; retryCount++ { + responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) + if err != nil { + Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) + time.Sleep(100 * time.Millisecond) + continue + } + break + } } // Unmarshall response body into struct @@ -174,7 +181,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) if err != nil { - Log("Error parsing ExpiresOn field from IMDS response: %s", err.Error()) + Log("Error parsing ExpiresOn field from IMDS response: %s", err.Error()) return imdsAccessToken, 0, err } Log("Info getAccessTokenFromIMDS: end") @@ -184,14 +191,14 @@ func getAccessTokenFromIMDS() (string, int64, error) { func getAgentConfiguration(imdsAccessToken string) (configurationId string, channelId string, err error) { Log("Info getAgentConfiguration: start") configurationId = "" - channelId = "" + channelId = "" apiVersion := "2020-08-01-preview" var amcs_endpoint *url.URL osType := os.Getenv("OS_TYPE") resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") - MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, resourceId, osType, apiVersion) + mcsEndpoint := os.Getenv("MCS_ENDPOINT") + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, osType, apiVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) var bearer = "Bearer " + imdsAccessToken @@ -206,7 +213,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan var resp *http.Response = nil - for i := 0; i < 3; i++ { + for i := 0; i < 3; i++ { resp, err = HTTPClient.Do(req) if err != nil { message := fmt.Sprintf("Error calling AMCS endpoint: %s", err.Error()) @@ -215,7 +222,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan continue } if resp != nil && resp.Body != nil { - defer resp.Body.Close() + defer resp.Body.Close() } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) if resp.StatusCode != 200 { @@ -223,7 +230,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan Log(message) SendException(message) continue - } + } break // call succeeded, don't retry any more } if resp == nil { @@ -255,7 +262,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan } configurationId = agentConfiguration.Configurations[0].Configurationid - channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID + channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) Log("Info getAgentConfiguration: end") @@ -271,9 +278,9 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann osType := os.Getenv("OS_TYPE") resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") - MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + mcsEndpoint := os.Getenv("MCS_ENDPOINT") apiVersion := "2020-04-01-preview" - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control."+MCS_ENDPOINT+"%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, resourceId, configurationId, channelId, osType, apiVersion) + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, configurationId, channelId, osType, apiVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) var bearer = "Bearer " + imdsAccessToken @@ -291,7 +298,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann var resp *http.Response = nil for i := 0; i < 3; i++ { - // Call managed services for Azure resources token endpoint + // Call managed services for Azure resources token endpoint resp, err = HTTPClient.Do(req) if err != nil { message := fmt.Sprintf("Error calling amcs endpoint for ingestion auth token: %s", err.Error()) @@ -300,9 +307,9 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann resp = nil continue } - + if resp != nil && resp.Body != nil { - defer resp.Body.Close() + defer resp.Body.Close() } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) @@ -320,7 +327,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann Log("ran out of retries calling AMCS for ingestion token") return ingestionAuthToken, 0, err } - + // Pull out response body responseBytes, err := ioutil.ReadAll(resp.Body) if err != nil { @@ -351,7 +358,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`) -func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterval int64, err error) { +func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterval int64, err error) { cacheControlHeader, valueInMap := header["Cache-Control"] if !valueInMap { return 0, errors.New("Cache-Control not in passed header") @@ -365,8 +372,8 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva if err != nil { Log("getIngestionAuthToken: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) return 0, err - } - refreshInterval = int64(interval) + } + refreshInterval = int64(interval) return refreshInterval, nil } } @@ -375,56 +382,56 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva } func refreshIngestionAuthToken() { - for ; true; <-IngestionAuthTokenRefreshTicker.C { + for ; true; <-IngestionAuthTokenRefreshTicker.C { if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // token valid 24 hrs and refresh token 1 hr before expiry - imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() + imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() if err != nil { message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) Log(message) - SendException(message) + SendException(message) } else { IMDSToken = imdsToken IMDSTokenExpiration = imdsTokenExpiry } - } + } if IMDSToken == "" { message := "IMDSToken is empty" Log(message) SendException(message) - continue - } - var err error + continue + } + var err error // ignore agent configuration expiring, the configuration and channel IDs will never change (without creating an agent restart) - if ConfigurationId == "" || ChannelId == "" { + if ConfigurationId == "" || ChannelId == "" { ConfigurationId, ChannelId, err = getAgentConfiguration(IMDSToken) if err != nil { message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) Log(message) SendException(message) - continue + continue } } if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" { message := "IMDSToken or ConfigurationId or ChannelId empty" Log(message) SendException(message) - continue + continue } ingestionAuthToken, refreshIntervalInSeconds, err := getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) if err != nil { message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) Log(message) SendException(message) - continue - } + continue + } IngestionAuthTokenUpdateMutex.Lock() ODSIngestionAuthToken = ingestionAuthToken - IngestionAuthTokenUpdateMutex.Unlock() + IngestionAuthTokenUpdateMutex.Unlock() if refreshIntervalInSeconds > 0 && refreshIntervalInSeconds != defaultIngestionAuthTokenRefreshIntervalSeconds { //TODO - use Reset which is better when go version upgraded to 1.15 or up rather Stop() and NewTicker - //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) + //IngestionAuthTokenRefreshTicker.Reset(time.Second * time.Duration(refreshIntervalInSeconds)) IngestionAuthTokenRefreshTicker.Stop() - IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(refreshIntervalInSeconds)) + IngestionAuthTokenRefreshTicker = time.NewTicker(time.Second * time.Duration(refreshIntervalInSeconds)) } } } From 8091e55d91d96cb0cb9259aa127aed1786e2e38f Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Fri, 18 Jun 2021 23:55:00 -0700 Subject: [PATCH 19/28] fix pr feedback --- .../plugins/go/src/ingestion_token_utils.go | 152 +++++++++++------- 1 file changed, 97 insertions(+), 55 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 1cbf97c2f..66b39906b 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -15,6 +15,9 @@ import ( ) const IMDSTokenPathForWindows = "c:/etc/imds-access-token/token" // only used in windows +const AMCSAgentConfigAPIVersion = "2020-08-01-preview" +const AMCSIngestionTokenAPIVersion = "2020-04-01-preview" +const MaxRetries = 3 var IMDSToken string var IMDSTokenExpiration int64 @@ -99,9 +102,10 @@ func getAccessTokenFromIMDS() (string, int64, error) { if (useIMDSTokenProxyEndPoint != "" && strings.Compare(strings.ToLower(useIMDSTokenProxyEndPoint), "true") == 0) { Log("Info Reading IMDS Access Token from IMDS Token proxy endpoint") - MCS_ENDPOINT := os.Getenv("MCS_ENDPOINT") + mcsEndpoint := os.Getenv("MCS_ENDPOINT") + msi_endpoint_string := fmt.Sprintf("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://%s/", mcsEndpoint) var msi_endpoint *url.URL - msi_endpoint, err := url.Parse("http://169.254.169.254/metadata/identity/oauth2/token?api-version=2018-02-01&resource=https://" + MCS_ENDPOINT + "/") + msi_endpoint, err := url.Parse(msi_endpoint_string) if err != nil { Log("getAccessTokenFromIMDS: Error creating IMDS endpoint URL: %s", err.Error()) return imdsAccessToken, 0, err @@ -113,17 +117,18 @@ func getAccessTokenFromIMDS() (string, int64, error) { } req.Header.Add("Metadata", "true") - //IMDS endpoint nonroutable endpoint and requests doesnt go through proxy hence using separate client + //IMDS endpoint nonroutable endpoint and requests doesnt go through proxy hence using dedicated http client httpClient := &http.Client{} // Call managed services for Azure resources token endpoint var resp *http.Response = nil - for retryCount := 0; retryCount < 3; retryCount++ { + IsSuccess := false + for retryCount := 0; retryCount < MaxRetries; retryCount++ { resp, err = httpClient.Do(req) if err != nil { message := fmt.Sprintf("getAccessTokenFromIMDS: Error calling token endpoint: %s, retryCount: %d", err.Error(), retryCount) Log(message) - SendException(message) // send the exception here because this error is not returned. The calling function will send any returned errors to telemetry. + SendException(message) continue } @@ -132,15 +137,21 @@ func getAccessTokenFromIMDS() (string, int64, error) { } Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount) - if resp.StatusCode != 200 { - message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code : %d", resp.StatusCode) + if resp.StatusCode >= 500 { + message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - SendException(message) + time.Sleep(100 * time.Millisecond) continue + } else if resp.StatusCode != 200 { + message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) + Log(message) + SendException(message) + return imdsAccessToken, 0, err } + IsSuccess = true break // call succeeded, don't retry any more } - if resp == nil { + if !IsSuccess || resp == nil || resp.Body == nil { Log("getAccessTokenFromIMDS: IMDS Request ran out of retries") return imdsAccessToken, 0, err } @@ -148,7 +159,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { // Pull out response body responseBytes, err = ioutil.ReadAll(resp.Body) if err != nil { - Log("getAccessTokenFromIMDS: Error reading response body : %s", err.Error()) + Log("getAccessTokenFromIMDS: Error reading response body: %s", err.Error()) return imdsAccessToken, 0, err } @@ -159,7 +170,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { return imdsAccessToken, 0, err } //adding retries incase if we ended up reading the token file while the token file being written - for retryCount := 0; retryCount < 3; retryCount++ { + for retryCount := 0; retryCount < MaxRetries; retryCount++ { responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) if err != nil { Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) @@ -170,6 +181,11 @@ func getAccessTokenFromIMDS() (string, int64, error) { } } + if responseBytes == nil { + Log("getAccessTokenFromIMDS: Error responseBytes is nil") + return imdsAccessToken, 0, err + } + // Unmarshall response body into struct var imdsResponse IMDSResponse err = json.Unmarshal(responseBytes, &imdsResponse) @@ -181,7 +197,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { expiration, err := strconv.ParseInt(imdsResponse.ExpiresOn, 10, 64) if err != nil { - Log("Error parsing ExpiresOn field from IMDS response: %s", err.Error()) + Log("getAccessTokenFromIMDS: Error parsing ExpiresOn field from IMDS response: %s", err.Error()) return imdsAccessToken, 0, err } Log("Info getAccessTokenFromIMDS: end") @@ -192,31 +208,34 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan Log("Info getAgentConfiguration: start") configurationId = "" channelId = "" - apiVersion := "2020-08-01-preview" var amcs_endpoint *url.URL osType := os.Getenv("OS_TYPE") resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") mcsEndpoint := os.Getenv("MCS_ENDPOINT") - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, osType, apiVersion) + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, osType, AMCSAgentConfigAPIVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) + if err != nil { + Log("getAgentConfiguration: Error creating AMCS endpoint URL: %s", err.Error()) + return configurationId, channelId, err + } var bearer = "Bearer " + imdsAccessToken // Create a new request using http req, err := http.NewRequest("GET", amcs_endpoint.String(), nil) if err != nil { - message := fmt.Sprintf("Error creating HTTP request for AMCS endpoint: %s", err.Error()) + message := fmt.Sprintf("getAgentConfiguration: Error creating HTTP request for AMCS endpoint: %s", err.Error()) Log(message) return configurationId, channelId, err } req.Header.Set("Authorization", bearer) var resp *http.Response = nil - - for i := 0; i < 3; i++ { + IsSuccess := false + for retryCount := 0; retryCount < MaxRetries; retryCount++ { resp, err = HTTPClient.Do(req) if err != nil { - message := fmt.Sprintf("Error calling AMCS endpoint: %s", err.Error()) + message := fmt.Sprintf("getAgentConfiguration: Error calling AMCS endpoint: %s", err.Error()) Log(message) SendException(message) continue @@ -225,21 +244,29 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan defer resp.Body.Close() } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) - if resp.StatusCode != 200 { - message := fmt.Sprintf("getAgentConfiguration Request failed with an error code : %d", resp.StatusCode) + if resp.StatusCode >= 500 { + message := fmt.Sprintf("getAgentConfiguration: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - SendException(message) + time.Sleep(100 * time.Millisecond) continue + } else if resp.StatusCode != 200 { + message := fmt.Sprintf("getAgentConfiguration: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) + Log(message) + SendException(message) + return configurationId, channelId, err } + IsSuccess = true break // call succeeded, don't retry any more } - if resp == nil { - Log("getAgentConfiguration Request ran out of retries") + if !IsSuccess || resp == nil || resp.Body == nil { + message := fmt.Sprintf("getAgentConfiguration Request ran out of retries") + Log(message) + SendException(message) return configurationId, channelId, err } responseBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - Log("Error reading response body from AMCS API call : %s", err.Error()) + Log("getAgentConfiguration: Error reading response body from AMCS API call: %s", err.Error()) return configurationId, channelId, err } @@ -247,24 +274,30 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan var agentConfiguration AgentConfiguration err = json.Unmarshal(responseBytes, &agentConfiguration) if err != nil { - Log("Error unmarshalling the response: %s", err.Error()) + message := fmt.Sprintf("getAgentConfiguration: Error unmarshalling the response: %s", err.Error()) + Log(message) + SendException(message) return configurationId, channelId, err } if len(agentConfiguration.Configurations) == 0 { - Log("Received empty agentConfiguration.Configurations array") + message := "getAgentConfiguration: Received empty agentConfiguration.Configurations array" + Log(message) + SendException(message) return configurationId, channelId, err } if len(agentConfiguration.Configurations[0].Content.Channels) == 0 { - Log("Received empty agentConfiguration.Configurations[0].Content.Channels") + message := "getAgentConfiguration: Received empty agentConfiguration.Configurations[0].Content.Channels" + Log(message) + SendException(message) return configurationId, channelId, err } configurationId = agentConfiguration.Configurations[0].Configurationid channelId = agentConfiguration.Configurations[0].Content.Channels[0].ID - Log("obtained configurationId: %s, channelId: %s", configurationId, channelId) + Log("getAgentConfiguration: obtained configurationId: %s, channelId: %s", configurationId, channelId) Log("Info getAgentConfiguration: end") return configurationId, channelId, nil @@ -279,29 +312,31 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann resourceId := os.Getenv("AKS_RESOURCE_ID") resourceRegion := os.Getenv("AKS_REGION") mcsEndpoint := os.Getenv("MCS_ENDPOINT") - apiVersion := "2020-04-01-preview" - amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, configurationId, channelId, osType, apiVersion) + amcs_endpoint_string := fmt.Sprintf("https://%s.handler.control.%s%s/agentConfigurations/%s/channels/%s/issueIngestionToken?platform=%s&api-version=%s", resourceRegion, mcsEndpoint, resourceId, configurationId, channelId, osType, AMCSIngestionTokenAPIVersion) amcs_endpoint, err = url.Parse(amcs_endpoint_string) + if err != nil { + Log("getIngestionAuthToken: Error creating AMCS endpoint URL: %s", err.Error()) + return ingestionAuthToken, refreshInterval, err + } var bearer = "Bearer " + imdsAccessToken - // Create a new request using http req, err := http.NewRequest("GET", amcs_endpoint.String(), nil) if err != nil { - Log("Error creating HTTP request for AMCS endpoint: %s", err.Error()) - return ingestionAuthToken, 0, err + Log("getIngestionAuthToken: Error creating HTTP request for AMCS endpoint: %s", err.Error()) + return ingestionAuthToken, refreshInterval, err } // add authorization header to the req req.Header.Add("Authorization", bearer) var resp *http.Response = nil - - for i := 0; i < 3; i++ { + IsSuccess : = false + for retryCount := 0; retryCount < MaxRetries; retryCount++ { // Call managed services for Azure resources token endpoint resp, err = HTTPClient.Do(req) if err != nil { - message := fmt.Sprintf("Error calling amcs endpoint for ingestion auth token: %s", err.Error()) + message := fmt.Sprintf("getIngestionAuthToken: Error calling AMCS endpoint for ingestion auth token: %s", err.Error()) Log(message) SendException(message) resp = nil @@ -313,25 +348,32 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) - if resp.StatusCode != 200 { - message := fmt.Sprintf("getIngestionAuthToken Request failed with an error code : %d", resp.StatusCode) + if resp.StatusCode >= 500 { + message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - SendException(message) - resp = nil + time.Sleep(100 * time.Millisecond) continue + } else if resp.StatusCode != 200 { + message := fmt.Sprintf("getIngestionAuthToken: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) + Log(message) + SendException(message) + return ingestionAuthToken, refreshInterval, err } + IsSuccess = true break } - if resp == nil { - Log("ran out of retries calling AMCS for ingestion token") - return ingestionAuthToken, 0, err + if !IsSuccess || resp == nil || resp.Body == nil { + message := "getIngestionAuthToken: ran out of retries calling AMCS for ingestion token" + Log(message) + SendException(message) + return ingestionAuthToken, refreshInterval, err } // Pull out response body responseBytes, err := ioutil.ReadAll(resp.Body) if err != nil { - Log("Error reading response body from AMCS Ingestion API call : %s", err.Error()) + Log("getIngestionAuthToken: Error reading response body from AMCS Ingestion API call : %s", err.Error()) return ingestionAuthToken, refreshInterval, err } @@ -339,7 +381,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann var ingestionTokenResponse IngestionTokenResponse err = json.Unmarshal(responseBytes, &ingestionTokenResponse) if err != nil { - Log("Error unmarshalling the response: %s", err.Error()) + Log("getIngestionAuthToken: Error unmarshalling the response: %s", err.Error()) return ingestionAuthToken, refreshInterval, err } @@ -347,8 +389,8 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann refreshInterval, err = getTokenRefreshIntervalFromAmcsResponse(resp.Header) if err != nil { - Log("getIngestionAuthToken failed to parse max-age response header") - return ingestionAuthToken, 0, err + Log("getIngestionAuthToken: Error failed to parse max-age response header") + return ingestionAuthToken, refreshInterval, err } Log("getIngestionAuthToken: refresh interval %d seconds", refreshInterval) @@ -361,7 +403,7 @@ var cacheControlHeaderRegex = regexp.MustCompile(`max-age=([0-9]+)`) func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterval int64, err error) { cacheControlHeader, valueInMap := header["Cache-Control"] if !valueInMap { - return 0, errors.New("Cache-Control not in passed header") + return 0, errors.New("getTokenRefreshIntervalFromAmcsResponse: Cache-Control not in passed header") } for _, entry := range cacheControlHeader { @@ -370,7 +412,7 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva interval := 0 interval, err = strconv.Atoi(match[1]) if err != nil { - Log("getIngestionAuthToken: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) + Log("getTokenRefreshIntervalFromAmcsResponse: error getting timeout from auth token. Header: " + strings.Join(cacheControlHeader, ",")) return 0, err } refreshInterval = int64(interval) @@ -378,7 +420,7 @@ func getTokenRefreshIntervalFromAmcsResponse(header http.Header) (refreshInterva } } - return 0, errors.New("didn't find max-age in response header") + return 0, errors.New("getTokenRefreshIntervalFromAmcsResponse: didn't find max-age in response header") } func refreshIngestionAuthToken() { @@ -386,7 +428,7 @@ func refreshIngestionAuthToken() { if IMDSToken == "" || IMDSTokenExpiration <= (time.Now().Unix() + 60 * 60) { // token valid 24 hrs and refresh token 1 hr before expiry imdsToken, imdsTokenExpiry, err := getAccessTokenFromIMDS() if err != nil { - message := fmt.Sprintf("Error on getAccessTokenFromIMDS %s \n", err.Error()) + message := fmt.Sprintf("refreshIngestionAuthToken: Error on getAccessTokenFromIMDS %s \n", err.Error()) Log(message) SendException(message) } else { @@ -395,7 +437,7 @@ func refreshIngestionAuthToken() { } } if IMDSToken == "" { - message := "IMDSToken is empty" + message := "refreshIngestionAuthToken: IMDSToken is empty" Log(message) SendException(message) continue @@ -405,21 +447,21 @@ func refreshIngestionAuthToken() { if ConfigurationId == "" || ChannelId == "" { ConfigurationId, ChannelId, err = getAgentConfiguration(IMDSToken) if err != nil { - message := fmt.Sprintf("Error getAgentConfiguration %s \n", err.Error()) + message := fmt.Sprintf("refreshIngestionAuthToken: Error getAgentConfiguration %s \n", err.Error()) Log(message) SendException(message) continue } } if IMDSToken == "" || ConfigurationId == "" || ChannelId == "" { - message := "IMDSToken or ConfigurationId or ChannelId empty" + message := "refreshIngestionAuthToken: IMDSToken or ConfigurationId or ChannelId empty" Log(message) SendException(message) continue } ingestionAuthToken, refreshIntervalInSeconds, err := getIngestionAuthToken(IMDSToken, ConfigurationId, ChannelId) if err != nil { - message := fmt.Sprintf("Error getIngestionAuthToken %s \n", err.Error()) + message := fmt.Sprintf("refreshIngestionAuthToken: Error getIngestionAuthToken %s \n", err.Error()) Log(message) SendException(message) continue From 1eff4c57901dcff8dc73f3b99747f739d7051ae9 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sat, 19 Jun 2021 14:03:24 -0700 Subject: [PATCH 20/28] refine retry code --- .../plugins/go/src/ingestion_token_utils.go | 55 ++++++++++++++++--- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index 66b39906b..c97de608e 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -118,7 +118,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { req.Header.Add("Metadata", "true") //IMDS endpoint nonroutable endpoint and requests doesnt go through proxy hence using dedicated http client - httpClient := &http.Client{} + httpClient := &http.Client{Timeout: 30 * time.Second} // Call managed services for Azure resources token endpoint var resp *http.Response = nil @@ -137,10 +137,19 @@ func getAccessTokenFromIMDS() (string, int64, error) { } Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount) - if resp.StatusCode >= 500 { + if ShouldRetry(resp.StatusCode) { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - time.Sleep(100 * time.Millisecond) + retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + if resp.StatusCode == 429 { + if resp != nil && resp.Header.Get("Retry-After") != "" { + after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) + if err != nil && after > 0 { + retryDelay = time.Duration(after) * time.Second + } + } + } + time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) @@ -174,7 +183,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) if err != nil { Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) - time.Sleep(100 * time.Millisecond) + time.Sleep((retryCount + 1) * 100 * time.Millisecond) continue } break @@ -244,10 +253,19 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan defer resp.Body.Close() } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) - if resp.StatusCode >= 500 { + if ShouldRetry(resp.StatusCode) { message := fmt.Sprintf("getAgentConfiguration: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - time.Sleep(100 * time.Millisecond) + retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + if resp.StatusCode == 429 { + if resp != nil && resp.Header.Get("Retry-After") != "" { + after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) + if err != nil && after > 0 { + retryDelay = time.Duration(after) * time.Second + } + } + } + time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { message := fmt.Sprintf("getAgentConfiguration: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) @@ -348,10 +366,19 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) - if resp.StatusCode >= 500 { - message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) + if ShouldRetry(resp.StatusCode) { + message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) - time.Sleep(100 * time.Millisecond) + retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond + if resp.StatusCode == 429 { + if resp != nil && resp.Header.Get("Retry-After") != "" { + after, err := strconv.ParseInt(resp.Header.Get("Retry-After"), 10, 64) + if err != nil && after > 0 { + retryDelay = time.Duration(after) * time.Second + } + } + } + time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { message := fmt.Sprintf("getIngestionAuthToken: Request failed with nonretryable error code: %d, retryCount: %d", resp.StatusCode, retryCount) @@ -477,3 +504,13 @@ func refreshIngestionAuthToken() { } } } + +func ShouldRetry(httpStatusCode int) bool { + retryableStatusCodes := [5]int{408, 429, 502, 503, 504} + for _, code := range retryableStatusCodes { + if code == httpStatusCode { + return true + } + } + return false +} From a99597dd5720b624cfa36d7cfa3aac1a45d10d21 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Tue, 22 Jun 2021 20:09:53 -0700 Subject: [PATCH 21/28] update mdsd env as per official build --- kubernetes/linux/main.sh | 27 +++++++++++++---------- kubernetes/windows/main.ps1 | 18 +++++++-------- source/plugins/ruby/CustomMetricsUtils.rb | 4 ++-- source/plugins/ruby/out_mdm.rb | 18 ++++++++++----- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index bb03a4365..3d83429cc 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -198,9 +198,15 @@ fi if [ -z $domain ]; then ClOUD_ENVIRONMENT="unknown" elif [ $domain == "opinsights.azure.com" ]; then - CLOUD_ENVIRONMENT="public" -else - CLOUD_ENVIRONMENT="national" + CLOUD_ENVIRONMENT="azurepubliccloud" +elif [ $domain == "opinsights.azure.cn" ]; then + CLOUD_ENVIRONMENT="azurechinacloud" +elif [ $domain == "opinsights.azure.us" ]; then + CLOUD_ENVIRONMENT="azureusgovernmentcloud" +elif [ $domain == "opinsights.azure.eaglex.ic.gov" ]; then + CLOUD_ENVIRONMENT="usnat" +elif [ $domain == "opinsights.azure.microsoft.scloud" ]; then + CLOUD_ENVIRONMENT="ussec" fi export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT echo "export CLOUD_ENVIRONMENT=$CLOUD_ENVIRONMENT" >> ~/.bashrc @@ -453,6 +459,9 @@ echo "export DOCKER_CIMPROV_VERSION=$DOCKER_CIMPROV_VERSION" >> ~/.bashrc #skip imds lookup since not used either legacy or aad msi auth path export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH="true" echo "export SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH=$SKIP_IMDS_LOOKUP_FOR_LEGACY_AUTH" >> ~/.bashrc +# this used by mdsd to determine cloud specific LA endpoints +export OMS_TLD=$domain +echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc cat /etc/mdsd.d/envmdsd | while read line; do echo $line >> ~/.bashrc done @@ -466,15 +475,11 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then MDSD_AAD_MSI_AUTH_ARGS="-a -A" export AAD_MSI_AUTH_MODE=true echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc - + # this used by mdsd to determine the cloud specific AMCS endpoints + export customEnvironment=$ClOUD_ENVIRONMENT + echo "export customEnvironment=$customEnvironment" >> ~/.bashrc export MDSD_FLUENT_SOCKET_PORT="28230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc - export MCS_ENDPOINT="handler.control.monitor.azure.com" - echo "export MCS_ENDPOINT=$MCS_ENDPOINT" >> ~/.bashrc - export AZURE_ENDPOINT="https://monitor.azure.com/" - echo "export AZURE_ENDPOINT=$AZURE_ENDPOINT" >> ~/.bashrc - export ADD_REGION_TO_MCS_ENDPOINT="true" - echo "export ADD_REGION_TO_MCS_ENDPOINT=$ADD_REGION_TO_MCS_ENDPOINT" >> ~/.bashrc export ENABLE_MCS="true" echo "export ENABLE_MCS=$ENABLE_MCS" >> ~/.bashrc export MONITORING_USE_GENEVA_CONFIG_SERVICE="false" @@ -491,8 +496,6 @@ else echo "export CIWORKSPACE_id=$CIWORKSPACE_id" >> ~/.bashrc export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile echo "export CIWORKSPACE_keyFile=$CIWORKSPACE_keyFile" >> ~/.bashrc - export OMS_TLD=$domain - echo "export OMS_TLD=$OMS_TLD" >> ~/.bashrc export MDSD_FLUENT_SOCKET_PORT="29230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc fi diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 9205fdf6c..330e0b40e 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -44,7 +44,7 @@ function Start-FileSystemWatcher { function Set-EnvironmentVariables { $domain = "opinsights.azure.com" $mcs_endpoint = "monitor.azure.com" - $cloud_environment = "public" + $cloud_environment = "azurepubliccloud" if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging $domain = Get-Content /etc/omsagent-secret/DOMAIN @@ -54,14 +54,14 @@ function Set-EnvironmentVariables { # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") - + # Set MCS Endpoint [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Process") [System.Environment]::SetEnvironmentVariable("MCS_ENDPOINT", $mcs_endpoint, "Machine") # Set CLOUD_ENVIRONMENT [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Process") - [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") + [System.Environment]::SetEnvironmentVariable("CLOUD_ENVIRONMENT", $cloud_environment, "Machine") $wsID = "" if (Test-Path /etc/omsagent-secret/WSID) { @@ -163,7 +163,7 @@ function Set-EnvironmentVariables { Write-Host $_.Exception } } - + # Check if the fetched IKey was properly encoded. if not then turn off telemetry if ($aiKeyFetched -match '^[A-Za-z0-9=]+$') { Write-Host "Using cloud-specific instrumentation key" @@ -239,16 +239,16 @@ function Set-EnvironmentVariables { if (![string]::IsNullOrEmpty($isAADMSIAuth)) { [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Process") [System.Environment]::SetEnvironmentVariable("AAD_MSI_AUTH_MODE", $isAADMSIAuth, "Machine") - Write-Host "Successfully set environment variable AAD_MSI_AUTH_MODE - $($isAADMSIAuth) for target 'machine'..." - } + Write-Host "Successfully set environment variable AAD_MSI_AUTH_MODE - $($isAADMSIAuth) for target 'machine'..." + } # check if use token proxy endpoint set via USE_IMDS_TOKEN_PROXY_END_POINT environment variable $useIMDSTokenProxyEndpoint = [System.Environment]::GetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", "process") if (![string]::IsNullOrEmpty($useIMDSTokenProxyEndpoint)) { [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Process") [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Machine") - Write-Host "Successfully set environment variable USE_IMDS_TOKEN_PROXY_END_POINT - $($useIMDSTokenProxyEndpoint) for target 'machine'..." - } + Write-Host "Successfully set environment variable USE_IMDS_TOKEN_PROXY_END_POINT - $($useIMDSTokenProxyEndpoint) for target 'machine'..." + } # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb @@ -431,7 +431,7 @@ function Start-Telegraf { else { Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" } - + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") if (![string]::IsNullOrEmpty($nodeIp)) { [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") diff --git a/source/plugins/ruby/CustomMetricsUtils.rb b/source/plugins/ruby/CustomMetricsUtils.rb index 220313e6b..fd9290b78 100644 --- a/source/plugins/ruby/CustomMetricsUtils.rb +++ b/source/plugins/ruby/CustomMetricsUtils.rb @@ -13,8 +13,8 @@ def check_custom_metrics_availability if aks_region.to_s.empty? || aks_resource_id.to_s.empty? return false # This will also take care of AKS-Engine Scenario. AKS_REGION/AKS_RESOURCE_ID is not set for AKS-Engine. Only ACS_RESOURCE_NAME is set end - - return aks_cloud_environment.to_s.downcase == 'public' + + return aks_cloud_environment.to_s.downcase == 'azurepubliccloud' end end end \ No newline at end of file diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 247bc01b2..32bc4d205 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -51,6 +51,7 @@ def initialize @last_telemetry_sent_time = nil # Setting useMsi to false by default @useMsi = false + @isAADMSIAuth = false @metrics_flushed_count = 0 @cluster_identity = nil @@ -132,8 +133,9 @@ def start if !@@user_assigned_client_id.nil? && !@@user_assigned_client_id.empty? msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } else - # in case of msi auth user_assigned_client_id will be empty - @log.info "using aad msi auth" + # in case of msi auth user_assigned_client_id will be empty + @log.info "using aad msi auth" + @isAADMSIAuth = true msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience } end @parsed_token_uri = URI.parse(msi_endpoint) @@ -159,8 +161,14 @@ def get_access_token @log.info "Refreshing access token for out_mdm plugin.." if (!!@useMsi) - @log.info "Using msi to get the token to post MDM data" - ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", {}) + properties = {} + if (!!@isAADMSIAuth) + @log.info "Using aad msi auth to get the token to post MDM data" + properties["isAADMSIAuth"] = @isAADMSIAuth + else + @log.info "Using msi to get the token to post MDM data" + end + ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMToken-MSI", properties) @log.info "Opening TCP connection" http_access_token = Net::HTTP.start(@parsed_token_uri.host, @parsed_token_uri.port, :use_ssl => false) # http_access_token.use_ssl = false @@ -331,7 +339,7 @@ def send_to_mdm(post_body) ApplicationInsightsUtility.sendCustomEvent("AKSCustomMetricsMDMSendSuccessful", {}) @last_telemetry_sent_time = Time.now end - rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException + rescue Net::HTTPClientException => e # see https://docs.ruby-lang.org/en/2.6.0/NEWS.html about deprecating HTTPServerException and adding HTTPClientException if !response.nil? && !response.body.nil? #body will have actual error @log.info "Failed to Post Metrics to MDM : #{e} Response.body: #{response.body}" else From 72bcffff36ed6363f1990f49046896a350f6e143 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 23 Jun 2021 09:11:57 -0700 Subject: [PATCH 22/28] cleanup --- source/plugins/ruby/out_mdm.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 32bc4d205..82d6e07db 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -133,7 +133,7 @@ def start if !@@user_assigned_client_id.nil? && !@@user_assigned_client_id.empty? msi_endpoint = @@msi_endpoint_template % { user_assigned_client_id: @@user_assigned_client_id, resource: @@token_resource_url } else - # in case of msi auth user_assigned_client_id will be empty + # in case of aad msi auth user_assigned_client_id will be empty @log.info "using aad msi auth" @isAADMSIAuth = true msi_endpoint = @@imds_msi_endpoint_template % { resource: @@token_resource_audience } @@ -164,7 +164,7 @@ def get_access_token properties = {} if (!!@isAADMSIAuth) @log.info "Using aad msi auth to get the token to post MDM data" - properties["isAADMSIAuth"] = @isAADMSIAuth + properties["aadAuthMSIMode"] = "true" else @log.info "Using msi to get the token to post MDM data" end From 130d0ca100fb509f904e8d06cdb86dc28d3f227c Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Wed, 23 Jun 2021 13:05:50 -0700 Subject: [PATCH 23/28] update env vars per mdsd --- kubernetes/linux/main.sh | 2 +- kubernetes/linux/setup.sh | 2 +- kubernetes/windows/main.ps1 | 29 ++++++++++++++++++- .../plugins/go/src/ingestion_token_utils.go | 20 ++++++------- 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 3d83429cc..3ab54fffd 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -476,7 +476,7 @@ if [ "${USING_AAD_MSI_AUTH}" == "true" ]; then export AAD_MSI_AUTH_MODE=true echo "export AAD_MSI_AUTH_MODE=true" >> ~/.bashrc # this used by mdsd to determine the cloud specific AMCS endpoints - export customEnvironment=$ClOUD_ENVIRONMENT + export customEnvironment=$CLOUD_ENVIRONMENT echo "export customEnvironment=$customEnvironment" >> ~/.bashrc export MDSD_FLUENT_SOCKET_PORT="28230" echo "export MDSD_FLUENT_SOCKET_PORT=$MDSD_FLUENT_SOCKET_PORT" >> ~/.bashrc diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index 467490d57..c5dba587a 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -10,7 +10,7 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ update-locale LANG=en_US.UTF-8 #install oneagent - Official bits (05/17/2021) -wget https://github.com/microsoft/Docker-Provider/raw/gangams/ci-aad-auth-msi/oneagent-dev/azure-mdsd_1.10.1-build.dev_x86_64.deb +wget https://github.com/microsoft/Docker-Provider/raw/gangams/ci-aad-auth-msi/oneagent-dev/azure-mdsd_1.10.3-build.develop.239_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 330e0b40e..fa7b2470a 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -48,9 +48,36 @@ function Set-EnvironmentVariables { if (Test-Path /etc/omsagent-secret/DOMAIN) { # TODO: Change to omsagent-secret before merging $domain = Get-Content /etc/omsagent-secret/DOMAIN - $cloud_environment = "national" + if (![string]::IsNullOrEmpty($domain)) { + if ($domain -eq "opinsights.azure.com") { + $cloud_environment = "azurepubliccloud" + $mcs_endpoint = "monitor.azure.com" + } elseif ($domain -eq "opinsights.azure.cn") { + $cloud_environment = "azurechinacloud" + $mcs_endpoint = "monitor.azure.cn" + } elseif ($domain -eq "opinsights.azure.us") { + $cloud_environment = "azureusgovernmentcloud" + $mcs_endpoint = "monitor.azure.us" + } elseif ($domain -eq "opinsights.azure.eaglex.ic.gov") { + $cloud_environment = "usnat" + $mcs_endpoint = "monitor.azure.eaglex.ic.gov" + } elseif ($domain -eq "opinsights.azure.microsoft.scloud") { + $cloud_environment = "ussec" + $mcs_endpoint = "monitor.azure.microsoft.scloud" + } else { + Write-Host "Invalid or Unsupported domain name $($domain). EXITING....." + exit 1 + } + } else { + Write-Host "Domain name either null or empty. EXITING....." + exit 1 + } } + Write-Host "Log analytics domain: $($domain)" + Write-Host "MCS endpoint: $($mcs_endpoint)" + Write-Host "Cloud Environment: $($cloud_environment)" + # Set DOMAIN [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Process") [System.Environment]::SetEnvironmentVariable("DOMAIN", $domain, "Machine") diff --git a/source/plugins/go/src/ingestion_token_utils.go b/source/plugins/go/src/ingestion_token_utils.go index c97de608e..c96685042 100644 --- a/source/plugins/go/src/ingestion_token_utils.go +++ b/source/plugins/go/src/ingestion_token_utils.go @@ -137,7 +137,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { } Log("getAccessTokenFromIMDS: IMDS Response Status: %d, retryCount: %d", resp.StatusCode, retryCount) - if ShouldRetry(resp.StatusCode) { + if IsRetriableError(resp.StatusCode) { message := fmt.Sprintf("getAccessTokenFromIMDS: IMDS Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond @@ -148,7 +148,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { retryDelay = time.Duration(after) * time.Second } } - } + } time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { @@ -183,7 +183,7 @@ func getAccessTokenFromIMDS() (string, int64, error) { responseBytes, err = ioutil.ReadFile(IMDSTokenPathForWindows) if err != nil { Log("getAccessTokenFromIMDS: Could not read IMDS token from file: %s, retryCount: %d", err.Error(), retryCount) - time.Sleep((retryCount + 1) * 100 * time.Millisecond) + time.Sleep(time.Duration((retryCount + 1) * 100) * time.Millisecond) continue } break @@ -253,7 +253,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan defer resp.Body.Close() } Log("getAgentConfiguration Response Status: %d", resp.StatusCode) - if ShouldRetry(resp.StatusCode) { + if IsRetriableError(resp.StatusCode) { message := fmt.Sprintf("getAgentConfiguration: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond @@ -264,7 +264,7 @@ func getAgentConfiguration(imdsAccessToken string) (configurationId string, chan retryDelay = time.Duration(after) * time.Second } } - } + } time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { @@ -349,7 +349,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann req.Header.Add("Authorization", bearer) var resp *http.Response = nil - IsSuccess : = false + IsSuccess := false for retryCount := 0; retryCount < MaxRetries; retryCount++ { // Call managed services for Azure resources token endpoint resp, err = HTTPClient.Do(req) @@ -366,8 +366,8 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann } Log("getIngestionAuthToken Response Status: %d", resp.StatusCode) - if ShouldRetry(resp.StatusCode) { - message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) + if IsRetriableError(resp.StatusCode) { + message := fmt.Sprintf("getIngestionAuthToken: Request failed with an error code: %d, retryCount: %d", resp.StatusCode, retryCount) Log(message) retryDelay := time.Duration((retryCount + 1) * 100) * time.Millisecond if resp.StatusCode == 429 { @@ -377,7 +377,7 @@ func getIngestionAuthToken(imdsAccessToken string, configurationId string, chann retryDelay = time.Duration(after) * time.Second } } - } + } time.Sleep(retryDelay) continue } else if resp.StatusCode != 200 { @@ -505,7 +505,7 @@ func refreshIngestionAuthToken() { } } -func ShouldRetry(httpStatusCode int) bool { +func IsRetriableError(httpStatusCode int) bool { retryableStatusCodes := [5]int{408, 429, 502, 503, 504} for _, code := range retryableStatusCodes { if code == httpStatusCode { From d4ab0720473d707271fd73353b4a73ee3ae38455 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 24 Jun 2021 13:59:30 -0700 Subject: [PATCH 24/28] update with mdsd official build --- kubernetes/linux/setup.sh | 4 ++-- source/plugins/ruby/in_cadvisor_perf.rb | 24 ++++++++++++------------ source/plugins/utils/extension.rb | 22 +++++++++++----------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kubernetes/linux/setup.sh b/kubernetes/linux/setup.sh index c5dba587a..933c14aed 100644 --- a/kubernetes/linux/setup.sh +++ b/kubernetes/linux/setup.sh @@ -9,8 +9,8 @@ sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \ dpkg-reconfigure --frontend=noninteractive locales && \ update-locale LANG=en_US.UTF-8 -#install oneagent - Official bits (05/17/2021) -wget https://github.com/microsoft/Docker-Provider/raw/gangams/ci-aad-auth-msi/oneagent-dev/azure-mdsd_1.10.3-build.develop.239_x86_64.deb +#install oneagent - Official bits (06/24/2021) +wget https://github.com/microsoft/Docker-Provider/releases/download/06242021-oneagent/azure-mdsd_1.10.3-build.master.241_x86_64.deb /usr/bin/dpkg -i $TMPDIR/azure-mdsd*.deb cp -f $TMPDIR/mdsd.xml /etc/mdsd.d diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index da42e4560..70bf19631 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -62,24 +62,24 @@ def enumerate() batchTime = currentTime.utc.iso8601 @@istestvar = ENV["ISTEST"] begin - eventStream = Fluent::MultiEventStream.new + eventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new metricData = CAdvisorMetricsAPIClient.getMetrics(winNode: nil, metricTime: batchTime ) - metricData.each do |record| - eventStream.add(time, record) if record - end - + metricData.each do |record| + eventStream.add(time, record) if record + end + if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end + end if !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") - $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") + $log.info("in_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsmetricstag} @ #{Time.now.utc.iso8601}") + end router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(@mdmtag, eventStream) if eventStream router.emit_stream(@containerhealthtag, eventStream) if eventStream @@ -148,6 +148,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # CAdvisor_Perf_Input end # module diff --git a/source/plugins/utils/extension.rb b/source/plugins/utils/extension.rb index 1b1844149..78236fe15 100644 --- a/source/plugins/utils/extension.rb +++ b/source/plugins/utils/extension.rb @@ -12,10 +12,10 @@ class Extension def initialize @cache = {} - @cache_lock = Mutex.new + @cache_lock = Mutex.new $log.info("Extension::initialize complete") end - + def get_output_stream_id(datatypeId) @cache_lock.synchronize { if @cache.has_key?(datatypeId) @@ -27,7 +27,7 @@ def get_output_stream_id(datatypeId) } end - private + private def get_config() extConfig = Hash.new $log.info("Extension::get_config start ...") @@ -35,14 +35,14 @@ def get_config() clientSocket = UNIXSocket.open(Constants::ONEAGENT_FLUENT_SOCKET_NAME) requestId = SecureRandom.uuid.to_s requestBodyJSON = { "Request" => "AgentTaggedData", "RequestId" => requestId, "Tag" => Constants::CI_EXTENSION_NAME, "Version" => Constants::CI_EXTENSION_VERSION }.to_json - $log.info("sending request with request body: #{requestBodyJSON}") + $log.info("Extension::get_config::sending request with request body: #{requestBodyJSON}") requestBodyMsgPack = requestBodyJSON.to_msgpack clientSocket.write(requestBodyMsgPack) clientSocket.flush $log.info("reading the response from fluent socket: #{Constants::ONEAGENT_FLUENT_SOCKET_NAME}") resp = clientSocket.recv(Constants::CI_EXTENSION_CONFIG_MAX_BYTES) if !resp.nil? && !resp.empty? - $log.info("successfully read the extension config from fluentsocket and number of bytes read is #{resp.length}") + $log.info("Extension::get_config::successfully read the extension config from fluentsocket and number of bytes read is #{resp.length}") respJSON = JSON.parse(resp) taggedData = respJSON["TaggedData"] if !taggedData.nil? && !taggedData.empty? @@ -51,18 +51,18 @@ def get_config() if !extensionConfigurations.nil? && !extensionConfigurations.empty? extensionConfigurations.each do |extensionConfig| outputStreams = extensionConfig["outputStreams"] - if !outputStreams.nil? && !outputStreams.empty? + if !outputStreams.nil? && !outputStreams.empty? outputStreams.each do |datatypeId, streamId| - $log.info("datatypeId: #{datatypeId}, streamId: #{streamId}") + $log.info("Extension::get_config datatypeId:#{datatypeId}, streamId: #{streamId}") extConfig[datatypeId] = streamId end else - $log.info("received outputStreams is either nil or empty") - end + $log.warn("Extension::get_config::received outputStreams is either nil or empty") + end end else - $log.info("received extensionConfigurations from fluentsocket is either nil or empty") - end + $log.warn("Extension::get_config::received extensionConfigurations from fluentsocket is either nil or empty") + end end end rescue => errorStr From 9a2a94d2136fe1baeb9eb0f265b298ed18b4f73a Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 28 Jun 2021 18:56:07 -0700 Subject: [PATCH 25/28] skip cert gen & renewal incase of aad msi auth --- .../in_heartbeat_request.rb | 20 +++++++++++-------- kubernetes/windows/main.ps1 | 9 +++++++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb b/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb index e255c4a71..e525d8681 100644 --- a/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb +++ b/build/windows/installer/scripts/rubyKeepCertificateAlive/in_heartbeat_request.rb @@ -36,14 +36,18 @@ def start def enumerate begin - puts "Calling certificate renewal code..." - maintenance = OMS::OnboardingHelper.new( - ENV["WSID"], - ENV["DOMAIN"], - ENV["CI_AGENT_GUID"] - ) - ret_code = maintenance.register_certs() - puts "Return code from register certs : #{ret_code}" + if !ENV["AAD_MSI_AUTH_MODE"].nil? && !ENV["AAD_MSI_AUTH_MODE"].empty? && ENV["AAD_MSI_AUTH_MODE"].downcase == "true" + puts "skipping certificate renewal code since AAD MSI auth configured" + else + puts "Calling certificate renewal code..." + maintenance = OMS::OnboardingHelper.new( + ENV["WSID"], + ENV["DOMAIN"], + ENV["CI_AGENT_GUID"] + ) + ret_code = maintenance.register_certs() + puts "Return code from register certs : #{ret_code}" + end rescue => errorStr puts "in_heartbeat_request::enumerate:Failed in enumerate: #{errorStr}" # STDOUT telemetry should alredy be going to Traces in AI. diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index fa7b2470a..4cb8e5fb0 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -565,8 +565,13 @@ if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` Bootstrap-CACertificates } -Generate-Certificates -Test-CertificatePath +$isAADMSIAuth = [System.Environment]::GetEnvironmentVariable("USING_AAD_MSI_AUTH") +if (![string]::IsNullOrEmpty($isAADMSIAuth) -and $isAADMSIAuth.ToLower() -eq 'true') { + Write-Host "skipping agent onboarding via cert since AAD MSI Auth configured" +} else { + Generate-Certificates + Test-CertificatePath +} Start-Fluent-Telegraf # List all powershell processes running. This should have main.ps1 and filesystemwatcher.ps1 From 02374ff3205efb668de7081c4452477e14d2b4c6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 8 Jul 2021 14:18:19 -0700 Subject: [PATCH 26/28] add nil check --- .../ruby/filter_health_model_builder.rb | 28 ++++---- source/plugins/ruby/in_cadvisor_perf.rb | 4 +- source/plugins/ruby/in_containerinventory.rb | 42 ++++++------ source/plugins/ruby/in_kube_events.rb | 24 +++---- source/plugins/ruby/in_kube_nodes.rb | 64 +++++++++---------- source/plugins/ruby/in_kube_podinventory.rb | 64 +++++++++---------- source/plugins/ruby/in_kube_pvinventory.rb | 26 ++++---- .../plugins/ruby/in_kubestate_deployments.rb | 22 +++---- source/plugins/ruby/in_kubestate_hpa.rb | 18 +++--- source/plugins/ruby/in_win_cadvisor_perf.rb | 16 ++--- 10 files changed, 154 insertions(+), 154 deletions(-) diff --git a/source/plugins/ruby/filter_health_model_builder.rb b/source/plugins/ruby/filter_health_model_builder.rb index bd42260e2..9decda881 100644 --- a/source/plugins/ruby/filter_health_model_builder.rb +++ b/source/plugins/ruby/filter_health_model_builder.rb @@ -4,12 +4,12 @@ require 'fluent/plugin/filter' -module Fluent::Plugin +module Fluent::Plugin require_relative 'extension_utils' require 'logger' require 'yajl/json_gem' Dir[File.join(__dir__, './health', '*.rb')].each { |file| require file } - + class FilterHealthModelBuilder < Filter include HealthModel @@ -23,7 +23,7 @@ class FilterHealthModelBuilder < Filter attr_reader :buffer, :model_builder, :health_model_definition, :monitor_factory, :state_finalizers, :monitor_set, :model_builder, :hierarchy_builder, :resources, :kube_api_down_handler, :provider, :reducer, :state, :generator, :telemetry - + @@cluster_id = KubernetesApiClient.getClusterId @@token_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" @@cert_file_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" @@ -57,7 +57,7 @@ def initialize deserialized_state_info = @cluster_health_state.get_state @state.initialize_state(deserialized_state_info) end - + rescue => e ApplicationInsightsUtility.sendExceptionTelemetry(e, {"FeatureArea" => "Health"}) end @@ -91,14 +91,14 @@ def filter_stream(tag, es) end begin new_es = Fluent::MultiEventStream.new - time = Time.now + time = Time.now if ExtensionUtils.isAADMSIAuthMode() - $log.info("filter_health_model_builder::enumerate: AAD AUTH MSI MODE") - if !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("filter_health_model_builder::enumerate: AAD AUTH MSI MODE") + if @rewrite_tag.nil? || !@rewrite_tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @rewrite_tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_HEALTH_DATA_TYPE) - end - $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("filter_health_model_builder::filter_stream: using tag -#{@rewrite_tag} @ #{Time.now.utc.iso8601}") + end if tag.start_with?("kubehealth.DaemonSet.Node") node_records = [] @@ -230,7 +230,7 @@ def filter_stream(tag, es) @log.info "after optimizing health signals all_monitors.size #{all_monitors.size}" - + # for each key in monitor.keys, # get the state from health_monitor_state # generate the record to send @@ -253,7 +253,7 @@ def filter_stream(tag, es) @cluster_new_state = new_state end end - end + end new_es.add(emit_time, record) } @@ -269,7 +269,7 @@ def filter_stream(tag, es) @telemetry.send # return an empty event stream, else the match will throw a NoMethodError return Fluent::MultiEventStream.new - elsif tag.start_with?(@rewrite_tag) + elsif tag.start_with?(@rewrite_tag) # this filter also acts as a pass through as we are rewriting the tag and emitting to the fluent stream es else @@ -281,6 +281,6 @@ def filter_stream(tag, es) @log.warn "Message: #{e.message} Backtrace: #{e.backtrace}" return nil end - end + end end end diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index 70bf19631..862e88e44 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -71,10 +71,10 @@ def enumerate() if ExtensionUtils.isAADMSIAuthMode() $log.info("in_cadvisor_perf::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) end - if !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + if @insightsmetricstag.nil? || !@insightsmetricstag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsmetricstag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) end $log.info("in_cadvisor_perf::enumerate: using perf tag -#{@tag} @ #{Time.now.utc.iso8601}") diff --git a/source/plugins/ruby/in_containerinventory.rb b/source/plugins/ruby/in_containerinventory.rb index 6de1b0bb8..9fcb7ab90 100644 --- a/source/plugins/ruby/in_containerinventory.rb +++ b/source/plugins/ruby/in_containerinventory.rb @@ -7,18 +7,18 @@ module Fluent::Plugin class Container_Inventory_Input < Input Fluent::Plugin.register_input("containerinventory", self) - @@PluginName = "ContainerInventory" + @@PluginName = "ContainerInventory" def initialize super require "yajl/json_gem" - require "time" + require "time" require_relative "ContainerInventoryState" require_relative "ApplicationInsightsUtility" require_relative "omslog" require_relative "CAdvisorMetricsAPIClient" - require_relative "kubernetes_container_inventory" - require_relative "extension_utils" + require_relative "kubernetes_container_inventory" + require_relative "extension_utils" end config_param :run_interval, :time, :default => 60 @@ -48,28 +48,28 @@ def shutdown @thread.join super # This super must be at the end of shutdown method end - end - + end + def enumerate - currentTime = Time.now + currentTime = Time.now batchTime = currentTime.utc.iso8601 emitTime = Fluent::Engine.now containerInventory = Array.new eventStream = Fluent::MultiEventStream.new hostName = "" - $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") + $log.info("in_container_inventory::enumerate : Begin processing @ #{Time.now.utc.iso8601}") if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_container_inventory::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - $log.info("in_container_inventory::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_container_inventory::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end begin containerRuntimeEnv = ENV["CONTAINER_RUNTIME"] $log.info("in_container_inventory::enumerate : container runtime : #{containerRuntimeEnv}") clusterCollectEnvironmentVar = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] - $log.info("in_container_inventory::enumerate : using cadvisor apis") + $log.info("in_container_inventory::enumerate : using cadvisor apis") containerIds = Array.new response = CAdvisorMetricsAPIClient.getPodsFromCAdvisor(winNode: nil) if !response.nil? && !response.body.nil? @@ -84,10 +84,10 @@ def enumerate end containerIds.push containerRecord["InstanceID"] containerInventory.push containerRecord - end + end end - end - end + end + end # Update the state for deleted containers deletedContainers = ContainerInventoryState.getDeletedContainers(containerIds) if !deletedContainers.nil? && !deletedContainers.empty? @@ -95,13 +95,13 @@ def enumerate container = ContainerInventoryState.readContainerState(deletedContainer) if !container.nil? container.each { |k, v| container[k] = v } - container["State"] = "Deleted" + container["State"] = "Deleted" KubernetesContainerInventory.deleteCGroupCacheEntryForDeletedContainer(container["InstanceID"]) containerInventory.push container end end - end - containerInventory.each do |record| + end + containerInventory.each do |record| eventStream.add(emitTime, record) if record end router.emit_stream(@tag, eventStream) if eventStream @@ -156,6 +156,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Container_Inventory_Input end # module diff --git a/source/plugins/ruby/in_kube_events.rb b/source/plugins/ruby/in_kube_events.rb index f2d35be7b..deeae6e14 100644 --- a/source/plugins/ruby/in_kube_events.rb +++ b/source/plugins/ruby/in_kube_events.rb @@ -3,7 +3,7 @@ require 'fluent/plugin/input' -module Fluent::Plugin +module Fluent::Plugin class Kube_Event_Input < Input Fluent::Plugin.register_input("kube_events", self) @@KubeEventsStateFile = "/var/opt/microsoft/docker-cimprov/state/KubeEventQueryState.yaml" @@ -38,7 +38,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["EVENTS_CHUNK_SIZE"].nil? && !ENV["EVENTS_CHUNK_SIZE"].empty? && ENV["EVENTS_CHUNK_SIZE"].to_i > 0 @@ -85,15 +85,15 @@ def enumerate batchTime = currentTime.utc.iso8601 eventQueryState = getEventQueryState newEventQueryState = [] - @eventsCount = 0 - + @eventsCount = 0 + if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_events::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kube_events::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_EVENTS_DATA_TYPE) - end - $log.info("in_kube_events::enumerate: using kubeevents tag -#{@tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_kube_events::enumerate: using kubeevents tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_events::enumerate : Getting events from Kube API @ #{Time.now.utc.iso8601}") @@ -139,8 +139,8 @@ def enumerate end # end enumerate def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now @@istestvar = ENV["ISTEST"] begin eventStream = Fluent::MultiEventStream.new @@ -174,7 +174,7 @@ def parse_and_emit_records(events, eventQueryState, newEventQueryState, batchTim record["Count"] = items["count"] record["Computer"] = nodeName record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterId"] = KubernetesApiClient.getClusterId eventStream.add(emitTime, record) if record @eventsCount += 1 end diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 564d060b1..bc62756a1 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -6,12 +6,12 @@ module Fluent::Plugin class Kube_nodeInventory_Input < Input Fluent::Plugin.register_input("kube_nodes", self) - + @@configMapMountPath = "/etc/config/settings/log-data-collection-settings" @@promConfigMountPath = "/etc/config/settings/prometheus-data-collection-settings" @@osmConfigMountPath = "/etc/config/osm-settings/osm-metric-collection-configuration" @@AzStackCloudFileName = "/etc/kubernetes/host/azurestackcloud.json" - + @@rsPromInterval = ENV["TELEMETRY_RS_PROM_INTERVAL"] @@rsPromFieldPassCount = ENV["TELEMETRY_RS_PROM_FIELDPASS_LENGTH"] @@ -38,9 +38,9 @@ def initialize require_relative "omslog" require_relative "extension_utils" - @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" - @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" + @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @MDMKubeNodeInventoryTag = "mdm.kubenodeinventory" @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" # refer tomlparser-agent-config for the defaults @@ -61,7 +61,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["NODES_CHUNK_SIZE"].nil? && !ENV["NODES_CHUNK_SIZE"].empty? && ENV["NODES_CHUNK_SIZE"].to_i > 0 @@ -110,27 +110,27 @@ def enumerate @nodesAPIE2ELatencyMs = 0 @nodeInventoryE2EProcessingLatencyMs = 0 - nodeInventoryStartTime = (Time.now.to_f * 1000).to_i - + nodeInventoryStartTime = (Time.now.to_f * 1000).to_i + if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") - if !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kube_nodes::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @ContainerNodeInventoryTag.nil? || !@ContainerNodeInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @ContainerNodeInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_NODE_INVENTORY_DATA_TYPE) - end - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_NODE_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_kube_nodes::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using containernodeinventory tag -#{@ContainerNodeInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_nodes::enumerate: using kubenodeinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end nodesAPIChunkStartTime = (Time.now.to_f * 1000).to_i # Initializing continuation token to nil @@ -181,19 +181,19 @@ def enumerate def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) begin - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now telemetrySent = false eventStream = Fluent::MultiEventStream.new containerNodeInventoryEventStream = Fluent::MultiEventStream.new insightsMetricsEventStream = Fluent::MultiEventStream.new - kubePerfEventStream = Fluent::MultiEventStream.new + kubePerfEventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] #get node inventory nodeInventory["items"].each do |item| # node inventory nodeInventoryRecord = getNodeInventoryRecord(item, batchTime) - eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord + eventStream.add(emitTime, nodeInventoryRecord) if nodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && eventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_node::parse_and_emit_records: number of node inventory records emitted #{@NODES_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") router.emit_stream(@tag, eventStream) if eventStream @@ -206,7 +206,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) end # container node inventory - containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) + containerNodeInventoryRecord = getContainerNodeInventoryRecord(item, batchTime) containerNodeInventoryEventStream.add(emitTime, containerNodeInventoryRecord) if containerNodeInventoryRecord if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && containerNodeInventoryEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -255,7 +255,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) @NodeCache.mem.set_capacity(nodeMetricRecord["Host"], metricVal) end end - nodeMetricRecords.each do |metricRecord| + nodeMetricRecords.each do |metricRecord| kubePerfEventStream.add(emitTime, metricRecord) if metricRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && kubePerfEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -285,7 +285,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if !insightsMetricsRecord.nil? && !insightsMetricsRecord.empty? nodeGPUInsightsMetricsRecords.push(insightsMetricsRecord) end - nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| + nodeGPUInsightsMetricsRecords.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end if @NODES_EMIT_STREAM_BATCH_SIZE > 0 && insightsMetricsEventStream.count >= @NODES_EMIT_STREAM_BATCH_SIZE @@ -355,7 +355,7 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0) $log.info("kubeNodeInventoryEmitStreamSuccess @ #{Time.now.utc.iso8601}") end - eventStream = nil + eventStream = nil end if containerNodeInventoryEventStream.count > 0 $log.info("in_kube_node::parse_and_emit_records: number of container node inventory records emitted #{containerNodeInventoryEventStream.count} @ #{Time.now.utc.iso8601}") @@ -527,7 +527,7 @@ def getNodeTelemetryProps(item) $log.warn "in_kube_nodes::getContainerNodeIngetNodeTelemetryPropsventoryRecord:Failed: #{errorStr}" end return properties - end + end end # Kube_Node_Input class NodeStatsCache # inner class for caching implementation (CPU and memory caching is handled the exact same way, so logic to do so is moved to a private inner class) @@ -598,5 +598,5 @@ def cpu() def mem() return @@memCache end - end + end end # module diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index f3a2718e7..3f5f4f1cc 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -11,7 +11,7 @@ class Kube_PodInventory_Input < Input @@MDMKubePodInventoryTag = "mdm.kubepodinventory" @@hostName = (OMS::Common.get_hostname) - + def initialize super @@ -40,12 +40,12 @@ def initialize @winContainerCount = 0 @controllerData = {} @podInventoryE2EProcessingLatencyMs = 0 - @podsAPIE2ELatencyMs = 0 - + @podsAPIE2ELatencyMs = 0 + @kubeperfTag = "oneagent.containerInsights.LINUX_PERF_BLOB" @kubeservicesTag = "oneagent.containerInsights.KUBE_SERVICES_BLOB" @containerInventoryTag = "oneagent.containerInsights.CONTAINER_INVENTORY_BLOB" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -56,7 +56,7 @@ def configure(conf) @inventoryToMdmConvertor = Inventory2MdmConvertor.new() end - def start + def start if @run_interval super if !ENV["PODS_CHUNK_SIZE"].nil? && !ENV["PODS_CHUNK_SIZE"].empty? && ENV["PODS_CHUNK_SIZE"].to_i > 0 @@ -108,30 +108,30 @@ def enumerate(podList = nil) batchTime = currentTime.utc.iso8601 serviceRecords = [] @podInventoryE2EProcessingLatencyMs = 0 - podInventoryStartTime = (Time.now.to_f * 1000).to_i + podInventoryStartTime = (Time.now.to_f * 1000).to_i if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") - if !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kube_podinventory::enumerate: AAD AUTH MSI MODE") + if @kubeperfTag.nil? || !@kubeperfTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeperfTag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @kubeservicesTag.nil? || !@kubeservicesTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @kubeservicesTag = ExtensionUtils.getOutputStreamId(Constants::KUBE_SERVICES_DATA_TYPE) - end - if !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @containerInventoryTag.nil? || !@containerInventoryTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @containerInventoryTag = ExtensionUtils.getOutputStreamId(Constants::CONTAINER_INVENTORY_DATA_TYPE) - end - if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_POD_INVENTORY_DATA_TYPE) - end - $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_kube_podinventory::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubeservices tag -#{@kubeservicesTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using containerinventory tag -#{@containerInventoryTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + $log.info("in_kube_podinventory::enumerate: using kubepodinventory tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Get services first so that we dont need to make a call for very chunk $log.info("in_kube_podinventory::enumerate : Getting services from Kube API @ #{Time.now.utc.iso8601}") @@ -221,8 +221,8 @@ def enumerate(podList = nil) end def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now #batchTime = currentTime.utc.iso8601 eventStream = Fluent::MultiEventStream.new containerInventoryStream = Fluent::MultiEventStream.new @@ -238,8 +238,8 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc podInventoryRecords = getPodInventoryRecords(item, serviceRecords, batchTime) podInventoryRecords.each do |record| if !record.nil? - eventStream.add(emitTime, record) if record - @inventoryToMdmConvertor.process_pod_inventory_record(record) + eventStream.add(emitTime, record) if record + @inventoryToMdmConvertor.process_pod_inventory_record(record) end end # Setting this flag to true so that we can send ContainerInventory records for containers @@ -256,7 +256,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc # Send container inventory records for containers on windows nodes @winContainerCount += containerInventoryRecords.length containerInventoryRecords.each do |cirecord| - if !cirecord.nil? + if !cirecord.nil? containerInventoryStream.add(emitTime, cirecord) if cirecord end end @@ -279,7 +279,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "cpu", "cpuLimitNanoCores", batchTime)) containerMetricDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimits(item, "limits", "memory", "memoryLimitBytes", batchTime)) - containerMetricDataItems.each do |record| + containerMetricDataItems.each do |record| kubePerfEventStream.add(emitTime, record) if record end @@ -298,7 +298,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "nvidia.com/gpu", "containerGpuLimits", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "requests", "amd.com/gpu", "containerGpuRequests", batchTime)) containerGPUInsightsMetricsDataItems.concat(KubernetesApiClient.getContainerResourceRequestsAndLimitsAsInsightsMetrics(item, "limits", "amd.com/gpu", "containerGpuLimits", batchTime)) - containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| + containerGPUInsightsMetricsDataItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(emitTime, insightsMetricsRecord) if insightsMetricsRecord end @@ -365,7 +365,7 @@ def parse_and_emit_records(podInventory, serviceRecords, continuationToken, batc if !kubeServiceRecord.nil? # adding before emit to reduce memory foot print kubeServiceRecord["ClusterId"] = KubernetesApiClient.getClusterId - kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName + kubeServiceRecord["ClusterName"] = KubernetesApiClient.getClusterName kubeServicesEventStream.add(emitTime, kubeServiceRecord) if kubeServiceRecord if @PODS_EMIT_STREAM_BATCH_SIZE > 0 && kubeServicesEventStream.count >= @PODS_EMIT_STREAM_BATCH_SIZE $log.info("in_kube_podinventory::parse_and_emit_records: number of service records emitted #{@PODS_EMIT_STREAM_BATCH_SIZE} @ #{Time.now.utc.iso8601}") @@ -672,6 +672,6 @@ def getServiceNameFromLabels(namespace, labels, serviceRecords) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return serviceName - end + end end # Kube_Pod_Input end # module diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 6a95ccd4f..fccfd459d 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -34,7 +34,7 @@ def configure(conf) super end - def start + def start if @run_interval super @finished = false @@ -62,13 +62,13 @@ def enumerate telemetryFlush = false @pvTypeToCountHash = {} currentTime = Time.now - batchTime = currentTime.utc.iso8601 + batchTime = currentTime.utc.iso8601 if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kube_pvinventory::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kube_pvinventory::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::KUBE_PV_INVENTORY_DATA_TYPE) - end - end + end + end continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") @@ -100,7 +100,7 @@ def enumerate if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) telemetryFlush = true end - + # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} @@ -117,8 +117,8 @@ def enumerate end # end enumerate def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) - currentTime = Time.now - emitTime = Fluent::Engine.now + currentTime = Time.now + emitTime = Fluent::Engine.now eventStream = Fluent::MultiEventStream.new @@istestvar = ENV["ISTEST"] begin @@ -159,8 +159,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end records.each do |record| - if !record.nil? - eventStream.add(emitTime, record) + if !record.nil? + eventStream.add(emitTime, record) end end @@ -198,7 +198,7 @@ def getTypeInfo(item) begin if !item["spec"].nil? (Constants::PV_TYPES).each do |pvType| - + # PV is this type if !item["spec"][pvType].nil? @@ -259,6 +259,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end # Kube_PVInventory_Input end # module diff --git a/source/plugins/ruby/in_kubestate_deployments.rb b/source/plugins/ruby/in_kubestate_deployments.rb index dd875862e..0b563a890 100644 --- a/source/plugins/ruby/in_kubestate_deployments.rb +++ b/source/plugins/ruby/in_kubestate_deployments.rb @@ -45,7 +45,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["DEPLOYMENTS_CHUNK_SIZE"].nil? && !ENV["DEPLOYMENTS_CHUNK_SIZE"].empty? && ENV["DEPLOYMENTS_CHUNK_SIZE"].to_i > 0 @@ -56,11 +56,11 @@ def start @DEPLOYMENTS_CHUNK_SIZE = 500 end $log.info("in_kubestate_deployments::start : DEPLOYMENTS_CHUNK_SIZE @ #{@DEPLOYMENTS_CHUNK_SIZE}") - + @finished = false @condition = ConditionVariable.new @mutex = Mutex.new - @thread = Thread.new(&method(:run_periodic)) + @thread = Thread.new(&method(:run_periodic)) end end @@ -82,14 +82,14 @@ def enumerate batchTime = currentTime.utc.iso8601 #set the running total for this batch to 0 - @deploymentsRunningTotal = 0 - + @deploymentsRunningTotal = 0 + if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kubestate_deployments::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kubestate_deployments::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - end + end + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_deployments::enumerate : Getting deployments from Kube API @ #{Time.now.utc.iso8601}") @@ -193,7 +193,7 @@ def parse_and_emit_records(deployments, batchTime = Time.utc.iso8601) end time = Fluent::Engine.now - metricItems.each do |insightsMetricsRecord| + metricItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end @@ -240,6 +240,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_kubestate_hpa.rb b/source/plugins/ruby/in_kubestate_hpa.rb index b1e112f45..178f7944f 100644 --- a/source/plugins/ruby/in_kubestate_hpa.rb +++ b/source/plugins/ruby/in_kubestate_hpa.rb @@ -42,7 +42,7 @@ def configure(conf) super end - def start + def start if @run_interval super if !ENV["HPA_CHUNK_SIZE"].nil? && !ENV["HPA_CHUNK_SIZE"].empty? && ENV["HPA_CHUNK_SIZE"].to_i > 0 @@ -79,14 +79,14 @@ def enumerate batchTime = currentTime.utc.iso8601 @hpaCount = 0 - + if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_kubestate_hpa::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_kubestate_hpa::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) - end - $log.info("in_kubestate_hpa::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") - end + end + $log.info("in_kubestate_hpa::enumerate: using tag -#{@tag} @ #{Time.now.utc.iso8601}") + end # Initializing continuation token to nil continuationToken = nil $log.info("in_kubestate_hpa::enumerate : Getting HPAs from Kube API @ #{Time.now.utc.iso8601}") @@ -194,7 +194,7 @@ def parse_and_emit_records(hpas, batchTime = Time.utc.iso8601) end time = Fluent::Engine.now - metricItems.each do |insightsMetricsRecord| + metricItems.each do |insightsMetricsRecord| insightsMetricsEventStream.add(time, insightsMetricsRecord) if insightsMetricsRecord end @@ -239,6 +239,6 @@ def run_periodic @mutex.lock end @mutex.unlock - end + end end end diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 77e84bc2b..dd462fdf2 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -21,7 +21,7 @@ def initialize require_relative "omslog" require_relative "constants" require_relative "extension_utils" - @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" + @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" end config_param :run_interval, :time, :default => 60 @@ -60,16 +60,16 @@ def enumerate() timeDifferenceInMinutes = timeDifference / 60 @@istestvar = ENV["ISTEST"] if ExtensionUtils.isAADMSIAuthMode() - $log.info("in_win_cadvisor_perf::enumerate: AAD AUTH MSI MODE") - if !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + $log.info("in_win_cadvisor_perf::enumerate: AAD AUTH MSI MODE") + if @tag.nil? || !@tag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @tag = ExtensionUtils.getOutputStreamId(Constants::PERF_DATA_TYPE) - end - if !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) + end + if @insightsMetricsTag.nil? || !@insightsMetricsTag.start_with?(Constants::EXTENSION_OUTPUT_STREAM_ID_TAG_PREFIX) @insightsMetricsTag = ExtensionUtils.getOutputStreamId(Constants::INSIGHTS_METRICS_DATA_TYPE) end - $log.info("in_win_cadvisor_perf::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") - $log.info("in_win_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") - end + $log.info("in_win_cadvisor_perf::enumerate: using perf tag -#{@kubeperfTag} @ #{Time.now.utc.iso8601}") + $log.info("in_win_cadvisor_perf::enumerate: using insightsmetrics tag -#{@insightsMetricsTag} @ #{Time.now.utc.iso8601}") + end #Resetting this cache so that it is populated with the current set of containers with every call CAdvisorMetricsAPIClient.resetWinContainerIdCache() From 09ba05bbe2951774a3186f74c2583b103d3b35cb Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Wed, 14 Jul 2021 16:39:13 -0700 Subject: [PATCH 27/28] cherry windows agent nodeip issue --- kubernetes/windows/Dockerfile | 2 +- kubernetes/windows/main.ps1 | 49 ++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/kubernetes/windows/Dockerfile b/kubernetes/windows/Dockerfile index 94be59644..0ba64cd75 100644 --- a/kubernetes/windows/Dockerfile +++ b/kubernetes/windows/Dockerfile @@ -3,7 +3,7 @@ MAINTAINER OMSContainers@microsoft.com LABEL vendor=Microsoft\ Corp \ com.microsoft.product="Azure Monitor for containers" -ARG IMAGE_TAG=win-ciprod06112021 +ARG IMAGE_TAG=win-ciprod06112021-2 # Do not split this into multiple RUN! # Docker creates a layer for every RUN-Statement diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index 4cb8e5fb0..80cc99638 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -172,7 +172,7 @@ function Set-EnvironmentVariables { if ($aiKeyURl) { $aiKeyFetched = "" # retry up to 5 times - for( $i = 1; $i -le 4; $i++) { + for ( $i = 1; $i -le 4; $i++) { try { $response = Invoke-WebRequest -uri $aiKeyURl -UseBasicParsing -TimeoutSec 5 -ErrorAction:Stop @@ -276,6 +276,23 @@ function Set-EnvironmentVariables { [System.Environment]::SetEnvironmentVariable("USE_IMDS_TOKEN_PROXY_END_POINT", $useIMDSTokenProxyEndpoint, "Machine") Write-Host "Successfully set environment variable USE_IMDS_TOKEN_PROXY_END_POINT - $($useIMDSTokenProxyEndpoint) for target 'machine'..." } + $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") + if (![string]::IsNullOrEmpty($nodeIp)) { + [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") + Write-Host "Successfully set environment variable NODE_IP - $($nodeIp) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" + } + + $agentVersion = [System.Environment]::GetEnvironmentVariable("AGENT_VERSION", "process") + if (![string]::IsNullOrEmpty($agentVersion)) { + [System.Environment]::SetEnvironmentVariable("AGENT_VERSION", $agentVersion, "machine") + Write-Host "Successfully set environment variable AGENT_VERSION - $($agentVersion) for target 'machine'..." + } + else { + Write-Host "Failed to set environment variable AGENT_VERSION for target 'machine' since it is either null or empty" + } # run config parser ruby /opt/omsagentwindows/scripts/ruby/tomlparser.rb @@ -412,13 +429,12 @@ function Start-Fluent-Telegraf { if (![string]::IsNullOrEmpty($containerRuntime) -and [string]$containerRuntime.StartsWith('docker') -eq $false) { # change parser from docker to cri if the container runtime is not docker Write-Host "changing parser from Docker to CRI since container runtime : $($containerRuntime) and which is non-docker" - (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf','fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf + (Get-Content -Path C:/etc/fluent/fluent.conf -Raw) -replace 'fluent-docker-parser.conf', 'fluent-cri-parser.conf' | Set-Content C:/etc/fluent/fluent.conf } # Start telegraf only in sidecar scraping mode $sidecarScrapingEnabled = [System.Environment]::GetEnvironmentVariable('SIDECAR_SCRAPING_ENABLED') - if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') - { + if (![string]::IsNullOrEmpty($sidecarScrapingEnabled) -and $sidecarScrapingEnabled.ToLower() -eq 'true') { Write-Host "Starting telegraf..." Start-Telegraf } @@ -458,6 +474,7 @@ function Start-Telegraf { else { Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" } +<<<<<<< HEAD $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") if (![string]::IsNullOrEmpty($nodeIp)) { @@ -468,6 +485,9 @@ function Start-Telegraf { Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" } +======= + +>>>>>>> 6df299f9... Cherry picking hotfix changes to ci_dev (#605) Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf" @@ -480,14 +500,15 @@ function Start-Telegraf { sc.exe \\$serverName config telegraf start= delayed-auto Write-Host "Successfully set delayed start for telegraf" - } else { + } + else { Write-Host "Failed to get environment variable PODNAME to set delayed telegraf start" } } catch { - $e = $_.Exception - Write-Host $e - Write-Host "exception occured in delayed telegraf start.. continuing without exiting" + $e = $_.Exception + Write-Host $e + Write-Host "exception occured in delayed telegraf start.. continuing without exiting" } Write-Host "Running telegraf service in test mode" C:\opt\telegraf\telegraf.exe --config "C:\etc\telegraf\telegraf.conf" --test @@ -496,8 +517,7 @@ function Start-Telegraf { # Trying to start telegraf again if it did not start due to fluent bit not being ready at startup Get-Service telegraf | findstr Running - if ($? -eq $false) - { + if ($? -eq $false) { Write-Host "trying to start telegraf in again in 30 seconds, since fluentbit might not have been ready..." Start-Sleep -s 30 C:\opt\telegraf\telegraf.exe --service start @@ -536,7 +556,7 @@ function Bootstrap-CACertificates { $certMountPath = "C:\ca" Get-ChildItem $certMountPath | Foreach-Object { - $absolutePath=$_.FullName + $absolutePath = $_.FullName Write-Host "cert path: $($absolutePath)" Import-Certificate -FilePath $absolutePath -CertStoreLocation 'Cert:\LocalMachine\Root' -Verbose } @@ -558,10 +578,9 @@ Start-FileSystemWatcher $aksResourceId = [System.Environment]::GetEnvironmentVariable("AKS_RESOURCE_ID") $requiresCertBootstrap = [System.Environment]::GetEnvironmentVariable("REQUIRES_CERT_BOOTSTRAP") if (![string]::IsNullOrEmpty($requiresCertBootstrap) -and ` - $requiresCertBootstrap.ToLower() -eq 'true' -and ` - ![string]::IsNullOrEmpty($aksResourceId) -and ` - $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) -{ + $requiresCertBootstrap.ToLower() -eq 'true' -and ` + ![string]::IsNullOrEmpty($aksResourceId) -and ` + $aksResourceId.ToLower().Contains("/microsoft.containerservice/managedclusters/")) { Bootstrap-CACertificates } From a26b841763e3834b98b3da25dee065de499ae3f0 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Thu, 15 Jul 2021 18:15:11 -0700 Subject: [PATCH 28/28] fix merge issue --- kubernetes/windows/main.ps1 | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kubernetes/windows/main.ps1 b/kubernetes/windows/main.ps1 index c28abfe97..3cbc11e20 100644 --- a/kubernetes/windows/main.ps1 +++ b/kubernetes/windows/main.ps1 @@ -474,9 +474,6 @@ function Start-Telegraf { else { Write-Host "Failed to set environment variable KUBERNETES_SERVICE_PORT for target 'machine' since it is either null or empty" } -<<<<<<< HEAD -<<<<<<< HEAD - $nodeIp = [System.Environment]::GetEnvironmentVariable("NODE_IP", "process") if (![string]::IsNullOrEmpty($nodeIp)) { [System.Environment]::SetEnvironmentVariable("NODE_IP", $nodeIp, "machine") @@ -486,12 +483,6 @@ function Start-Telegraf { Write-Host "Failed to set environment variable NODE_IP for target 'machine' since it is either null or empty" } -======= - ->>>>>>> 6df299f9... Cherry picking hotfix changes to ci_dev (#605) -======= - ->>>>>>> ci_dev Write-Host "Installing telegraf service" C:\opt\telegraf\telegraf.exe --service install --config "C:\etc\telegraf\telegraf.conf"