From 1a5fa9187cb4c12054c34e9875aa72b4e1c19eba Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 26 Jun 2022 09:53:09 -0700 Subject: [PATCH 1/5] fix file access exception --- source/plugins/ruby/in_kube_nodes.rb | 6 +++--- source/plugins/ruby/in_kube_podinventory.rb | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a3cbb5a85..817bd2c04 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -756,7 +756,7 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w") + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, File::RDWR | File::CREAT , 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock @@ -771,9 +771,9 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) rescue => err if retryAttemptCount < maxRetryCount f.flock(File::LOCK_UN) if !f.nil? - f.close if !f.nil? + f.close if !f.nil? 
+ sleep (initialRetryDelaySecs * (maxRetryCount - retryAttemptCount)) retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bdbc465ec..b66fb07b3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1168,7 +1168,7 @@ def writeMDMRecords(mdmRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, File::RDWR | File::CREAT , 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock @@ -1184,7 +1184,7 @@ def writeMDMRecords(mdmRecordsJson) if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? 
- sleep (initialRetryDelaySecs * retryAttemptCount) + sleep (initialRetryDelaySecs * (maxRetryCount - retryAttemptCount)) retryAttemptCount = retryAttemptCount + 1 retry end From 33d1f077df71fdd549c39da7e496c7ff21160e9d Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 26 Jun 2022 14:54:30 -0700 Subject: [PATCH 2/5] move insights metrics conf to common --- build/linux/installer/conf/kube.conf | 54 ++++++++++++++-------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 5b3837748..c506f849d 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -30,6 +30,33 @@ keepalive true + #InsightsMetrics + + @type forward + @id out_insights_metrics_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #custom_metrics_mdm filter plugin for perf data from windows nodes @type cadvisor2mdm @@ -340,33 +367,6 @@ keepalive true - #InsightsMetrics - #kubestate - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" - flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - - @type mdm @id 
out_mdm_perf From 5ef7de5d10602b5cb0377b739bb138f7fe27e3e6 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 26 Jun 2022 17:18:56 -0700 Subject: [PATCH 3/5] clear file content before writing content --- source/plugins/ruby/in_kube_nodes.rb | 5 +++-- source/plugins/ruby/in_kube_podinventory.rb | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 817bd2c04..e99235aac 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -756,11 +756,12 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, File::RDWR | File::CREAT , 0644) + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i + File.truncate(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, 0) f.write(nodeAllocatbleRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) @@ -771,7 +772,7 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) rescue => err if retryAttemptCount < maxRetryCount f.flock(File::LOCK_UN) if !f.nil? - f.close if !f.nil? + f.close if !f.nil? 
sleep (initialRetryDelaySecs * (maxRetryCount - retryAttemptCount)) retryAttemptCount = retryAttemptCount + 1 retry diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index b66fb07b3..4ddd1c335 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1168,11 +1168,12 @@ def writeMDMRecords(mdmRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, File::RDWR | File::CREAT , 0644) + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i + File.truncate(Constants::MDM_POD_INVENTORY_STATE_FILE, 0) f.write(mdmRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) From 9878fbeb5c20f3332c60f23137569c313af8f117 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Sun, 26 Jun 2022 17:32:14 -0700 Subject: [PATCH 4/5] add timestamp to debug logs --- source/plugins/ruby/in_kube_nodes.rb | 6 +++--- source/plugins/ruby/in_kube_perfinventory.rb | 6 +++--- source/plugins/ruby/in_kube_podinventory.rb | 6 +++--- source/plugins/ruby/in_kube_podmdminventory.rb | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index e99235aac..368eb61d4 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -759,15 +759,15 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? 
isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i File.truncate(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, 0) f.write(nodeAllocatbleRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs}" + $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write" + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount < maxRetryCount diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index ad8fdbf21..20589167b 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -405,13 +405,13 @@ def getNodeAllocatableRecords() f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r") if !f.nil? 
isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i nodeAllocatableRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read" + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount < maxRetryCount diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 4ddd1c335..37c9741c3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1171,15 +1171,15 @@ def writeMDMRecords(mdmRecordsJson) f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? 
isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i File.truncate(Constants::MDM_POD_INVENTORY_STATE_FILE, 0) f.write(mdmRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" + $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" + raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount <= maxRetryCount diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index bfc5227f3..b872650d2 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -185,17 +185,17 @@ def getMDMRecords() f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? 
|| mdmRecords["collectionTime"] == @prevCollectionTime - raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale" + raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale @ #{Time.now.utc.iso8601}" end @prevCollectionTime = mdmRecords["collectionTime"] - $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount <= maxRetryCount From ab9571abea9eed9c0aca1ea5e611bff2663ea000 Mon Sep 17 00:00:00 2001 From: Ganga Mahesh Siddem Date: Mon, 27 Jun 2022 07:13:59 -0700 Subject: [PATCH 5/5] release updates for linux agent --- ReleaseNotes.md | 7 +++++++ build/version | 4 ++-- charts/azuremonitor-containers/values.yaml | 4 ++-- kubernetes/linux/Dockerfile | 2 +- kubernetes/linux/Dockerfile.multiarch | 2 +- kubernetes/omsagent.yaml | 12 ++++++------ 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/ReleaseNotes.md b/ReleaseNotes.md index fb992f09c..39eeb6a50 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,13 @@ additional questions or comments. 
Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 06/27/2022 - +##### Version microsoft/oms:ciprod06272022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022 (linux) +##### Code change log +- Fixes for the following bugs in ciprod06142022, which were caught in the AKS Canary region deployment + - Fix the exceptions related to file write & read access of the MDM inventory state file + - Fix for missing Node GPU allocatable & capacity metrics for clusters that are whitelisted for the AKS LargeCluster Private Preview feature + ### 6/14/2022 - ##### Version microsoft/oms:ciprod06142022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022 (linux) ##### Version microsoft/oms:win-ciprod06142022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06142022 (windows) diff --git a/build/version b/build/version index dcbea0179..f2021864e 100644 --- a/build/version +++ b/build/version @@ -4,9 +4,9 @@ CONTAINER_BUILDVERSION_MAJOR=18 CONTAINER_BUILDVERSION_MINOR=0 -CONTAINER_BUILDVERSION_PATCH=0 +CONTAINER_BUILDVERSION_PATCH=1 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20220614 +CONTAINER_BUILDVERSION_DATE=20220627 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 64f48212b..d528115cf 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -22,10 +22,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod06142022" + tag: "ciprod06272022" tagWindows: "win-ciprod06142022" pullPolicy: IfNotPresent - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" agentVersion: "azure-mdsd-1.17.0" winAgentVersion: "0.0.0-0" # there is no base 
agent version for windows agent diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index af1cab3d9..b9927be7c 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod06142022 +ARG IMAGE_TAG=ciprod06272022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index ad177d8f0..c96a93802 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod06142022 +ARG IMAGE_TAG=ciprod06272022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index bb83f6faf..88d2fdda8 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -334,7 +334,7 @@ spec: tier: node annotations: agentVersion: "azure-mdsd-1.17.0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -379,7 +379,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - 
image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -468,7 +468,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -612,7 +612,7 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "azure-mdsd-1.17.0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -653,7 +653,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -821,7 +821,7 @@ spec: tier: node-win annotations: agentVersion: "0.0.0-0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent