diff --git a/ReleaseNotes.md b/ReleaseNotes.md index fb992f09c..39eeb6a50 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -11,6 +11,13 @@ additional questions or comments. Note : The agent version(s) below has dates (ciprod), which indicate the agent build dates (not release dates) +### 06/27/2022 - +##### Version microsoft/oms:ciprod06272022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022 (linux) +##### Code change log +- Fixes for following bugs in ciprod06142022 which are caught in AKS Canary region deployment + - Fix the exceptions related to file write & read access of the MDM inventory state file + - Fix for missing Node GPU allocatable & capacity metrics for the clusters which are whitelisted for AKS LargeCluster Private Preview feature + ### 6/14/2022 - ##### Version microsoft/oms:ciprod06142022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022 (linux) ##### Version microsoft/oms:win-ciprod06142022 Version mcr.microsoft.com/azuremonitor/containerinsights/ciprod:win-ciprod06142022 (windows) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 5b3837748..c506f849d 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -30,6 +30,33 @@ keepalive true + #InsightsMetrics + + @type forward + @id out_insights_metrics_fwd + @log_level debug + send_timeout 30 + connect_timeout 30 + heartbeat_type none + + host 0.0.0.0 + port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" + + + @type file + path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer + overflow_action drop_oldest_chunk + chunk_limit_size 4m + queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" + flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" + retry_max_times 10 + retry_wait 5s + retry_max_interval 5m + flush_thread_count 5 + + keepalive true + + #custom_metrics_mdm filter plugin for perf data from windows nodes @type cadvisor2mdm @@ -340,33 +367,6 @@ keepalive true - #InsightsMetrics - #kubestate - - @type forward - @log_level debug - send_timeout 30 - connect_timeout 30 - heartbeat_type none - - host 0.0.0.0 - port "#{ENV['MDSD_FLUENT_SOCKET_PORT']}" - - - @type file - path /var/opt/microsoft/docker-cimprov/state/insightsmetrics*.buffer - overflow_action drop_oldest_chunk - chunk_limit_size 4m - queue_limit_length "#{ENV['FLUENTD_QUEUE_LIMIT_LENGTH']}" - flush_interval "#{ENV['FLUENTD_FLUSH_INTERVAL']}" - retry_max_times 10 - retry_wait 5s - retry_max_interval 5m - flush_thread_count 5 - - keepalive true - - @type mdm @id out_mdm_perf diff --git a/build/version b/build/version index dcbea0179..f2021864e 100644 --- a/build/version +++ b/build/version @@ -4,9 +4,9 @@ CONTAINER_BUILDVERSION_MAJOR=18 CONTAINER_BUILDVERSION_MINOR=0 -CONTAINER_BUILDVERSION_PATCH=0 +CONTAINER_BUILDVERSION_PATCH=1 CONTAINER_BUILDVERSION_BUILDNR=0 -CONTAINER_BUILDVERSION_DATE=20220614 +CONTAINER_BUILDVERSION_DATE=20220627 CONTAINER_BUILDVERSION_STATUS=Developer_Build #-------------------------------- End of File ----------------------------------- diff --git a/charts/azuremonitor-containers/values.yaml b/charts/azuremonitor-containers/values.yaml index 64f48212b..d528115cf 100644 --- a/charts/azuremonitor-containers/values.yaml +++ b/charts/azuremonitor-containers/values.yaml @@ -22,10 +22,10 @@ Azure: omsagent: image: repo: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod" - tag: "ciprod06142022" + tag: "ciprod06272022" tagWindows: "win-ciprod06142022" pullPolicy: IfNotPresent - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" agentVersion: "azure-mdsd-1.17.0" winAgentVersion: "0.0.0-0" # there is no base agent version for windows agent diff --git a/kubernetes/linux/Dockerfile b/kubernetes/linux/Dockerfile index af1cab3d9..b9927be7c 100644 --- a/kubernetes/linux/Dockerfile +++ b/kubernetes/linux/Dockerfile @@ -17,7 +17,7 @@ ENV RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR 0.9 RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl curl sudo python-ctypes init-system-helpers net-tools rsyslog cron vim dmidecode apt-transport-https gnupg && rm -rf /var/lib/apt/lists/* COPY setup.sh main.sh defaultpromenvvariables defaultpromenvvariables-rs defaultpromenvvariables-sidecar mdsd.xml envmdsd logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod06142022 +ARG IMAGE_TAG=ciprod06272022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/linux/Dockerfile.multiarch b/kubernetes/linux/Dockerfile.multiarch index ad177d8f0..c96a93802 100644 --- a/kubernetes/linux/Dockerfile.multiarch +++ b/kubernetes/linux/Dockerfile.multiarch @@ -29,7 +29,7 @@ RUN /usr/bin/apt-get update && /usr/bin/apt-get install -y libc-bin wget openssl COPY --from=builder /src/kubernetes/linux/Linux_ULINUX_1.0_*_64_Release/docker-cimprov-*.*.*-*.*.sh $tmpdir/ COPY kubernetes/linux/setup.sh kubernetes/linux/main.sh kubernetes/linux/defaultpromenvvariables kubernetes/linux/defaultpromenvvariables-rs kubernetes/linux/defaultpromenvvariables-sidecar kubernetes/linux/mdsd.xml kubernetes/linux/envmdsd kubernetes/linux/logrotate.conf $tmpdir/ -ARG IMAGE_TAG=ciprod06142022 +ARG IMAGE_TAG=ciprod06272022 ENV AGENT_VERSION ${IMAGE_TAG} WORKDIR ${tmpdir} diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index bb83f6faf..88d2fdda8 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -334,7 +334,7 @@ spec: tier: node annotations: agentVersion: "azure-mdsd-1.17.0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -379,7 +379,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -468,7 +468,7 @@ spec: timeoutSeconds: 15 #Only in sidecar scraping mode - name: omsagent-prometheus - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -612,7 +612,7 @@ spec: rsName: "omsagent-rs" annotations: agentVersion: "azure-mdsd-1.17.0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent @@ -653,7 +653,7 @@ spec: # - NET_ADMIN # - NET_RAW - name: omsagent - image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06142022" + image: "mcr.microsoft.com/azuremonitor/containerinsights/ciprod:ciprod06272022" imagePullPolicy: IfNotPresent resources: limits: @@ -821,7 +821,7 @@ spec: tier: node-win annotations: agentVersion: "0.0.0-0" - dockerProviderVersion: "18.0.0-0" + dockerProviderVersion: "18.0.1-0" schema-versions: "v1" spec: serviceAccountName: omsagent diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index a3cbb5a85..368eb61d4 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -756,24 +756,25 @@ def writeNodeAllocatableRecords(nodeAllocatbleRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "w") + f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i + File.truncate(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, 0) f.write(nodeAllocatbleRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs}" + $log.info "in_kube_nodes::writeNodeAllocatableRecords:Successfull and with time taken(ms): #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write" + raise "in_kube_nodes::writeNodeAllocatableRecords:Failed to open file for write @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount < maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? + sleep (initialRetryDelaySecs * (maxRetryCount - retryAttemptCount)) retryAttemptCount = retryAttemptCount + 1 - sleep (initialRetryDelaySecs * retryAttemptCount) retry end $log.warn "in_kube_nodes::writeNodeAllocatableRecords failed with an error: #{err} after retries: #{maxRetryCount} @ #{Time.now.utc.iso8601}" diff --git a/source/plugins/ruby/in_kube_perfinventory.rb b/source/plugins/ruby/in_kube_perfinventory.rb index ad8fdbf21..20589167b 100644 --- a/source/plugins/ruby/in_kube_perfinventory.rb +++ b/source/plugins/ruby/in_kube_perfinventory.rb @@ -405,13 +405,13 @@ def getNodeAllocatableRecords() f = File.open(Constants::NODE_ALLOCATABLE_RECORDS_STATE_FILE, "r") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i nodeAllocatableRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_perfinventory:getNodeAllocatableRecords:Number of Node Allocatable records: #{nodeAllocatableRecords.length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read" + raise "in_kube_perfinventory:getNodeAllocatableRecords:Failed to open file for read @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount < maxRetryCount diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bdbc465ec..37c9741c3 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -1168,23 +1168,24 @@ def writeMDMRecords(mdmRecordsJson) initialRetryDelaySecs = 0.5 retryAttemptCount = 1 begin - f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "w") + f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, File::RDWR | File::CREAT, 0644) if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podinventory:writeMDMRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i + File.truncate(Constants::MDM_POD_INVENTORY_STATE_FILE, 0) f.write(mdmRecordsJson) f.flush timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) - $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs}" + $log.info "in_kube_podinventory:writeMDMRecords:Successfull and with time taken(ms): #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write" + raise "in_kube_podinventory:writeMDMRecords:Failed to open file for write @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount <= maxRetryCount f.flock(File::LOCK_UN) if !f.nil? f.close if !f.nil? - sleep (initialRetryDelaySecs * retryAttemptCount) + sleep (initialRetryDelaySecs * (maxRetryCount - retryAttemptCount)) retryAttemptCount = retryAttemptCount + 1 retry end diff --git a/source/plugins/ruby/in_kube_podmdminventory.rb b/source/plugins/ruby/in_kube_podmdminventory.rb index bfc5227f3..b872650d2 100644 --- a/source/plugins/ruby/in_kube_podmdminventory.rb +++ b/source/plugins/ruby/in_kube_podmdminventory.rb @@ -185,17 +185,17 @@ def getMDMRecords() f = File.open(Constants::MDM_POD_INVENTORY_STATE_FILE, "r") if !f.nil? isAcquiredLock = f.flock(File::LOCK_EX | File::LOCK_NB) - raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock" if !isAcquiredLock + raise "in_kube_podmdminventory:getMDMRecords:Failed to acquire file lock @ #{Time.now.utc.iso8601}" if !isAcquiredLock startTime = (Time.now.to_f * 1000).to_i mdmRecords = Yajl::Parser.parse(f) timetakenMs = ((Time.now.to_f * 1000).to_i - startTime) if mdmRecords.nil? || mdmRecords.empty? || mdmRecords["items"].nil? || mdmRecords["collectionTime"] == @prevCollectionTime - raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale" + raise "in_kube_podmdminventory:getMDMRecords: either read mdmRecords is nil or empty or stale @ #{Time.now.utc.iso8601}" end @prevCollectionTime = mdmRecords["collectionTime"] - $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" + $log.info "in_kube_podmdminventory:getMDMRecords:Number of MDM records: #{mdmRecords["items"].length} with time taken(ms) for read: #{timetakenMs} @ #{Time.now.utc.iso8601}" else - raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read" + raise "in_kube_podmdminventory:getMDMRecords:Failed to open file for read @ #{Time.now.utc.iso8601}" end rescue => err if retryAttemptCount <= maxRetryCount