From 04826d02910e448937d72aace521113bc7bcf438 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 18 Aug 2020 15:57:45 -0700 Subject: [PATCH 01/57] Add in pv metrics from cadvisor --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 42ecfcaf0..c685ebff9 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -293,6 +293,10 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryTotal", "containerGpumemoryTotalBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) + + metricNamesToCollect = ["availableBytes", "capacityBytes", "usedBytes", "inodes", "inodesUsed", "inodesFree"] + metricNamesToReturn = ["PVAvailableBytes", "PVCapacityBytes", "PVUsedBytes", "PVinodes", "PVinodesUsed", "PVinodesFree"] + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, metricNamesToCollect, metricNamesToReturn, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -303,6 +307,73 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end + def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricNamesToReturn, metricPollTime) + metricItems = [] + clusterId = KubernetesApiClient.getClusterId + clusterName = KubernetesApiClient.getClusterName + begin + metricInfo = metricJSON + metricInfo["pods"].each do |pod| + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] + podNamespace = pod["podRef"]["namespace"] + + containerNames = [] + if (!pod["containers"].nil?) + pod["containers"].each do |container| + containerName = container["name"] + containerNames.push(podUid + "/" + containerName) + + if (!pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + time = volume["time"] + + metricCount = 0 + metricNamesToCollect.each do |metricNameToCollect| + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + metricTags["PVName"] = pvName + metricTags["PVCName"] = pvcName + metricTags["Time"] = time + + metricItem["Tags"] = metricTags + + metricItems.push(metricItem) + + metricCount = metricCount + 1 + end + end + end + end + end + end + end + end + rescue => errorStr + @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + return metricItems + end + return metricItems + end + + def getContainerGpuMetricsAsInsightsMetrics(metricJSON, hostName, metricNameToCollect, metricNametoReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId From a459794f03130ee37a4f5a2db8b330be0e76b471 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 20 Aug 2020 15:14:41 -0700 Subject: [PATCH 02/57] changed to send only pv usage & add kube-system toggle config --- build/common/installer/scripts/tomlparser.rb | 14 +++++ kubernetes/container-azm-ms-agentconfig.yaml | 6 +++ .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 51 ++++++++----------- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 7235ee0c3..51c0d7b13 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -24,6 +24,7 @@ @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false @collectAllKubeEvents = false +@collectPVKubeSystemMetrics = false @containerLogsRoute = "" # Use parser to parse the configmap toml file to a ruby structure @@ -148,6 +149,16 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") end + #Get PV kube-system enrichment setting + begin + if !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? + @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") + end + #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? @@ -199,6 +210,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") + file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") # Close file after writing all environment variables file.close @@ -244,6 +256,8 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) + commands = get_command_windows('export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS', @collectPVKubeSystemMetrics) + file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58e09f041..c4b300e9d 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,6 +42,12 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected + [log_collection_settings.collect_kube_system_pv_metrics] + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected + enabled = false + # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected + prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings [prometheus_data_collection_settings.cluster] diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 77997feca..4efd9092d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -20,6 +20,7 @@ class CAdvisorMetricsAPIClient @clusterEnvVarCollectionEnabled = ENV["AZMON_CLUSTER_COLLECT_ENV_VAR"] @clusterStdErrLogCollectionEnabled = ENV["AZMON_COLLECT_STDERR_LOGS"] @clusterStdOutLogCollectionEnabled = ENV["AZMON_COLLECT_STDOUT_LOGS"] + @pvKubeSystemCollectionMetricsEnabled = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] @clusterLogTailExcludPath = ENV["AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH"] @clusterLogTailPath = ENV["AZMON_LOG_TAIL_PATH"] @clusterAgentSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] @@ -302,9 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricNamesToCollect = ["availableBytes", "capacityBytes", "usedBytes", "inodes", "inodesUsed", "inodesFree"] - metricNamesToReturn = ["PVAvailableBytes", "PVCapacityBytes", "PVUsedBytes", "PVinodes", "PVinodesUsed", "PVinodesFree"] - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, metricNamesToCollect, metricNamesToReturn, metricTime)) + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", "pv_used_bytes", metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -327,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, podNamespace = pod["podRef"]["namespace"] containerNames = [] - if (!pod["containers"].nil?) + if ((!podNamespace == "kube-system" || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) pod["containers"].each do |container| containerName = container["name"] containerNames.push(podUid + "/" + containerName) @@ -341,32 +340,26 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, # A PVC exists on this volume pvcName = pvcRef["name"] pvName = volume["name"] - time = volume["time"] - - metricCount = 0 - metricNamesToCollect.each do |metricNameToCollect| - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNamesToReturn[metricCount] - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGHTSMETRICS_TAGS_GPU_NAMESPACE + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = podNameSpace - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags["PVName"] = pvName - metricTags["PVCName"] = pvcName - metricTags["Time"] = time - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName + metricTags["pvName"] = pvName + metricTags["pvcName"] = pvcName + metricTags["pv_capacity_bytes"] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - - metricCount = metricCount + 1 - end + metricItems.push(metricItem) end end end @@ -375,7 +368,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, end end rescue => errorStr - @Log.warn("getContainerGpuMetricsAsInsightsMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end return metricItems From 0ec8ef9052aec7609fe172fe28cd4bbdf4405b9a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 21 Aug 2020 09:09:56 -0700 Subject: [PATCH 03/57] variable name fixes --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 4efd9092d..55d8ad55d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -314,7 +314,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end - def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricNamesToReturn, metricPollTime) + def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -326,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, podNamespace = pod["podRef"]["namespace"] containerNames = [] - if ((!podNamespace == "kube-system" || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) + if ((!(podNamespace == "kube-system") || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) pod["containers"].each do |container| containerName = container["name"] containerNames.push(podUid + "/" + containerName) @@ -344,7 +344,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNamesToCollect, metricItem = {} metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName - metricItem["Name"] = metricNamesToReturn[metricCount] + metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = podNameSpace From fb8a2147820b2fdb1c2008135c69d02b2c740b1e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 09:42:01 -0700 Subject: [PATCH 04/57] Added kube-system config --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 55d8ad55d..228ff6ad9 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -315,6 +315,14 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + @Log.info("Getting PV metrics") + pvKubeSystem = @pvKubeSystemCollectionMetricsEnabled.nil? ? "pv kube-system nil" : "pv kube-system not nil" + @Log.info(pvKubeSystem) + @Log.info(@pvKubeSystemCollectionMetricsEnabled) + + pvKubeSystemCollectionMetrics = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] + @Log.info(pvKubeSystemCollectionMetrics) + metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -325,11 +333,22 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podName = pod["podRef"]["name"] podNamespace = pod["podRef"]["namespace"] - containerNames = [] - if ((!(podNamespace == "kube-system") || @pvKubeSystemCollectionMetricsEnabled) && !pod["containers"].nil?) + kubeSystemNamespace = false + if (podNamespace.include? "kube-system") + @Log.info("kube-system namespace encountered") + if (pvKubeSystemCollectionMetrics == "true") + kubeSystemNamespace = false + @Log.info("kube-system namespace encountered - include") + else + kubeSystemNamespace = true + @Log.info("kube-system namespace encountered - exclude") + end + end + + + if (!pod["containers"].nil? && !kubeSystemNamespace) pod["containers"].each do |container| containerName = container["name"] - containerNames.push(podUid + "/" + containerName) if (!pod["volume"].nil?) pod["volume"].each do |volume| @@ -347,7 +366,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = podNameSpace + metricItem["Namespace"] = "container.azm.ms/pv" metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId @@ -356,6 +375,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricTags["pvName"] = pvName metricTags["pvcName"] = pvcName metricTags["pv_capacity_bytes"] = volume["capacityBytes"] + metricTags["podNamespace"] = podNamespace metricItem["Tags"] = metricTags From 0b2f9dc9eb9c68a4a1f6f2c3822f098ab882b4da Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 13:47:17 -0700 Subject: [PATCH 05/57] mdm filter --- .../scripts/tomlparser-mdm-metrics-config.rb | 12 ++++++ source/plugins/ruby/MdmMetricsGenerator.rb | 8 ++++ source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 40 ++++++++++++++++++- 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 1c01dd8c6..d2990ca0c 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -12,6 +12,7 @@ @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD +@percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD # Use parser to parse the configmap toml file to a ruby structure def parseConfigMap @@ -66,6 +67,15 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end + #PV + pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + end puts "config::Using config map settings for MDM metric configuration settings for resource utilization" end rescue => errorStr @@ -73,6 +83,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end end end @@ -97,6 +108,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_ALERT_CONTAINER_CPU_THRESHOLD=#{@percentageCpuUsageThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_RSS_THRESHOLD=#{@percentageMemoryRssThreshold}\n") file.write("export AZMON_ALERT_CONTAINER_MEMORY_WORKING_SET_THRESHOLD=\"#{@percentageMemoryWorkingSetThreshold}\"\n") + file.write("export AZMON_ALERT_PV_USAGE_THRESHOLD=#{@percentagePVUsageThreshold}\n") # Close file after writing all MDM setting environment variables file.close puts "****************End MDM Metrics Config Processing********************" diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3d75dc6f4..9c0873602 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -356,6 +356,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + metric_threshold_hash["pvUsage"] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -375,6 +376,13 @@ def getContainerResourceUtilizationThresholds memoryWorkingSetThresholdFloat = (memoryWorkingSetThreshold.to_f).round(2) metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + + pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] + @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" + if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? + pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) + metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = pvUsagePercentageThresholdFloat + end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index dd1ba24b3..91dfc6077 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -59,6 +59,7 @@ class Constants DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 80.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index fd43ef98b..9fb0af5a2 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,pv_used_bytes" @@hostName = (OMS::Common.get_hostname) @@ -51,15 +51,18 @@ def start @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false # initialize cpu and memory limit if @process_incoming_stream @cpu_capacity = 0.0 @memory_capacity = 0.0 + @pv_capacity = 0.0 ensure_cpu_memory_capacity_set @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @containerResourceDimensionHash = {} + @pvUsageHash = {} @@metric_threshold_hash = MdmMetricsGenerator.getContainerResourceUtilizationThresholds end rescue => e @@ -87,6 +90,8 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true + elsif metricName == "pv_used_bytes" + @pvExceededUsageThreshold = true end rescue => errorStr @log.info "Error in setThresholdExceededTelemetry: #{errorStr}" @@ -104,10 +109,12 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] + properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash["pv_used_bytes"] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold + properties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @@ -191,6 +198,37 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check + elsif tag == Constants::INSIGHTSMETRICS_FLUENT_TAG + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + end + end else return [] #end if block for object type check end From 1bad74fb9fb4d3784714e189ebc14c41ce6cb915 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 14:38:12 -0700 Subject: [PATCH 06/57] add pv_used_bytes to mdm filter metrics conf --- build/linux/installer/conf/container.conf | 2 +- build/linux/installer/conf/kube.conf | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 12 +++++++----- source/plugins/ruby/MdmMetricsGenerator.rb | 3 ++- source/plugins/ruby/filter_cadvisor2mdm.rb | 3 ++- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index f02ec0131..8988c24bd 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pv_used_bytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 9ada8425f..50ecb3a6a 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pv_used_bytes log_level info diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index c4b300e9d..df175d700 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -19,7 +19,7 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this configmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = ["kube-system"] + exclude_namespaces = [] [log_collection_settings.stderr] # Default value for enabled is true @@ -28,24 +28,24 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this cofigmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = ["kube-system"] + exclude_namespaces = [] [log_collection_settings.env_var] # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = false + enabled = true # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected - enabled = false + enabled = true # When this is enabled (enabled = true), all kube events including normal events will be collected [log_collection_settings.collect_kube_system_pv_metrics] # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = false + enabled = true # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- @@ -106,6 +106,8 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 + # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage + pv_usage_threshold_percentage = 80.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 9c0873602..3b801e2de 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -377,11 +377,12 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end + #pvUsagePercentageThreshold = 80.0 pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) - metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = pvUsagePercentageThresholdFloat + metric_threshold_hash["pv_used_bytes"] = pvUsagePercentageThresholdFloat end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 9fb0af5a2..35a68ba2e 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,pv_used_bytes" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,'pv_used_bytes'" @@hostName = (OMS::Common.get_hostname) @@ -129,6 +129,7 @@ def flushMetricTelemetry def filter(tag, time, record) begin + @log.info "Tag: #{tag}" if @process_incoming_stream object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] From 7068629b2058f4f4851b212b42d1789ad420b8bd Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 15:18:13 -0700 Subject: [PATCH 07/57] filter fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 7 ++++++- source/plugins/ruby/in_cadvisor_perf.rb | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 35a68ba2e..ce5880b03 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,6 +135,10 @@ def filter(tag, time, record) counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] percentage_metric_value = 0.0 metric_value = record["DataItems"][0]["Collections"][0]["Value"] + data_type = record["DataType"] + ip_name = record["IPName"] + @log.info "Data Type: #{data_type}" + @log.info "IP Name: #{data_type}" if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -199,9 +203,10 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check - elsif tag == Constants::INSIGHTSMETRICS_FLUENT_TAG + elsif data_type == "INSIGHTS_METRICS_BLOB" @log.info "insights metrics in filter_cadvisor2mdm" record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" if dataItem["Name"] == "pv_used_bytes" @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] diff --git a/source/plugins/ruby/in_cadvisor_perf.rb b/source/plugins/ruby/in_cadvisor_perf.rb index a44365e9d..b706ff00a 100644 --- a/source/plugins/ruby/in_cadvisor_perf.rb +++ b/source/plugins/ruby/in_cadvisor_perf.rb @@ -88,6 +88,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("cAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") From 94348cd80ef4ab1607b39ec205a24d52f2eaa11c Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 16:16:59 -0700 Subject: [PATCH 08/57] more filter fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 74 +++++++++++----------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index ce5880b03..6ab723634 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -131,14 +131,48 @@ def filter(tag, time, record) begin @log.info "Tag: #{tag}" if @process_incoming_stream - object_name = record["DataItems"][0]["ObjectName"] - counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] - percentage_metric_value = 0.0 - metric_value = record["DataItems"][0]["Collections"][0]["Value"] data_type = record["DataType"] ip_name = record["IPName"] @log.info "Data Type: #{data_type}" @log.info "IP Name: #{data_type}" + + if data_type == "INSIGHTS_METRICS_BLOB" + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + end + end + + object_name = record["DataItems"][0]["ObjectName"] + counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] + percentage_metric_value = 0.0 + metric_value = record["DataItems"][0]["Collections"][0]["Value"] if object_name == Constants::OBJECT_NAME_K8S_NODE && @metrics_to_collect_hash.key?(counter_name.downcase) # Compute and send % CPU and Memory @@ -203,38 +237,6 @@ def filter(tag, time, record) else return [] end #end if block for percentage metric > configured threshold % check - elsif data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" - @log.info "pv_used_bytes is a data item" - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] - if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity - @log.info "capacity is not 0" - end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] - @log.info "resourceDimensions: #{resourceDimensions}" - - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - end - end else return [] #end if block for object type check end From 58230fd459370af2aec88fa8c726a1b2b3744020 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 24 Aug 2020 16:19:58 -0700 Subject: [PATCH 09/57] end statement fix --- source/plugins/ruby/filter_cadvisor2mdm.rb | 53 +++++++++++----------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 6ab723634..031176126 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,37 +135,38 @@ def filter(tag, time, record) ip_name = record["IPName"] @log.info "Data Type: #{data_type}" @log.info "IP Name: #{data_type}" - + if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" - @log.info "pv_used_bytes is a data item" - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] - if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity - @log.info "capacity is not 0" - end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] - @log.info "resourceDimensions: #{resourceDimensions}" - - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + @log.info "insights metrics in filter_cadvisor2mdm" + record["DataItems"].each do |dataItem| + @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == "pv_used_bytes" + @log.info "pv_used_bytes is a data item" + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"]["pv_capacity_bytes"] + if capacity != 0 + percentage_metric_value = (usage) * 100 / capacity + @log.info "capacity is not 0" + end + @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + resourceDimensions = {} + resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + resourceDimensions[1] = "podName" + resourceDimensions[2] = "controllerName" + resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + @log.info "resourceDimensions: #{resourceDimensions}" + + thresholdPercentage = @@metric_threshold_hash[metricName] + @log.info "thresholdPercentage: #{thresholdPercentage}" + return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, thresholdPercentage) + end end end From f68c04a1d1ad21a633d307d8992d160b849677be Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 09:03:12 -0700 Subject: [PATCH 10/57] log fixes --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 031176126..2633b31ac 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -149,7 +149,7 @@ def filter(tag, time, record) percentage_metric_value = (usage) * 100 / capacity @log.info "capacity is not 0" end - @log.info "percentage_metric_value for metric: #{metricName} for instance: #{instanceName} percentage: #{percentage_metric_value}" + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" resourceDimensions = {} From 46c1b50a6fe31d0d83ee63b5a8e105eedaa3587a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 12:10:12 -0700 Subject: [PATCH 11/57] all pv records to mdm --- source/plugins/ruby/MdmMetricsGenerator.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 3b801e2de..c250cf7c7 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -251,6 +251,7 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } + @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 2633b31ac..c3c2b49be 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -138,6 +138,8 @@ def filter(tag, time, record) if data_type == "INSIGHTS_METRICS_BLOB" @log.info "insights metrics in filter_cadvisor2mdm" + @log.info "#{record["DataItems"]}" + mdmMetrics = [] record["DataItems"].each do |dataItem| @log.info "dataItem: #{dataItem}" if dataItem["Name"] == "pv_used_bytes" @@ -152,21 +154,19 @@ def filter(tag, time, record) @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - resourceDimensions = {} - resourceDimensions[0] = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - resourceDimensions[1] = "podName" - resourceDimensions[2] = "controllerName" - resourceDimensions[3] = dataItem["Tags"]["podNamespace"] + resourceDimensions = [dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME], + "podName", "controllerName", dataItem["Tags"]["podNamespace"]].join("~~") @log.info "resourceDimensions: #{resourceDimensions}" thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - return MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + mdmMetrics.push(MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, - thresholdPercentage) + thresholdPercentage)) end + return mdmMetrics end end From db24b0fe889207610c6ce9eadde90f5363563a5e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 13:04:21 -0700 Subject: [PATCH 12/57] different mdm generator method --- source/plugins/ruby/MdmMetricsGenerator.rb | 27 ++++++++++++++++++++++ source/plugins/ruby/filter_cadvisor2mdm.rb | 9 ++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index c250cf7c7..1555f75b0 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,6 +35,7 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, + "pv_used_bytes" => "pv_usage_percentage" } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @@ -260,6 +261,32 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + records = [] + begin + @log.info "resource dimensions: #{dims}" + # get dimension values + containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] + podNamespace = dims["podNamespace"] + resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { + timestamp: recordTimeStamp, + metricName: @@container_metric_name_metric_percentage_name_hash[metricName], + containerNameDimValue: containerName, + podNameDimValue: "podName", + controllerNameDimValue: "controllerName", + namespaceDimValue: podNamespace, + containerResourceUtilizationPercentage: percentageMetricValue, + thresholdPercentageDimValue: thresholdPercentage, + } + @log.info "resourceUtilRecord: #{resourceUtilRecord}" + records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) + rescue => errorStr + @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + return records + end + def getDiskUsageMetricRecords(record) records = [] usedPercent = nil diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index c3c2b49be..73a280fa7 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -148,19 +148,18 @@ def filter(tag, time, record) usage = dataItem["Value"] capacity = dataItem["Tags"]["pv_capacity_bytes"] if capacity != 0 - percentage_metric_value = (usage) * 100 / capacity + percentage_metric_value = (usage * 100.0) / capacity @log.info "capacity is not 0" end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - resourceDimensions = [dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME], - "podName", "controllerName", dataItem["Tags"]["podNamespace"]].join("~~") - @log.info "resourceDimensions: #{resourceDimensions}" + resourceDimensions = dataItem["Tags"] + @log.info "#{resourceDimensions}" thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - mdmMetrics.push(MdmMetricsGenerator.getContainerResourceUtilMetricRecords(dataItem["CollectionTime"], + mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, From 9d6874fcfe834b1c1fdabe26d011a33d990eec70 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 25 Aug 2020 17:24:38 -0700 Subject: [PATCH 13/57] out_mdm log path --- source/plugins/ruby/out_mdm.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index d801edb9a..bd36662b5 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -51,6 +51,7 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName + @log = Logger.new("/var/opt/microsoft/docker-cimprov/log/out_mdm.log", 1, 5000000) super end From cdf96a023c6e1fe0461bae655b87cf145b2ac5db Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 26 Aug 2020 12:19:34 -0700 Subject: [PATCH 14/57] try to get out_mdm logging path --- source/plugins/ruby/filter_cadvisor2mdm.rb | 3 +++ source/plugins/ruby/out_mdm.rb | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 73a280fa7..097b7df35 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -120,6 +120,7 @@ def flushMetricTelemetry @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @containersExceededMemWorkingSetThreshold = false + @pvExceededUsageThreshold = false end rescue => errorStr @log.info "Error in flushMetricTelemetry: #{errorStr}" @@ -165,6 +166,8 @@ def filter(tag, time, record) resourceDimensions, thresholdPercentage)) end + flushMetricTelemetry + setThresholdExceededTelemetry(metricName) return mdmMetrics end end diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index bd36662b5..f658af612 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -4,6 +4,7 @@ module Fluent class OutputMDM < BufferedOutput config_param :retry_mdm_post_wait_minutes, :integer + config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/out_mdm.log" Plugin.register_output("out_mdm", self) @@ -51,7 +52,7 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName - @log = Logger.new("/var/opt/microsoft/docker-cimprov/log/out_mdm.log", 1, 5000000) + @log = Logger.new(@log_path, 1, 5000000) super end From c902df6155b39c46d2db8a65fa1e482b9b54e7ad Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 26 Aug 2020 16:43:50 -0700 Subject: [PATCH 15/57] pv metric now sending to ME --- source/plugins/ruby/filter_cadvisor2mdm.rb | 4 +++- source/plugins/ruby/out_mdm.rb | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 097b7df35..bf8c53cb0 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -168,7 +168,7 @@ def filter(tag, time, record) end flushMetricTelemetry setThresholdExceededTelemetry(metricName) - return mdmMetrics + return mdmMetrics[0] end end @@ -304,7 +304,9 @@ def filter_stream(tag, es) es.each { |time, record| filtered_records = filter(tag, time, record) + @log.info "filtered records: #{filtered_records}" filtered_records.each { |filtered_record| + @log.info "filtered_record: #{filtered_record}" new_es.add(time, filtered_record) if filtered_record } if filtered_records } diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index f658af612..91563e100 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -14,6 +14,7 @@ def initialize require "net/https" require "uri" require "yajl/json_gem" + require "logger" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "constants" @@ -187,7 +188,7 @@ def write_status_file(success, message) # Convert the event to a raw string. def format(tag, time, record) if record != {} - @log.trace "Buffering #{tag}" + #@log.trace "Buffering #{tag}" return [tag, record].to_msgpack else return "" @@ -236,6 +237,7 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" response = @http_client.request(request) + @log.info "REQUEST RESPONSE: #{response}" response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 0f41269f99f388cc0067408988380fb698502a15 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 09:32:08 -0700 Subject: [PATCH 16/57] add in threshold condition --- source/plugins/ruby/filter_cadvisor2mdm.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index bf8c53cb0..1756b1da4 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -160,11 +160,13 @@ def filter(tag, time, record) thresholdPercentage = @@metric_threshold_hash[metricName] @log.info "thresholdPercentage: #{thresholdPercentage}" - mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + if percentage_metric_value >= thresholdPercentage + mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, thresholdPercentage)) + end end flushMetricTelemetry setThresholdExceededTelemetry(metricName) From d4148cc5097ec18a45afb1f44940c6ccc832484e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 11:49:28 -0700 Subject: [PATCH 17/57] constants and consistent naming --- build/linux/installer/conf/container.conf | 2 +- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 10 +++++----- source/plugins/ruby/MdmMetricsGenerator.rb | 4 ++-- source/plugins/ruby/constants.rb | 6 ++++++ source/plugins/ruby/filter_cadvisor2mdm.rb | 10 +++++----- source/plugins/ruby/in_win_cadvisor_perf.rb | 1 + 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/build/linux/installer/conf/container.conf b/build/linux/installer/conf/container.conf index 8988c24bd..e55c62fbc 100644 --- a/build/linux/installer/conf/container.conf +++ b/build/linux/installer/conf/container.conf @@ -46,7 +46,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pv_used_bytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,memoryRssBytes,pvUsedBytes log_level info diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 50ecb3a6a..ba40b7a35 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -74,7 +74,7 @@ type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pv_used_bytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 228ff6ad9..b06e37e21 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -303,7 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", "pv_used_bytes", metricTime)) + metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -372,10 +372,10 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags["pvName"] = pvName - metricTags["pvcName"] = pvcName - metricTags["pv_capacity_bytes"] = volume["capacityBytes"] - metricTags["podNamespace"] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 1555f75b0..dbcf84772 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,7 +35,7 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, - "pv_used_bytes" => "pv_usage_percentage" + Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC } # Setting this to true since we need to send zero filled metrics at startup. If metrics are absent alert creation fails @@ -410,7 +410,7 @@ def getContainerResourceUtilizationThresholds @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) - metric_threshold_hash["pv_used_bytes"] = pvUsagePercentageThresholdFloat + metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat end rescue => errorStr @log.info "Error in getContainerResourceUtilizationThresholds: #{errorStr}" diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 91dfc6077..3295c8823 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,10 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" + INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" + INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" #Kubestate (common) @@ -45,6 +49,7 @@ class Constants MDM_CONTAINER_CPU_UTILIZATION_METRIC = "cpuExceededPercentage" MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC = "memoryRssExceededPercentage" MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC = "memoryWorkingSetExceededPercentage" + MDM_PV_UTILIZATION_METRIC = "pvUsageExceededPercentage" MDM_NODE_CPU_USAGE_PERCENTAGE = "cpuUsagePercentage" MDM_NODE_MEMORY_RSS_PERCENTAGE = "memoryRssPercentage" MDM_NODE_MEMORY_WORKING_SET_PERCENTAGE = "memoryWorkingSetPercentage" @@ -56,6 +61,7 @@ class Constants CPU_USAGE_MILLI_CORES = "cpuUsageMillicores" MEMORY_WORKING_SET_BYTES= "memoryWorkingSetBytes" MEMORY_RSS_BYTES = "memoryRssBytes" + PV_USED_BYTES = "pvUsedBytes" DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 1756b1da4..865f8bce6 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -16,7 +16,7 @@ class CAdvisor2MdmFilter < Filter config_param :enable_log, :integer, :default => 0 config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/filter_cadvisor2mdm.log" config_param :custom_metrics_azure_regions, :string - config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,'pv_used_bytes'" + config_param :metrics_to_collect, :string, :default => "Constants::CPU_USAGE_NANO_CORES,Constants::MEMORY_WORKING_SET_BYTES,Constants::MEMORY_RSS_BYTES,Constants::PV_USED_BYTES" @@hostName = (OMS::Common.get_hostname) @@ -90,7 +90,7 @@ def setThresholdExceededTelemetry(metricName) @containersExceededMemRssThreshold = true elsif metricName == Constants::MEMORY_WORKING_SET_BYTES @containersExceededMemWorkingSetThreshold = true - elsif metricName == "pv_used_bytes" + elsif metricName == Constants::PV_USED_BYTES @pvExceededUsageThreshold = true end rescue => errorStr @@ -109,7 +109,7 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] - properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash["pv_used_bytes"] + properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold @@ -143,11 +143,11 @@ def filter(tag, time, record) mdmMetrics = [] record["DataItems"].each do |dataItem| @log.info "dataItem: #{dataItem}" - if dataItem["Name"] == "pv_used_bytes" + if dataItem["Name"] == Constants::PV_USED_BYTES @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] usage = dataItem["Value"] - capacity = dataItem["Tags"]["pv_capacity_bytes"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity @log.info "capacity is not 0" diff --git a/source/plugins/ruby/in_win_cadvisor_perf.rb b/source/plugins/ruby/in_win_cadvisor_perf.rb index 38868f2f5..4e90195e5 100644 --- a/source/plugins/ruby/in_win_cadvisor_perf.rb +++ b/source/plugins/ruby/in_win_cadvisor_perf.rb @@ -101,6 +101,7 @@ def enumerate() end router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, insightsMetricsEventStream) if insightsMetricsEventStream + router.emit_stream(@mdmtag, insightsMetricsEventStream) if insightsMetricsEventStream if (!@@istestvar.nil? && !@@istestvar.empty? && @@istestvar.casecmp("true") == 0 && insightsMetricsEventStream.count > 0) $log.info("winCAdvisorInsightsMetricsEmitStreamSuccess @ #{Time.now.utc.iso8601}") end From 1d6cee6e44bdd5d9b0bca3f823230af5cfa4c858 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 27 Aug 2020 12:34:36 -0700 Subject: [PATCH 18/57] comments and code cleanup --- .../scripts/tomlparser-mdm-metrics-config.rb | 2 +- kubernetes/container-azm-ms-agentconfig.yaml | 10 +++--- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 23 ++++--------- source/plugins/ruby/MdmMetricsGenerator.rb | 9 +++-- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 33 ++++++++----------- 6 files changed, 30 insertions(+), 48 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index d2990ca0c..5a90b4b04 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -67,7 +67,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #PV + #Persistent Volume & Persistent Volume Claim pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index df175d700..58c9cdcd1 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -19,7 +19,7 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stdout' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this configmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = [] + exclude_namespaces = ["kube-system"] [log_collection_settings.stderr] # Default value for enabled is true @@ -28,24 +28,24 @@ data: # kube-system log collection is disabled by default in the absence of 'log_collection_settings.stderr' setting. If you want to enable kube-system, remove it from the following setting. # If you want to continue to disable kube-system log collection keep this namespace in the following setting and add any other namespace you want to disable log collection to the array. # In the absense of this cofigmap, default value for exclude_namespaces = ["kube-system"] - exclude_namespaces = [] + exclude_namespaces = ["kube-system"] [log_collection_settings.env_var] # In the absense of this configmap, default value for enabled is true enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = true + enabled = ffalse # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false # When the setting is set to false, only the kube events with !normal event type will be collected - enabled = true + enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected [log_collection_settings.collect_kube_system_pv_metrics] # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = true + enabled = false # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index b06e37e21..b30a79ff8 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -315,14 +315,6 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) - @Log.info("Getting PV metrics") - pvKubeSystem = @pvKubeSystemCollectionMetricsEnabled.nil? ? "pv kube-system nil" : "pv kube-system not nil" - @Log.info(pvKubeSystem) - @Log.info(@pvKubeSystemCollectionMetricsEnabled) - - pvKubeSystemCollectionMetrics = ENV["AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS"] - @Log.info(pvKubeSystemCollectionMetrics) - metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -333,20 +325,17 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podName = pod["podRef"]["name"] podNamespace = pod["podRef"]["namespace"] - kubeSystemNamespace = false + excludeNamespace = false if (podNamespace.include? "kube-system") - @Log.info("kube-system namespace encountered") - if (pvKubeSystemCollectionMetrics == "true") - kubeSystemNamespace = false - @Log.info("kube-system namespace encountered - include") + if (@pvKubeSystemCollectionMetricsEnabled == "true") + excludeNamespace = false else - kubeSystemNamespace = true - @Log.info("kube-system namespace encountered - exclude") + excludeNamespace = true end end - if (!pod["containers"].nil? && !kubeSystemNamespace) + if (!pod["containers"].nil? && !excludeNamespace) pod["containers"].each do |container| containerName = container["name"] @@ -366,7 +355,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["Name"] = metricNameToReturn metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = "container.azm.ms/pv" + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index dbcf84772..e9953d0c1 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -264,10 +264,10 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) records = [] begin - @log.info "resource dimensions: #{dims}" - # get dimension values containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] - podNamespace = dims["podNamespace"] + podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] + + # Will need a different MDM Template resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { timestamp: recordTimeStamp, metricName: @@container_metric_name_metric_percentage_name_hash[metricName], @@ -384,7 +384,7 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD metric_threshold_hash[Constants::MEMORY_RSS_BYTES] = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD - metric_threshold_hash["pvUsage"] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD + metric_threshold_hash[Constants::PV_USED_BYTES] = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD cpuThreshold = ENV["AZMON_ALERT_CONTAINER_CPU_THRESHOLD"] if !cpuThreshold.nil? && !cpuThreshold.empty? @@ -405,7 +405,6 @@ def getContainerResourceUtilizationThresholds metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] = memoryWorkingSetThresholdFloat end - #pvUsagePercentageThreshold = 80.0 pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 3295c8823..183cbc415 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,6 +13,7 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 865f8bce6..700e90a9e 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -133,46 +133,39 @@ def filter(tag, time, record) @log.info "Tag: #{tag}" if @process_incoming_stream data_type = record["DataType"] - ip_name = record["IPName"] - @log.info "Data Type: #{data_type}" - @log.info "IP Name: #{data_type}" if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "insights metrics in filter_cadvisor2mdm" - @log.info "#{record["DataItems"]}" mdmMetrics = [] record["DataItems"].each do |dataItem| - @log.info "dataItem: #{dataItem}" + if dataItem["Name"] == Constants::PV_USED_BYTES - @log.info "pv_used_bytes is a data item" metricName = dataItem["Name"] usage = dataItem["Value"] capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity - @log.info "capacity is not 0" end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" resourceDimensions = dataItem["Tags"] - @log.info "#{resourceDimensions}" - thresholdPercentage = @@metric_threshold_hash[metricName] - @log.info "thresholdPercentage: #{thresholdPercentage}" + + flushMetricTelemetry if percentage_metric_value >= thresholdPercentage - mdmMetrics.push(MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, percentage_metric_value, resourceDimensions, - thresholdPercentage)) - end - end - flushMetricTelemetry - setThresholdExceededTelemetry(metricName) - return mdmMetrics[0] - end - end + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + end # end if block for insights metrics check object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] From 357914ad7b1101cd4e42f6ab6babe9341a1b276d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 12:03:45 -0700 Subject: [PATCH 19/57] remove container name, add pod name/uid --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 63 ++++++++--------- source/plugins/ruby/MdmAlertTemplates.rb | 34 ++++++++++ source/plugins/ruby/MdmMetricsGenerator.rb | 13 ++-- source/plugins/ruby/constants.rb | 2 + source/plugins/ruby/filter_cadvisor2mdm.rb | 68 ++++++++++--------- 6 files changed, 111 insertions(+), 71 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 58c9cdcd1..083263baf 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -107,7 +107,7 @@ data: # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage - pv_usage_threshold_percentage = 80.0 + pv_usage_threshold_percentage = 0.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index b30a79ff8..2c973ad2d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -335,41 +335,38 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m end - if (!pod["containers"].nil? && !excludeNamespace) - pod["containers"].each do |container| - containerName = container["name"] - - if (!pod["volume"].nil?) - pod["volume"].each do |volume| - if (!volume["pvcRef"].nil?) - pvcRef = volume["pvcRef"] - if (!pvcRef["name"].nil?) - - # A PVC exists on this volume - pvcName = pvcRef["name"] - pvName = volume["name"] - - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + if (!excludeNamespace) + if (!pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] = podUid + "/" + containerName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - end + metricItems.push(metricItem) end end end diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index 2e516a99d..d55435c1e 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -90,6 +90,40 @@ class MdmAlertTemplates } }' + PV_resource_utilization_template = ' + { + "time": "%{timestamp}", + "data": { + "baseData": { + "metric": "%{metricName}", + "namespace": "insights.container/persistentvolume", + "dimNames": [ + "podUID", + "podName", + "computerName", + "Kubernetes namespace", + "thresholdPercentage" + ], + "series": [ + { + "dimValues": [ + "%{podUidDimValue}", + "%{podNameDimValue}", + "%{computerNameDimValue}", + "%{namespaceDimValue}", + "%{thresholdPercentageDimValue}" + ], + "min": %{containerResourceUtilizationPercentage}, + "max": %{containerResourceUtilizationPercentage}, + "sum": %{containerResourceUtilizationPercentage}, + "count": 1 + } + ] + } + } + }' + + Node_resource_metrics_template = ' { "time": "%{timestamp}", diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index e9953d0c1..d09a52bab 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -261,19 +261,20 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag return records end - def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, percentageMetricValue, dims, thresholdPercentage) + def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percentageMetricValue, dims, thresholdPercentage) records = [] begin containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] + podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] + podUid = dims[INSIGHTSMETRICS_TAGS_POD_UID] - # Will need a different MDM Template - resourceUtilRecord = MdmAlertTemplates::Container_resource_utilization_template % { + resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, metricName: @@container_metric_name_metric_percentage_name_hash[metricName], - containerNameDimValue: containerName, - podNameDimValue: "podName", - controllerNameDimValue: "controllerName", + podUidDimValue: podUid, + podNameDimValue: podName, + nodeNameDimValue: computer, namespaceDimValue: podNamespace, containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 183cbc415..493f098c6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -13,9 +13,11 @@ class Constants INSIGHTSMETRICS_TAGS_K8SNAMESPACE = "k8sNamespace" INSIGHTSMETRICS_TAGS_CONTROLLER_NAME = "controllerName" INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" + INSIGHTSMETRICS_TAGS_POD_UID = "podUID" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_POD_NAME = "podName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 700e90a9e..592fd5da7 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,37 +135,8 @@ def filter(tag, time, record) data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" - mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] - - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], - metricName, - percentage_metric_value, - resourceDimensions, - thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items - return [] - end # end if block for insights metrics check + return filterPVInsightsMetrics(record) + end object_name = record["DataItems"][0]["ObjectName"] counter_name = record["DataItems"][0]["Collections"][0]["CounterName"] @@ -248,6 +219,41 @@ def filter(tag, time, record) end end + def filterPVInsightsMetrics(record) + mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] + + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + metricName, + computer, + percentage_metric_value, + resourceDimensions, + thresholdPercentage) + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + end + def ensure_cpu_memory_capacity_set if @cpu_capacity != 0.0 && @memory_capacity != 0.0 @log.info "CPU And Memory Capacity are already set" From 9377262b03d8607f0d270ba05cb3569c640fb75d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 12:59:59 -0700 Subject: [PATCH 20/57] log fixes and constnat change --- source/plugins/ruby/MdmMetricsGenerator.rb | 4 ++-- source/plugins/ruby/constants.rb | 2 +- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index d09a52bab..b5086a744 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -267,7 +267,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen containerName = dims[Constants::INSIGHTSMETRICS_TAGS_CONTAINER_NAME] podNamespace = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] podName = dims[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] - podUid = dims[INSIGHTSMETRICS_TAGS_POD_UID] + podUid = dims[Constants::INSIGHTSMETRICS_TAGS_POD_UID] resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, @@ -282,7 +282,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr - @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" + @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end return records diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 493f098c6..299b1c248 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -68,7 +68,7 @@ class Constants DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_RSS_THRESHOLD = 95.0 DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD = 95.0 - DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 80.0 + DEFAULT_MDM_PV_UTILIZATION_THRESHOLD = 60.0 CONTROLLER_KIND_JOB = "job" CONTAINER_TERMINATION_REASON_COMPLETED = "completed" CONTAINER_STATE_TERMINATED = "terminated" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 592fd5da7..7353d9050 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -135,6 +135,8 @@ def filter(tag, time, record) data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" + @log.info "Insights Metrics" + @log.info "record: #{record}" return filterPVInsightsMetrics(record) end From ee14b2bdfd9fdab9d9b00362a5c53f8d4b5a8e3f Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 13:49:06 -0700 Subject: [PATCH 21/57] naming fix --- source/plugins/ruby/MdmMetricsGenerator.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index b5086a744..329d91813 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -274,7 +274,7 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen metricName: @@container_metric_name_metric_percentage_name_hash[metricName], podUidDimValue: podUid, podNameDimValue: podName, - nodeNameDimValue: computer, + computerNameDimValue: computer, namespaceDimValue: podNamespace, containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, From c1d46e8a7a9818b00b9b08b77aa1b25f0588a129 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 14:55:58 -0700 Subject: [PATCH 22/57] cleanup --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 9 ++------- source/plugins/ruby/MdmMetricsGenerator.rb | 3 --- source/plugins/ruby/filter_cadvisor2mdm.rb | 8 ++------ source/plugins/ruby/out_mdm.rb | 6 +----- 5 files changed, 6 insertions(+), 22 deletions(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 083263baf..fe80539d4 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -35,7 +35,7 @@ data: enabled = true [log_collection_settings.enrich_container_logs] # In the absense of this configmap, default value for enrich_container_logs is false - enabled = ffalse + enabled = false # When this is enabled (enabled = true), every container log entry (both stdout & stderr) will be enriched with container Name & container Image [log_collection_settings.collect_all_kube_events] # In the absense of this configmap, default value for collect_all_kube_events is false diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 2c973ad2d..8d65c16ea 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -326,15 +326,10 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podNamespace = pod["podRef"]["namespace"] excludeNamespace = false - if (podNamespace.include? "kube-system") - if (@pvKubeSystemCollectionMetricsEnabled == "true") - excludeNamespace = false - else - excludeNamespace = true - end + if (podNamespace.include? "kube-system" && @pvKubeSystemCollectionMetricsEnabled == "false") + excludeNamespace = true end - if (!excludeNamespace) if (!pod["volume"].nil?) pod["volume"].each do |volume| diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index 329d91813..e22660c71 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -252,7 +252,6 @@ def getContainerResourceUtilMetricRecords(recordTimeStamp, metricName, percentag containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } - @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getContainerResourceUtilMetricRecords: #{errorStr}" @@ -279,7 +278,6 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen containerResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } - @log.info "resourceUtilRecord: #{resourceUtilRecord}" records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) rescue => errorStr @log.info "Error in getPVResourceUtilMetricRecords: #{errorStr}" @@ -407,7 +405,6 @@ def getContainerResourceUtilizationThresholds end pvUsagePercentageThreshold = ENV["AZMON_ALERT_PV_USAGE_THRESHOLD"] - @log.info "pvUsagePercentageThreshold: #{pvUsagePercentageThreshold}" if !pvUsagePercentageThreshold.nil? && !pvUsagePercentageThreshold.empty? pvUsagePercentageThresholdFloat = (pvUsagePercentageThreshold.to_f).round(2) metric_threshold_hash[Constants::PV_USED_BYTES] = pvUsagePercentageThresholdFloat diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 7353d9050..33291452c 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -130,13 +130,11 @@ def flushMetricTelemetry def filter(tag, time, record) begin - @log.info "Tag: #{tag}" if @process_incoming_stream - data_type = record["DataType"] + # Check if insights metrics for PV metrics + data_type = record["DataType"] if data_type == "INSIGHTS_METRICS_BLOB" - @log.info "Insights Metrics" - @log.info "record: #{record}" return filterPVInsightsMetrics(record) end @@ -307,9 +305,7 @@ def filter_stream(tag, es) es.each { |time, record| filtered_records = filter(tag, time, record) - @log.info "filtered records: #{filtered_records}" filtered_records.each { |filtered_record| - @log.info "filtered_record: #{filtered_record}" new_es.add(time, filtered_record) if filtered_record } if filtered_records } diff --git a/source/plugins/ruby/out_mdm.rb b/source/plugins/ruby/out_mdm.rb index 91563e100..d801edb9a 100644 --- a/source/plugins/ruby/out_mdm.rb +++ b/source/plugins/ruby/out_mdm.rb @@ -4,7 +4,6 @@ module Fluent class OutputMDM < BufferedOutput config_param :retry_mdm_post_wait_minutes, :integer - config_param :log_path, :string, :default => "/var/opt/microsoft/docker-cimprov/log/out_mdm.log" Plugin.register_output("out_mdm", self) @@ -14,7 +13,6 @@ def initialize require "net/https" require "uri" require "yajl/json_gem" - require "logger" require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "constants" @@ -53,7 +51,6 @@ def initialize def configure(conf) s = conf.add_element("secondary") s["type"] = ChunkErrorHandler::SecondaryName - @log = Logger.new(@log_path, 1, 5000000) super end @@ -188,7 +185,7 @@ def write_status_file(success, message) # Convert the event to a raw string. def format(tag, time, record) if record != {} - #@log.trace "Buffering #{tag}" + @log.trace "Buffering #{tag}" return [tag, record].to_msgpack else return "" @@ -237,7 +234,6 @@ def send_to_mdm(post_body) request.body = post_body.join("\n") @log.info "REQUEST BODY SIZE #{request.body.bytesize / 1024}" response = @http_client.request(request) - @log.info "REQUEST RESPONSE: #{response}" response.value # this throws for non 200 HTTP response code @log.info "HTTP Post Response Code : #{response.code}" if @last_telemetry_sent_time.nil? || @last_telemetry_sent_time + 60 * 60 < Time.now From 130e5d72e79b37fe1bb2ea067200fdbc81ffa1fc Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 15:39:59 -0700 Subject: [PATCH 23/57] add pvUsedBytes as metric to collect --- kubernetes/omsagent.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index db788a37e..128f68697 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -125,7 +125,7 @@ data: type filter_cadvisor2mdm custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast - metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes + metrics_to_collect cpuUsageNanoCores,memoryWorkingSetBytes,pvUsedBytes log_level info From d0f8d58e59764fca68355c5a71b1fb8c4b978645 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 15:41:05 -0700 Subject: [PATCH 24/57] more cleanup --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 64 +++++++++---------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 8d65c16ea..7f06ba9d3 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -330,48 +330,46 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m excludeNamespace = true end - if (!excludeNamespace) - if (!pod["volume"].nil?) - pod["volume"].each do |volume| - if (!volume["pvcRef"].nil?) - pvcRef = volume["pvcRef"] - if (!pvcRef["name"].nil?) - - # A PVC exists on this volume - pvcName = pvcRef["name"] - pvName = volume["name"] - - metricItem = {} - metricItem["CollectionTime"] = metricPollTime - metricItem["Computer"] = hostName - metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + if (!excludeNamespace && !pod["volume"].nil?) + pod["volume"].each do |volume| + if (!volume["pvcRef"].nil?) + pvcRef = volume["pvcRef"] + if (!pvcRef["name"].nil?) + + # A PVC exists on this volume + pvcName = pvcRef["name"] + pvName = volume["name"] + + metricItem = {} + metricItem["CollectionTime"] = metricPollTime + metricItem["Computer"] = hostName + metricItem["Name"] = metricNameToReturn + metricItem["Value"] = volume[metricNameToCollect] + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] - - metricItem["Tags"] = metricTags + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID ] = clusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName + metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] + + metricItem["Tags"] = metricTags - metricItems.push(metricItem) - end + metricItems.push(metricItem) end end end end end rescue => errorStr - @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems - end + end return metricItems end From f0885e49a92989b3bfd50295f028658826c7abaf Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 16:28:35 -0700 Subject: [PATCH 25/57] boolean fix --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 7f06ba9d3..3f5d5bb5d 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -326,7 +326,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m podNamespace = pod["podRef"]["namespace"] excludeNamespace = false - if (podNamespace.include? "kube-system" && @pvKubeSystemCollectionMetricsEnabled == "false") + if (podNamespace.include? "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" excludeNamespace = true end From 62b84baa058168d77257d7e95bf09f0e3348ed34 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 31 Aug 2020 17:03:47 -0700 Subject: [PATCH 26/57] set threshold to 60 --- kubernetes/container-azm-ms-agentconfig.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index fe80539d4..6974d09c6 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -107,7 +107,7 @@ data: # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage - pv_usage_threshold_percentage = 0.0 + pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] collect_basic_metrics = false From 9da59bb373b8ece0f16267a20d6e46634cffee14 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 1 Sep 2020 15:03:13 -0700 Subject: [PATCH 27/57] add pv inventory fluent plugin structure --- build/linux/installer/conf/kube.conf | 23 +++ source/plugins/ruby/in_kube_pvinventory.rb | 166 +++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 source/plugins/ruby/in_kube_pvinventory.rb diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index ba40b7a35..cd7e996cb 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -16,6 +16,14 @@ custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug + + #Kubernetes events type kubeevents @@ -98,6 +106,21 @@ max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb new file mode 100644 index 000000000..c20fb6a1b --- /dev/null +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -0,0 +1,166 @@ +module Fluent + class Kube_PodInventory_Input < Input + Plugin.register_input("kubepvinventory", self) + def initialize + super + require "yaml" + require "yajl/json_gem" + require "yajl" + require "time" + + require_relative "KubernetesApiClient" + require_relative "ApplicationInsightsUtility" + require_relative "oms_common" + require_relative "omslog" + require_relative "constants" + + @PVC_CHUNK_SIZE = "1500" + end + + config_param :run_interval, :time, :default => 60 + config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" + + def configure + super + end + + def start + if @run_interval + @finished = false + @condition = ConditionVariable.new + @mutex = Mutex.new + @thread = Thread.new(&method(:run_periodic)) + @@podTelemetryTimeTracker = DateTime.now.to_time.to_i + end + end + + def shutdown + if @run_interval + @mutex.synchronize { + @finished = true + @condition.signal + } + @thread.join + end + end + + def enumerate + begin + pvInventory = nil + currentTime = Time.now + batchTime = currentTime.utc.iso8601 + + continuationToken = nil + $log.info("in_kube_pvinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistantvolumeclaims?limit=#{@PVC_CHUNK_SIZE}") + $log.info("in_kube_pvinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + + #If we receive a continuation token, make calls, process and flush data until we have processed all data + while (!continuationToken.nil? && !continuationToken.empty?) + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistantvolumeclaims?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + end + + # Setting this to nil so that we dont hold memory until GC kicks in + pvInventory = nil + rescue => errorStr + $log.warn "in_kube_pvinventory::enumerate:Failed in enumerate: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end # end enumerate + + def parse_and_emit_records(pvInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + currentTime = Time.now + emitTime = currentTime.to_f + eventStream = MultiEventStream.new + @@istestvar = ENV["ISTEST"] + + begin + records = [] + pvInventory["items"].each do |item| + record = {} + + record["CollectionTime"] = batchTime + record["Name"] = item["metadata"]["name"] + record["Namespace"] = item["metadata"]["namespace"] + record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + record["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + record["VolumeName"] = item["spec"]["volumeName"] + record["StorageClassName"] = item["spec"]["storageClassName"] + record["Status"] = item["status"]["phase"] + record["AccessMode"] = item["status"]["accessModes"][0] + record["RequestSize"] = item["status"]["capacity"]["storage"] + + record["PodUid"] = "" + record["DiskId"] = "" + record["ClusterName"] = KubernetesApiClient.getClusterName + record["ClusterId"] = KubernetesApiClient.getClusterId + + records.push(record.dup) + end + + records.each do |record| + if !record.nil? + wrapper = { + "DataType" => "KUBE_PV_INVENTORY_BLOB", + "IPName" => "ContainerInsights", + "DataItems" => [record.each { |k, v| record[k] = v }], + } + eventStream.add(emitTime, wrapper) if wrapper + end + end + + router.emit_stream(@tag, eventStream) if eventStream + + rescue => errorStr + $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end #begin block end + end + + def run_periodic + @mutex.lock + done = @finished + @nextTimeToRun = Time.now + @waitTimeout = @run_interval + until done + @nextTimeToRun = @nextTimeToRun + @run_interval + @now = Time.now + if @nextTimeToRun <= @now + @waitTimeout = 1 + @nextTimeToRun = @now + else + @waitTimeout = @nextTimeToRun - @now + end + @condition.wait(@mutex, @waitTimeout) + done = @finished + @mutex.unlock + if !done + begin + $log.info("in_kube_pvinventory::run_periodic.enumerate.start #{Time.now.utc.iso8601}") + enumerate + $log.info("in_kube_pvinventory::run_periodic.enumerate.end #{Time.now.utc.iso8601}") + rescue => errorStr + $log.warn "in_kube_pvinventory::run_periodic: enumerate Failed to retrieve pod inventory: #{errorStr}" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end + end + @mutex.lock + end + @mutex.unlock + end + + end # Kube_PodInventory_Input +end # module \ No newline at end of file From 9163a90ac8a6094c58c8f1abd65dffeb503227e9 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 2 Sep 2020 12:00:33 -0700 Subject: [PATCH 28/57] structure fixes --- build/linux/installer/conf/kube.conf | 2 +- .../linux/installer/datafiles/base_container.data | 1 + source/plugins/ruby/in_kube_pvinventory.rb | 14 +++++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index cd7e996cb..52ebffa54 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -195,7 +195,7 @@ max_retry_wait 5m - + type out_mdm log_level debug num_threads 5 diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..dd5c85917 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -22,6 +22,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/microsoft/omsagent/plugin/filter_container.rb; source/plugins/ruby/filter_container.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_podinventory.rb; source/plugins/ruby/in_kube_podinventory.rb; 644; root; root +/opt/microsoft/omsagent/plugin/in_kube_pvinventory.rb; source/plugins/ruby/in_kube_pvinventory.rb; 644; root; root /opt/microsoft/omsagent/plugin/in_kube_events.rb; source/plugins/ruby/in_kube_events.rb; 644; root; root /opt/microsoft/omsagent/plugin/KubernetesApiClient.rb; source/plugins/ruby/KubernetesApiClient.rb; 644; root; root diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index c20fb6a1b..0943b8007 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -1,6 +1,9 @@ module Fluent - class Kube_PodInventory_Input < Input + class Kube_PVInventory_Input < Input Plugin.register_input("kubepvinventory", self) + + @@MDMKubePVInventoryTag = "mdm.kubepvinventory" + def initialize super require "yaml" @@ -52,7 +55,7 @@ def enumerate continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistantvolumeclaims?limit=#{@PVC_CHUNK_SIZE}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumeclaims?limit=#{@PVC_CHUNK_SIZE}") $log.info("in_kube_pvinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) @@ -63,7 +66,7 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistantvolumeclaims?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumeclaims?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) parse_and_emit_records(pvInventory, batchTime) else @@ -80,7 +83,7 @@ def enumerate end end # end enumerate - def parse_and_emit_records(pvInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f eventStream = MultiEventStream.new @@ -122,6 +125,7 @@ def parse_and_emit_records(pvInventory, serviceList, continuationToken, batchTim end router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@@MDMKubePVInventoryTag, eventStream) if eventStream rescue => errorStr $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" @@ -162,5 +166,5 @@ def run_periodic @mutex.unlock end - end # Kube_PodInventory_Input + end # Kube_PVInventory_Input end # module \ No newline at end of file From 6f705b7d02bbddbcd21c579c22829e980dafad97 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 2 Sep 2020 13:38:57 -0700 Subject: [PATCH 29/57] send as insights metrics --- source/plugins/ruby/in_kube_pvinventory.rb | 66 ++++++++++++++-------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 0943b8007..c427d11c8 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -23,7 +23,7 @@ def initialize config_param :run_interval, :time, :default => 60 config_param :tag, :string, :default => "oms.containerinsights.KubePVInventory" - def configure + def configure(conf) super end @@ -92,31 +92,53 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) begin records = [] pvInventory["items"].each do |item| - record = {} - - record["CollectionTime"] = batchTime - record["Name"] = item["metadata"]["name"] - record["Namespace"] = item["metadata"]["namespace"] - record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - record["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - record["VolumeName"] = item["spec"]["volumeName"] - record["StorageClassName"] = item["spec"]["storageClassName"] - record["Status"] = item["status"]["phase"] - record["AccessMode"] = item["status"]["accessModes"][0] - record["RequestSize"] = item["status"]["capacity"]["storage"] - - record["PodUid"] = "" - record["DiskId"] = "" - record["ClusterName"] = KubernetesApiClient.getClusterName - record["ClusterId"] = KubernetesApiClient.getClusterId - - records.push(record.dup) + metricItem = {} + metricItem["CollectionTime"] = batchTime + metricItem["Computer"] = "nodeName" + metricItem["Name"] = "pvInventory" + metricItem["Value"] = 0 + metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + metricItem["Namespace"] = "container.azm.ms/persistentvolume" + + metricTags = {} + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId + metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName + metricTags["Namespace"] = item["metadata"]["namespace"] + metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + metricTags["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + metricTags["VolumeName"] = item["spec"]["volumeName"] + metricTags["StorageClassName"] = item["spec"]["storageClassName"] + metricTags["Status"] = item["status"]["phase"] + metricTags["AccessMode"] = item["status"]["accessModes"][0] + metricTags["RequestSize"] = item["status"]["capacity"]["storage"] + + metricItem["Tags"] = metricTags + records.push(metricItem) + + #record = {} + #record["CollectionTime"] = batchTime + #record["Name"] = item["metadata"]["name"] + #record["Namespace"] = item["metadata"]["namespace"] + #record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + #record["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + #record["VolumeName"] = item["spec"]["volumeName"] + #record["StorageClassName"] = item["spec"]["storageClassName"] + #record["Status"] = item["status"]["phase"] + #record["AccessMode"] = item["status"]["accessModes"][0] + #record["RequestSize"] = item["status"]["capacity"]["storage"] + + #record["PodUid"] = "" + #record["DiskId"] = "" + #record["ClusterName"] = KubernetesApiClient.getClusterName + #record["ClusterId"] = KubernetesApiClient.getClusterId + + #records.push(record.dup) end records.each do |record| if !record.nil? wrapper = { - "DataType" => "KUBE_PV_INVENTORY_BLOB", + "DataType" => "INSIGHTS_METRICS_BLOB", "IPName" => "ContainerInsights", "DataItems" => [record.each { |k, v| record[k] = v }], } @@ -125,7 +147,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(@@MDMKubePVInventoryTag, eventStream) if eventStream + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" From 2e1a2cd04336797310ac7d1c7539b039a53e0f01 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 2 Sep 2020 14:49:22 -0700 Subject: [PATCH 30/57] include in pod inventory --- source/plugins/ruby/in_kube_podinventory.rb | 51 +++++++++++++++++++-- source/plugins/ruby/in_kube_pvinventory.rb | 1 + 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index bffa725ee..7373491d1 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -87,13 +87,45 @@ def enumerate(podList = nil) serviceInfo = nil end + # Get PVCs first so that we dont need to make a call for every chunk + pvInfo = nil + $log.info("in_kube_podinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumeclaims") + $log.info("in_kube_podinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + + if !pvInfo.nil? + $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) + $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInfo = nil + end + + pvNameToInventoryHash = {} + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + pvInventory["items"].each do |item| + pvRecord = {} + pvRecord["Name"] = item["metadata"]["name"] + pvRecord["Namespace"] = item["metadata"]["namespace"] + pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + pvRecord["VolumeName"] = item["spec"]["volumeName"] + pvRecord["StorageClassName"] = item["spec"]["storageClassName"] + pvRecord["Status"] = item["status"]["phase"] + pvRecord["AccessMode"] = item["status"]["accessModes"][0] + pvRecord["RequestSize"] = item["status"]["capacity"]["storage"] + + pvNameToInventoryHash[pvRecord["Name"]] = pvRecord.dup + else + $log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" + end + # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -102,13 +134,14 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end end # Setting these to nil so that we dont hold memory until GC kicks in + pvInventory = nil podInventory = nil serviceList = nil @@ -140,7 +173,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 @@ -159,6 +192,18 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] + pvInventoryRecords = [] + if !items["spec"].nil? && !items["spec"]["volumes"].nil? + items["spec"]["volumes"].each do |volume| + if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? + pvInventoryRecord = pvNameToInventoryHash[volume["persistentVolumeClaim"]["claimName"]] + pvInventoryRecords.push(pvInventoryRecord) + end + end + end + + record["pvInventories"] = pvInventoryRecords + # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && (items["spec"]["nodeName"].downcase.start_with?("infra-") || diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index c427d11c8..f8d87f4a8 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -103,6 +103,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName + metricTags["PVCName"] = item["metadata"]["name"] metricTags["Namespace"] = item["metadata"]["namespace"] metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] metricTags["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] From 8fca1274092aea55ba8d1c50928ae222e0da6189 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 3 Sep 2020 12:38:24 -0700 Subject: [PATCH 31/57] add check that pvUsedBytes is a configured metric to collect --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 33291452c..4ab3a0310 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -223,7 +223,7 @@ def filterPVInsightsMetrics(record) mdmMetrics = [] record["DataItems"].each do |dataItem| - if dataItem["Name"] == Constants::PV_USED_BYTES + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) metricName = dataItem["Name"] usage = dataItem["Value"] capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] From da0a34d688f1e04b91416e2ba042a53e1851a65e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 10:13:08 -0700 Subject: [PATCH 32/57] code review feedback changes --- build/common/installer/scripts/tomlparser.rb | 14 ---- .../installer/datafiles/base_container.data | 1 + .../scripts/tomlparser-mdm-metrics-config.rb | 16 +++- .../tomlparser-metric-collection-config.rb | 79 +++++++++++++++++++ kubernetes/container-azm-ms-agentconfig.yaml | 14 ++-- kubernetes/linux/main.sh | 8 ++ .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 22 +++--- source/plugins/ruby/MdmAlertTemplates.rb | 14 ++-- source/plugins/ruby/MdmMetricsGenerator.rb | 8 +- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/filter_cadvisor2mdm.rb | 66 +++++++++------- 11 files changed, 170 insertions(+), 73 deletions(-) create mode 100644 build/linux/installer/scripts/tomlparser-metric-collection-config.rb diff --git a/build/common/installer/scripts/tomlparser.rb b/build/common/installer/scripts/tomlparser.rb index 51c0d7b13..7235ee0c3 100644 --- a/build/common/installer/scripts/tomlparser.rb +++ b/build/common/installer/scripts/tomlparser.rb @@ -24,7 +24,6 @@ @excludePath = "*.csv2" #some invalid path @enrichContainerLogs = false @collectAllKubeEvents = false -@collectPVKubeSystemMetrics = false @containerLogsRoute = "" # Use parser to parse the configmap toml file to a ruby structure @@ -149,16 +148,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") end - #Get PV kube-system enrichment setting - begin - if !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? - @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] - puts "config::Using config map setting for PV kube-system collection" - end - rescue => errorStr - ConfigParseErrorLogger.logError("Exception while reading config map settings for kube event collection - #{errorStr}, using defaults, please check config map for errors") - end - #Get container logs route setting begin if !parsedConfig[:log_collection_settings][:route_container_logs].nil? && !parsedConfig[:log_collection_settings][:route_container_logs][:version].nil? @@ -210,7 +199,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export AZMON_CLUSTER_LOG_TAIL_EXCLUDE_PATH=#{@excludePath}\n") file.write("export AZMON_CLUSTER_CONTAINER_LOG_ENRICH=#{@enrichContainerLogs}\n") file.write("export AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS=#{@collectAllKubeEvents}\n") - file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") file.write("export AZMON_CONTAINER_LOGS_ROUTE=#{@containerLogsRoute}\n") # Close file after writing all environment variables file.close @@ -256,8 +244,6 @@ def get_command_windows(env_variable_name, env_variable_value) file.write(commands) commands = get_command_windows('AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS', @collectAllKubeEvents) file.write(commands) - commands = get_command_windows('export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS', @collectPVKubeSystemMetrics) - file.write(commands) commands = get_command_windows('AZMON_CONTAINER_LOGS_ROUTE', @containerLogsRoute) file.write(commands) diff --git a/build/linux/installer/datafiles/base_container.data b/build/linux/installer/datafiles/base_container.data index 87b89b14c..ca2538b79 100644 --- a/build/linux/installer/datafiles/base_container.data +++ b/build/linux/installer/datafiles/base_container.data @@ -120,6 +120,7 @@ MAINTAINER: 'Microsoft Corporation' /opt/livenessprobe.sh; build/linux/installer/scripts/livenessprobe.sh; 755; root; root /opt/tomlparser-prom-customconfig.rb; build/linux/installer/scripts/tomlparser-prom-customconfig.rb; 755; root; root /opt/tomlparser-mdm-metrics-config.rb; build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb; 755; root; root +/opt/tomlparser-metric-collection-config.rb; build/linux/installer/scripts/tomlparser-metric-collection-config.rb; 755; root; root /opt/tomlparser-health-config.rb; build/linux/installer/scripts/tomlparser-health-config.rb; 755; root; root /opt/tomlparser.rb; build/common/installer/scripts/tomlparser.rb; 755; root; root diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 5a90b4b04..dd9a582b9 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -67,12 +67,20 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #Persistent Volume & Persistent Volume Claim + #Persistent Volume + noPVConfig = false pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] - pvUsageThresholdFloat = pvUsageThreshold.to_f - if pvUsageThresholdFloat.kind_of? Float - @percentagePVUsageThreshold = pvUsageThresholdFloat + if !pvUsageThreshold.nil? + pvUsageThresholdFloat = pvUsageThreshold.to_f + if pvUsageThresholdFloat.kind_of? Float + @percentagePVUsageThreshold = pvUsageThresholdFloat + else + noPVConfig = true + end else + noPVConfig = true + end + if (noPVConfig) puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb new file mode 100644 index 000000000..c48c08cd8 --- /dev/null +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -0,0 +1,79 @@ +#!/usr/local/bin/ruby +# frozen_string_literal: true + +require_relative "tomlrb" +require_relative "ConfigParseErrorLogger" +require_relative "microsoft/omsagent/plugin/constants" +require_relative "../../../../source/plugins/ruby/ApplicationInsightsUtility.rb" + +@configMapMountPath = "/etc/config/settings/metric_collection_settings" +@configVersion = "" +@configSchemaVersion = "" + +# Setting default values which will be used in case they are not set in the configmap or if configmap doesnt exist +@collectPVKubeSystemMetrics = false + +# Use parser to parse the configmap toml file to a ruby structure +def parseConfigMap + begin + # Check to see if config map is created + if (File.file?(@configMapMountPath)) + puts "config::configmap container-azm-ms-agentconfig for metric collection settings mounted, parsing values" + parsedConfig = Tomlrb.load_file(@configMapMountPath, symbolize_keys: true) + puts "config::Successfully parsed mounted config map" + return parsedConfig + else + puts "config::configmap container-azm-ms-agentconfig for metric collection settings not mounted, using defaults" + return nil + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while parsing config map for metric collection settings: #{errorStr}, using defaults, please check config map for errors") + return nil + end +end + +# Use the ruby structure created after config parsing to set the right values to be used for metric collection settings +def populateSettingValuesFromConfigMap(parsedConfig) + # Get metric collection settings for including or excluding kube-system namespace in PV metrics + begin + if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? + @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + puts "config::Using config map setting for PV kube-system collection" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") + end + + begin + if @collectPVKubeSystemMetrics + ApplicationInsightsUtility.sendCustomEvent("CollectPVKubeSystemMetricsEnabled", {}) + end + rescue => errorStr + end +end + +@configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] +puts "****************Start Metric Collection Settings Processing********************" +if !@configSchemaVersion.nil? && !@configSchemaVersion.empty? && @configSchemaVersion.strip.casecmp("v1") == 0 #note v1 is the only supported schema version, so hardcoding it + configMapSettings = parseConfigMap + if !configMapSettings.nil? + populateSettingValuesFromConfigMap(configMapSettings) + end +else + if (File.file?(@configMapMountPath)) + ConfigParseErrorLogger.logError("config::unsupported/missing config schema version - '#{@configSchemaVersion}' , using defaults, please use supported schema version") + end +end + +# Write the settings to file, so that they can be set as environment variables +file = File.open("config_metric_collection_env_var", "w") + +if !file.nil? + file.write("export AZMON_PV_COLLECT_KUBE_SYSTEM_METRICS=#{@collectPVKubeSystemMetrics}\n") + # Close file after writing all metric collection setting environment variables + file.close + puts "****************End Metric Collection Settings Processing********************" +else + puts "Exception while opening file for writing MDM metric config environment variables" + puts "****************End Metric Collection Settings Processing********************" +end diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index 6974d09c6..b6e1364ad 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -42,11 +42,6 @@ data: # When the setting is set to false, only the kube events with !normal event type will be collected enabled = false # When this is enabled (enabled = true), all kube events including normal events will be collected - [log_collection_settings.collect_kube_system_pv_metrics] - # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false - # When the setting is set to false, only the pv metrics outside the kube_system namespace will be collected - enabled = false - # When this is enabled (enabled = true), pv metrics including those in the kube_system namespace will be collected prometheus-data-collection-settings: |- # Custom Prometheus metrics data collection settings @@ -96,6 +91,15 @@ data: #fieldpass = ["metric_to_pass1", "metric_to_pass12"] #fielddrop = ["metric_to_drop"] + + metric_collection_settings: |- + # Metrics collection settings for metrics sent to Log Analytics and MDM + [metric_collection_settings.collect_kube_system_pv_metrics] + # In the absense of this configmap, default value for collect_kube_system_pv_metrics is false + # When the setting is set to false, only the persistent volume metrics outside the kube-system namespace will be collected + enabled = false + # When this is enabled (enabled = true), persistent volume metrics including those in the kube-system namespace will be collected + alertable-metrics-configuration-settings: |- # Alertable metrics configuration settings for container resource utilization [alertable_metrics_configuration_settings.container_resource_utilization_thresholds] diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index 311470660..d9fdc42e9 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -236,6 +236,14 @@ cat config_mdm_metrics_env_var | while read line; do done source config_mdm_metrics_env_var +#Parse the configmap to set the right environment variables for metric collection settings +/opt/microsoft/omsagent/ruby/bin/ruby tomlparser-metric-collection-config.rb + +cat config_metric_collection_env_var | while read line; do + echo $line >> ~/.bashrc +done +source config_metric_collection_env_var + #Setting environment variable for CAdvisor metrics to use port 10255/10250 based on curl request echo "Making wget request to cadvisor endpoint with port 10250" #Defaults to use port 10255 diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 3f5d5bb5d..afde3401b 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -303,7 +303,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "memoryUsed","containerGpumemoryUsedBytes", metricTime)) metricDataItems.concat(getContainerGpuMetricsAsInsightsMetrics(metricInfo, hostName, "dutyCycle","containerGpuDutyCycle", metricTime)) - metricDataItems.concat(getPersistentVolumeClaimMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) + metricDataItems.concat(getPersistentVolumeMetrics(metricInfo, hostName, "usedBytes", Constants::PV_USED_BYTES, metricTime)) else @Log.warn("Couldn't get Insights metrics information for host: #{hostName} os:#{operatingSystem}") end @@ -314,29 +314,29 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) return metricDataItems end - def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName begin metricInfo = metricJSON metricInfo["pods"].each do |pod| - podUid = pod["podRef"]["uid"] - podName = pod["podRef"]["name"] - podNamespace = pod["podRef"]["namespace"] - excludeNamespace = false - if (podNamespace.include? "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" - excludeNamespace = true + podNamespace = pod["podRef"]["namespace"] + includeNamespace = false + if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" + includeNamespace = true end - if (!excludeNamespace && !pod["volume"].nil?) + if (!includeNamespace && !pod["volume"].nil?) pod["volume"].each do |volume| if (!volume["pvcRef"].nil?) pvcRef = volume["pvcRef"] if (!pvcRef["name"].nil?) # A PVC exists on this volume + podUid = pod["podRef"]["uid"] + podName = pod["podRef"]["name"] pvcName = pvcRef["name"] pvName = volume["name"] @@ -344,7 +344,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNameToCollect] + metricItem["Value"] = volume[metricNasmeToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE @@ -367,7 +367,7 @@ def getPersistentVolumeClaimMetrics(metricJSON, hostName, metricNameToCollect, m end end rescue => errorStr - @Log.warn("getPersistentVolumeClaimMetrics failed: #{errorStr} for metric #{metricNameToCollect}") + @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end return metricItems diff --git a/source/plugins/ruby/MdmAlertTemplates.rb b/source/plugins/ruby/MdmAlertTemplates.rb index d55435c1e..d5107fea1 100644 --- a/source/plugins/ruby/MdmAlertTemplates.rb +++ b/source/plugins/ruby/MdmAlertTemplates.rb @@ -96,26 +96,24 @@ class MdmAlertTemplates "data": { "baseData": { "metric": "%{metricName}", - "namespace": "insights.container/persistentvolume", + "namespace": "insights.container/persistentvolumes", "dimNames": [ - "podUID", "podName", - "computerName", - "Kubernetes namespace", + "node", + "kubernetesNamespace", "thresholdPercentage" ], "series": [ { "dimValues": [ - "%{podUidDimValue}", "%{podNameDimValue}", "%{computerNameDimValue}", "%{namespaceDimValue}", "%{thresholdPercentageDimValue}" ], - "min": %{containerResourceUtilizationPercentage}, - "max": %{containerResourceUtilizationPercentage}, - "sum": %{containerResourceUtilizationPercentage}, + "min": %{pvResourceUtilizationPercentage}, + "max": %{pvResourceUtilizationPercentage}, + "sum": %{pvResourceUtilizationPercentage}, "count": 1 } ] diff --git a/source/plugins/ruby/MdmMetricsGenerator.rb b/source/plugins/ruby/MdmMetricsGenerator.rb index e22660c71..662a10322 100644 --- a/source/plugins/ruby/MdmMetricsGenerator.rb +++ b/source/plugins/ruby/MdmMetricsGenerator.rb @@ -35,6 +35,9 @@ class MdmMetricsGenerator Constants::CPU_USAGE_NANO_CORES => Constants::MDM_CONTAINER_CPU_UTILIZATION_METRIC, Constants::MEMORY_RSS_BYTES => Constants::MDM_CONTAINER_MEMORY_RSS_UTILIZATION_METRIC, Constants::MEMORY_WORKING_SET_BYTES => Constants::MDM_CONTAINER_MEMORY_WORKING_SET_UTILIZATION_METRIC, + } + + @@pod_metric_name_metric_percentage_name_hash = { Constants::PV_USED_BYTES => Constants::MDM_PV_UTILIZATION_METRIC } @@ -270,12 +273,11 @@ def getPVResourceUtilMetricRecords(recordTimeStamp, metricName, computer, percen resourceUtilRecord = MdmAlertTemplates::PV_resource_utilization_template % { timestamp: recordTimeStamp, - metricName: @@container_metric_name_metric_percentage_name_hash[metricName], - podUidDimValue: podUid, + metricName: @@pod_metric_name_metric_percentage_name_hash[metricName], podNameDimValue: podName, computerNameDimValue: computer, namespaceDimValue: podNamespace, - containerResourceUtilizationPercentage: percentageMetricValue, + pvResourceUtilizationPercentage: percentageMetricValue, thresholdPercentageDimValue: thresholdPercentage, } records.push(Yajl::Parser.parse(StringIO.new(resourceUtilRecord))) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 299b1c248..9aea6eb3a 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -81,6 +81,7 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" + PV_METRICS_HEART_BEAT_EVENT = "PVUtilMdmHeartBeatEvent" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 4ab3a0310..edd26603b 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -57,7 +57,6 @@ def start if @process_incoming_stream @cpu_capacity = 0.0 @memory_capacity = 0.0 - @pv_capacity = 0.0 ensure_cpu_memory_capacity_set @containerCpuLimitHash = {} @containerMemoryLimitHash = {} @@ -109,13 +108,18 @@ def flushMetricTelemetry properties["CpuThresholdPercentage"] = @@metric_threshold_hash[Constants::CPU_USAGE_NANO_CORES] properties["MemoryRssThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_RSS_BYTES] properties["MemoryWorkingSetThresholdPercentage"] = @@metric_threshold_hash[Constants::MEMORY_WORKING_SET_BYTES] - properties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] # Keeping track of any containers that have exceeded threshold in the last flush interval properties["CpuThresholdExceededInLastFlushInterval"] = @containersExceededCpuThreshold properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold - properties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) + + # Also send for PV usage metrics + pvProperties = {} + pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] + pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENTT, pvProperties) + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i @containersExceededCpuThreshold = false @containersExceededMemRssThreshold = false @@ -220,38 +224,44 @@ def filter(tag, time, record) end def filterPVInsightsMetrics(record) - mdmMetrics = [] - record["DataItems"].each do |dataItem| - - if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) - metricName = dataItem["Name"] - usage = dataItem["Value"] - capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] - if capacity != 0 - percentage_metric_value = (usage * 100.0) / capacity - end - @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" - @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" + begin + mdmMetrics = [] + record["DataItems"].each do |dataItem| + + if dataItem["Name"] == Constants::PV_USED_BYTES && @metrics_to_collect_hash.key?(dataItem["Name"].downcase) + metricName = dataItem["Name"] + usage = dataItem["Value"] + capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] + if capacity != 0 + percentage_metric_value = (usage * 100.0) / capacity + end + @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" + @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" - computer = dataItem["Computer"] - resourceDimensions = dataItem["Tags"] - thresholdPercentage = @@metric_threshold_hash[metricName] + computer = dataItem["Computer"] + resourceDimensions = dataItem["Tags"] + thresholdPercentage = @@metric_threshold_hash[metricName] - flushMetricTelemetry - if percentage_metric_value >= thresholdPercentage - setThresholdExceededTelemetry(metricName) - return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], + flushMetricTelemetry + if percentage_metric_value >= thresholdPercentage + setThresholdExceededTelemetry(metricName) + return MdmMetricsGenerator.getPVResourceUtilMetricRecords(dataItem["CollectionTime"], metricName, computer, percentage_metric_value, resourceDimensions, thresholdPercentage) - else - return [] - end # end if block for percentage metric > configured threshold % check - end # end if block for dataItem name check - end # end for block of looping through data items - return [] + else + return [] + end # end if block for percentage metric > configured threshold % check + end # end if block for dataItem name check + end # end for block of looping through data items + return [] + rescue Exception => e + @log.info "Error processing cadvisor insights metrics record Exception: #{e.class} Message: #{e.message}" + ApplicationInsightsUtility.sendExceptionTelemetry(e.backtrace) + return [] #return empty array if we ran into any errors + end end def ensure_cpu_memory_capacity_set From 68404bf374776a0067d7d8b8edf3b39808fda7cb Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 14:48:59 -0700 Subject: [PATCH 33/57] after testing changes --- .../scripts/tomlparser-mdm-metrics-config.rb | 42 +++++++++++-------- .../tomlparser-metric-collection-config.rb | 10 +---- kubernetes/container-azm-ms-agentconfig.yaml | 5 ++- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 23 ++++++++-- source/plugins/ruby/constants.rb | 3 +- source/plugins/ruby/filter_cadvisor2mdm.rb | 26 ++++++++---- 6 files changed, 69 insertions(+), 40 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index dd9a582b9..04d664289 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -36,7 +36,7 @@ def parseConfigMap # Use the ruby structure created after config parsing to set the right values to be used for MDM metric configuration settings def populateSettingValuesFromConfigMap(parsedConfig) if !parsedConfig.nil? && !parsedConfig[:alertable_metrics_configuration_settings].nil? - # Get mdm metrics config settings for resource utilization + # Get mdm metrics config settings for container resource utilization begin resourceUtilization = parsedConfig[:alertable_metrics_configuration_settings][:container_resource_utilization_thresholds] if !resourceUtilization.nil? @@ -67,30 +67,38 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "config::Non floating point value or value not convertible to float specified for Memory Working Set threshold, using default " @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD end - #Persistent Volume - noPVConfig = false - pvUsageThreshold = resourceUtilization[:pv_usage_threshold_percentage] + puts "config::Using config map settings for MDM metric configuration settings for container resource utilization" + end + rescue => errorStr + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") + @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD + @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD + @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + end + + # Get mdm metrics config settings for PV utilization + begin + usingPVThresholdConfig = false + pvUtilization = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + if !pvUtilization.nil? + pvUsageThreshold = pvUtilization[:pv_usage_threshold_percentage] if !pvUsageThreshold.nil? pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float @percentagePVUsageThreshold = pvUsageThresholdFloat - else - noPVConfig = true + usingPVThresholdConfig = true end - else - noPVConfig = true - end - if (noPVConfig) - puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " - @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end - puts "config::Using config map settings for MDM metric configuration settings for resource utilization" + end + + if usingPVThresholdConfig + puts "config::Using config map settings for MDM metric configuration settings for PV utilization" + else + puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " + @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end rescue => errorStr - ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for resource utilization - #{errorStr}, using defaults, please check config map for errors") - @percentageCpuUsageThreshold = Constants::DEFAULT_MDM_CPU_UTILIZATION_THRESHOLD - @percentageMemoryRssThreshold = Constants::DEFAULT_MDM_MEMORY_RSS_THRESHOLD - @percentageMemoryWorkingSetThreshold = Constants::DEFAULT_MDM_MEMORY_WORKING_SET_THRESHOLD + ConfigParseErrorLogger.logError("Exception while reading config map settings for MDM metric configuration settings for PV utilization - #{errorStr}, using defaults, please check config map for errors") @percentagePVUsageThreshold = Constants::DEFAULT_MDM_PV_UTILIZATION_THRESHOLD end end diff --git a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb index c48c08cd8..40d87b7f1 100644 --- a/build/linux/installer/scripts/tomlparser-metric-collection-config.rb +++ b/build/linux/installer/scripts/tomlparser-metric-collection-config.rb @@ -4,7 +4,6 @@ require_relative "tomlrb" require_relative "ConfigParseErrorLogger" require_relative "microsoft/omsagent/plugin/constants" -require_relative "../../../../source/plugins/ruby/ApplicationInsightsUtility.rb" @configMapMountPath = "/etc/config/settings/metric_collection_settings" @configVersion = "" @@ -37,19 +36,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) # Get metric collection settings for including or excluding kube-system namespace in PV metrics begin if !parsedConfig.nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics].nil? && !parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled].nil? - @collectPVKubeSystemMetrics = parsedConfig[:log_collection_settings][:collect_kube_system_pv_metrics][:enabled] + @collectPVKubeSystemMetrics = parsedConfig[:metric_collection_settings][:collect_kube_system_pv_metrics][:enabled] puts "config::Using config map setting for PV kube-system collection" end rescue => errorStr ConfigParseErrorLogger.logError("Exception while reading config map settings for PV kube-system collection - #{errorStr}, using defaults, please check config map for errors") end - - begin - if @collectPVKubeSystemMetrics - ApplicationInsightsUtility.sendCustomEvent("CollectPVKubeSystemMetricsEnabled", {}) - end - rescue => errorStr - end end @configSchemaVersion = ENV["AZMON_AGENT_CFG_SCHEMA_VERSION"] diff --git a/kubernetes/container-azm-ms-agentconfig.yaml b/kubernetes/container-azm-ms-agentconfig.yaml index b6e1364ad..aec1bb456 100644 --- a/kubernetes/container-azm-ms-agentconfig.yaml +++ b/kubernetes/container-azm-ms-agentconfig.yaml @@ -110,7 +110,10 @@ data: container_memory_rss_threshold_percentage = 95.0 # Threshold for container memoryWorkingSet, metric will be sent only when memory working set exceeds or becomes equal to the following percentage container_memory_working_set_threshold_percentage = 95.0 - # Threshold for pv usage bytes, metric will be sent only when pv utilization exceeds or becomes equal to the following percentage + + # Alertable metrics configuration settings for persistent volume utilization + [alertable_metrics_configuration_settings.pv_utilization_thresholds] + # Threshold for persistent volume usage bytes, metric will be sent only when persistent volume utilization exceeds or becomes equal to the following percentage pv_usage_threshold_percentage = 60.0 integrations: |- [integrations.azure_network_policy_manager] diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index afde3401b..3355a11db 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -54,6 +54,7 @@ class CAdvisorMetricsAPIClient @@winNodePrevMetricRate = {} @@telemetryCpuMetricTimeTracker = DateTime.now.to_time.to_i @@telemetryMemoryMetricTimeTracker = DateTime.now.to_time.to_i + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i #Containers a hash of node name and the last time telemetry was sent for this node @@nodeTelemetryTimeTracker = {} @@ -315,6 +316,9 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) + telemetryimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 + metricItems = [] clusterId = KubernetesApiClient.getClusterId clusterName = KubernetesApiClient.getClusterName @@ -323,12 +327,12 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricInfo["pods"].each do |pod| podNamespace = pod["podRef"]["namespace"] - includeNamespace = false + excludeNamespace = false if (podNamespace.downcase == "kube-system") && @pvKubeSystemCollectionMetricsEnabled == "false" - includeNamespace = true + excludeNamespace = true end - if (!includeNamespace && !pod["volume"].nil?) + if (!excludeNamespace && !pod["volume"].nil?) pod["volume"].each do |volume| if (!volume["pvcRef"].nil?) pvcRef = volume["pvcRef"] @@ -344,7 +348,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricItem["CollectionTime"] = metricPollTime metricItem["Computer"] = hostName metricItem["Name"] = metricNameToReturn - metricItem["Value"] = volume[metricNasmeToCollect] + metricItem["Value"] = volume[metricNameToCollect] metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE @@ -370,6 +374,17 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric @Log.warn("getPersistentVolumeMetrics failed: #{errorStr} for metric #{metricNameToCollect}") return metricItems end + + # If kube-system metrics collection enabled, send telemetry + begin + if telemetryTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES && @pvKubeSystemCollectionMetricsEnabled == "true" + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT, {}) + @@telemetryPVKubeSystemMetricsTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @Log.warn("getPersistentVolumeMetrics kube-system metrics enabled telemetry failed: #{errorStr}") + end + return metricItems end diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 9aea6eb3a..692fd6e4d 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -81,7 +81,8 @@ class Constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" POD_READY_PERCENTAGE_HEART_BEAT_EVENT = "PodReadyPercentageMdmHeartBeatEvent" CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" - PV_METRICS_HEART_BEAT_EVENT = "PVUtilMdmHeartBeatEvent" + PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" + PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index edd26603b..6cf1e3d72 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -46,6 +46,7 @@ def start @metrics_to_collect_hash = build_metrics_hash @log.debug "After check_custom_metrics_availability process_incoming_stream #{@process_incoming_stream}" @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i # These variables keep track if any resource utilization threshold exceeded in the last 10 minutes @containersExceededCpuThreshold = false @@ -113,21 +114,30 @@ def flushMetricTelemetry properties["MemRssThresholdExceededInLastFlushInterval"] = @containersExceededMemRssThreshold properties["MemWSetThresholdExceededInLastFlushInterval"] = @containersExceededMemWorkingSetThreshold ApplicationInsightsUtility.sendCustomEvent(Constants::CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT, properties) + @containersExceededCpuThreshold = false + @containersExceededMemRssThreshold = false + @containersExceededMemWorkingSetThreshold = false + @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr + @log.info "Error in flushMetricTelemetry: #{errorStr} for container resource util telemetry" + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) + end - # Also send for PV usage metrics + # Also send for PV usage metrics + begin + pvTimeDifference = (DateTime.now.to_time.to_i - @@pvUsageTelemetryTimeTracker).abs + pvTimeDifferenceInMinutes = pvTimeDifference / 60 + if (pvTimeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) pvProperties = {} pvProperties["PVUsageThresholdPercentage"] = @@metric_threshold_hash[Constants::PV_USED_BYTES] pvProperties["PVUsageThresholdExceededInLastFlushInterval"] = @pvExceededUsageThreshold - ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENTT, pvProperties) - - @@containerResourceUtilTelemetryTimeTracker = DateTime.now.to_time.to_i - @containersExceededCpuThreshold = false - @containersExceededMemRssThreshold = false - @containersExceededMemWorkingSetThreshold = false + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_USAGE_HEART_BEAT_EVENT, pvProperties) @pvExceededUsageThreshold = false + @@pvUsageTelemetryTimeTracker = DateTime.now.to_time.to_i end rescue => errorStr - @log.info "Error in flushMetricTelemetry: #{errorStr}" + @log.info "Error in flushMetricTelemetry: #{errorStr} for PV usage telemetry" ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end From 4505b452f4e0bec5e80f6a2e40fd5ce1e19b639d Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 16:21:05 -0700 Subject: [PATCH 34/57] whitespace fix --- source/plugins/ruby/filter_cadvisor2mdm.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/filter_cadvisor2mdm.rb b/source/plugins/ruby/filter_cadvisor2mdm.rb index 6cf1e3d72..3bc674ea8 100644 --- a/source/plugins/ruby/filter_cadvisor2mdm.rb +++ b/source/plugins/ruby/filter_cadvisor2mdm.rb @@ -244,7 +244,7 @@ def filterPVInsightsMetrics(record) capacity = dataItem["Tags"][Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] if capacity != 0 percentage_metric_value = (usage * 100.0) / capacity - end + end @log.info "percentage_metric_value for metric: #{metricName} percentage: #{percentage_metric_value}" @log.info "@@metric_threshold_hash for #{metricName}: #{@@metric_threshold_hash[metricName]}" From c08054b55886993ee95e1c38ba3be0d27bb93884 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 4 Sep 2020 16:32:20 -0700 Subject: [PATCH 35/57] variable name fix --- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index 3355a11db..d815644c2 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -316,7 +316,7 @@ def getInsightsMetrics(winNode: nil, metricTime: Time.now.utc.iso8601) end def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metricNameToReturn, metricPollTime) - telemetryimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs + telemetryTimeDifference = (DateTime.now.to_time.to_i - @@telemetryPVKubeSystemMetricsTimeTracker).abs telemetryTimeDifferenceInMinutes = telemetryTimeDifference / 60 metricItems = [] From c88b9ab907fd762435f8ae67be34968ade3082d6 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 8 Sep 2020 11:37:00 -0700 Subject: [PATCH 36/57] naming changes --- .../installer/scripts/tomlparser-mdm-metrics-config.rb | 10 +++++----- source/plugins/ruby/CAdvisorMetricsAPIClient.rb | 4 ++-- source/plugins/ruby/constants.rb | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb index 04d664289..74f1c0726 100644 --- a/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb +++ b/build/linux/installer/scripts/tomlparser-mdm-metrics-config.rb @@ -78,20 +78,20 @@ def populateSettingValuesFromConfigMap(parsedConfig) # Get mdm metrics config settings for PV utilization begin - usingPVThresholdConfig = false - pvUtilization = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] + isUsingPVThresholdConfig = false + pvUtilizationThresholds = parsedConfig[:alertable_metrics_configuration_settings][:pv_utilization_thresholds] if !pvUtilization.nil? - pvUsageThreshold = pvUtilization[:pv_usage_threshold_percentage] + pvUsageThreshold = pvUtilizationThresholds[:pv_usage_threshold_percentage] if !pvUsageThreshold.nil? pvUsageThresholdFloat = pvUsageThreshold.to_f if pvUsageThresholdFloat.kind_of? Float @percentagePVUsageThreshold = pvUsageThresholdFloat - usingPVThresholdConfig = true + isUsingPVThresholdConfig = true end end end - if usingPVThresholdConfig + if isUsingPVThresholdConfig puts "config::Using config map settings for MDM metric configuration settings for PV utilization" else puts "config::Non floating point value or value not convertible to float specified for PV threshold, using default " diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index d815644c2..bd1cd1000 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -342,7 +342,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric podUid = pod["podRef"]["uid"] podName = pod["podRef"]["name"] pvcName = pvcRef["name"] - pvName = volume["name"] + volumeName = volume["name"] metricItem = {} metricItem["CollectionTime"] = metricPollTime @@ -357,7 +357,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = clusterName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_UID] = podUid metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName - metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_NAME] = pvName + metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volumeName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 692fd6e4d..5409a21b6 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -15,7 +15,7 @@ class Constants INSIGHTSMETRICS_TAGS_CONTROLLER_KIND = "controllerKind" INSIGHTSMETRICS_TAGS_POD_UID = "podUID" INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" - INSIGHTSMETRICS_TAGS_PV_NAME = "pvName" + INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" INSIGHTSMETRICS_TAGS_POD_NAME = "podName" INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" From 4447cdd6a540dc09521a627be968d8641bd990e9 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 09:51:06 -0700 Subject: [PATCH 37/57] make call for pv instead of pvc --- source/plugins/ruby/in_kube_podinventory.rb | 88 ++++++++++----------- source/plugins/ruby/in_kube_pvinventory.rb | 40 +++++++--- 2 files changed, 72 insertions(+), 56 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 7373491d1..1149fbc25 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -88,36 +88,36 @@ def enumerate(podList = nil) end # Get PVCs first so that we dont need to make a call for every chunk - pvInfo = nil - $log.info("in_kube_podinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumeclaims") - $log.info("in_kube_podinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - - if !pvInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInfo = nil - end - - pvNameToInventoryHash = {} - if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) - pvInventory["items"].each do |item| - pvRecord = {} - pvRecord["Name"] = item["metadata"]["name"] - pvRecord["Namespace"] = item["metadata"]["namespace"] - pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - pvRecord["VolumeName"] = item["spec"]["volumeName"] - pvRecord["StorageClassName"] = item["spec"]["storageClassName"] - pvRecord["Status"] = item["status"]["phase"] - pvRecord["AccessMode"] = item["status"]["accessModes"][0] - pvRecord["RequestSize"] = item["status"]["capacity"]["storage"] - - pvNameToInventoryHash[pvRecord["Name"]] = pvRecord.dup - else - $log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" - end + #pvInfo = nil + #$log.info("in_kube_podinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + #pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumeclaims") + #$log.info("in_kube_podinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + + #if !pvInfo.nil? + #$log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + #pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) + #$log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + #pvInfo = nil + #end + + #pvNameToInventoryHash = {} + #if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + #pvInventory["items"].each do |item| + #pvRecord = {} + #pvRecord["Name"] = item["metadata"]["name"] + #pvRecord["Namespace"] = item["metadata"]["namespace"] + #pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + #pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + #pvRecord["VolumeName"] = item["spec"]["volumeName"] + #pvRecord["StorageClassName"] = item["spec"]["storageClassName"] + #pvRecord["Status"] = item["status"]["phase"] + #pvRecord["AccessMode"] = item["status"]["accessModes"][0] + #pvRecord["RequestSize"] = item["status"]["capacity"]["storage"] + + #pvNameToInventoryHash[pvRecord["Name"]] = pvRecord.dup + #else + #$log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" + #end # Initializing continuation token to nil continuationToken = nil @@ -125,7 +125,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -134,14 +134,14 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end end # Setting these to nil so that we dont hold memory until GC kicks in - pvInventory = nil + #pvInventory = nil podInventory = nil serviceList = nil @@ -173,7 +173,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 @@ -192,17 +192,17 @@ def parse_and_emit_records(podInventory, serviceList, pvNameToInventoryHash, con record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] - pvInventoryRecords = [] - if !items["spec"].nil? && !items["spec"]["volumes"].nil? - items["spec"]["volumes"].each do |volume| - if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? - pvInventoryRecord = pvNameToInventoryHash[volume["persistentVolumeClaim"]["claimName"]] - pvInventoryRecords.push(pvInventoryRecord) - end - end - end + #pvInventoryRecords = [] + #if !items["spec"].nil? && !items["spec"]["volumes"].nil? + #items["spec"]["volumes"].each do |volume| + #if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? + #pvInventoryRecord = pvNameToInventoryHash[volume["persistentVolumeClaim"]["claimName"]] + #pvInventoryRecords.push(pvInventoryRecord) + #end + #end + #end - record["pvInventories"] = pvInventoryRecords + #record["pvInventories"] = pvInventoryRecords # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index f8d87f4a8..3184cce80 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -54,9 +54,9 @@ def enumerate batchTime = currentTime.utc.iso8601 continuationToken = nil - $log.info("in_kube_pvinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumeclaims?limit=#{@PVC_CHUNK_SIZE}") - $log.info("in_kube_pvinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PVC_CHUNK_SIZE}") + $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) parse_and_emit_records(pvInventory, batchTime) @@ -66,7 +66,7 @@ def enumerate #If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumeclaims?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) parse_and_emit_records(pvInventory, batchTime) else @@ -92,26 +92,42 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) begin records = [] pvInventory["items"].each do |item| + + pvHasPVC = false + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + item["spec"]["claimRef"].each do |claimRef| + if claimRef["kind"] == "PersistentVolumeClaim" + namespace = claimRef["namespace"] + pvcName = claimRef["name"] + pvHasPVC = true + end + end + end + + if !pvHasPVC + return records + end + metricItem = {} metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = "nodeName" + metricItem["Computer"] = "" metricItem["Name"] = "pvInventory" metricItem["Value"] = 0 metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = "container.azm.ms/persistentvolume" + metricItem["Namespace"] = "container.azm.ms/pv" metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName - metricTags["PVCName"] = item["metadata"]["name"] - metricTags["Namespace"] = item["metadata"]["namespace"] + metricTags["PVName"] = item["metadata"]["name"] + metricTags["PVCName"] = pvcName + metricTags["Namespace"] = namespace metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - metricTags["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - metricTags["VolumeName"] = item["spec"]["volumeName"] + metricTags["Kind"] = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] metricTags["StorageClassName"] = item["spec"]["storageClassName"] metricTags["Status"] = item["status"]["phase"] - metricTags["AccessMode"] = item["status"]["accessModes"][0] - metricTags["RequestSize"] = item["status"]["capacity"]["storage"] + metricTags["AccessMode"] = item["spec"]["accessModes"][0] + metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] metricItem["Tags"] = metricTags records.push(metricItem) From dc3351db84f55a8a9f319813615df09b041142e4 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 11:27:37 -0700 Subject: [PATCH 38/57] disk info and telemetry --- source/plugins/ruby/in_kube_pvinventory.rb | 86 ++++++++++++++-------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 3184cce80..8ea9517b7 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -3,6 +3,7 @@ class Kube_PVInventory_Input < Input Plugin.register_input("kubepvinventory", self) @@MDMKubePVInventoryTag = "mdm.kubepvinventory" + @@hostName = (OMS::Common.get_hostname) def initialize super @@ -17,7 +18,9 @@ def initialize require_relative "omslog" require_relative "constants" - @PVC_CHUNK_SIZE = "1500" + @PV_CHUNK_SIZE = "1500" + @pvCount = 0 + @diskCount = 0 end config_param :run_interval, :time, :default => 60 @@ -33,7 +36,7 @@ def start @condition = ConditionVariable.new @mutex = Mutex.new @thread = Thread.new(&method(:run_periodic)) - @@podTelemetryTimeTracker = DateTime.now.to_time.to_i + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end end @@ -50,12 +53,15 @@ def shutdown def enumerate begin pvInventory = nil + telemetryFlush = false + @pvCount = 0 + @diskCount = 0 currentTime = Time.now batchTime = currentTime.utc.iso8601 continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PVC_CHUNK_SIZE}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) @@ -64,9 +70,9 @@ def enumerate $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" end - #If we receive a continuation token, make calls, process and flush data until we have processed all data + # If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PVC_CHUNK_SIZE}&continue=#{continuationToken}") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}&continue=#{continuationToken}") if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) parse_and_emit_records(pvInventory, batchTime) else @@ -76,6 +82,24 @@ def enumerate # Setting this to nil so that we dont hold memory until GC kicks in pvInventory = nil + + # Adding telemetry to send pod telemetry every 5 minutes + timeDifference = (DateTime.now.to_time.to_i - @@pvTelemetryTimeTracker).abs + timeDifferenceInMinutes = timeDifference / 60 + if (timeDifferenceInMinutes >= 5) + telemetryFlush = true + end + + # Flush AppInsights telemetry once all the processing is done + if telemetryFlush == true + telemetryProperties = {} + telemetryProperties["Computer"] = @@hostName + ApplicationInsightsUtility.sendCustomEvent("KubePVInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendMetricTelemetry("PVCount", @pvCount, {}) + ApplicationInsightsUtility.sendMetricTelemetry("DiskCount", @diskCount, {}) + @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i + end + rescue => errorStr $log.warn "in_kube_pvinventory::enumerate:Failed in enumerate: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) @@ -93,34 +117,46 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) records = [] pvInventory["items"].each do |item| - pvHasPVC = false + # Check if the PV has a PVC + hasPVC = false if !item["spec"].nil? && !item["spec"]["claimRef"].nil? item["spec"]["claimRef"].each do |claimRef| if claimRef["kind"] == "PersistentVolumeClaim" + hasPVC = true namespace = claimRef["namespace"] pvcName = claimRef["name"] - pvHasPVC = true end end end - - if !pvHasPVC + # Return if no PVC + if !hasPVC return records end + # Check if the PV is an Azure Disk + isAzureDisk = false + if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? + isAzureDisk = true + azureDisk = item["spec"]["azureDisk"] + diskName = azureDisk["diskName"] + diskUri = azureDisk["diskURI"] + @diskCount += 1 + end + metricItem = {} metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = "" + metricItem["Computer"] = @@hostName metricItem["Name"] = "pvInventory" metricItem["Value"] = 0 metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = "container.azm.ms/pv" + metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName metricTags["PVName"] = item["metadata"]["name"] metricTags["PVCName"] = pvcName + metricTags["PodUID"] = "" metricTags["Namespace"] = namespace metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] metricTags["Kind"] = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] @@ -128,30 +164,18 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags["Status"] = item["status"]["phase"] metricTags["AccessMode"] = item["spec"]["accessModes"][0] metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] + if isAzureDisk + metricTags["DiskName"] = diskName + metricTags["DiskURI"] = diskUri + end metricItem["Tags"] = metricTags records.push(metricItem) - - #record = {} - #record["CollectionTime"] = batchTime - #record["Name"] = item["metadata"]["name"] - #record["Namespace"] = item["metadata"]["namespace"] - #record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - #record["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - #record["VolumeName"] = item["spec"]["volumeName"] - #record["StorageClassName"] = item["spec"]["storageClassName"] - #record["Status"] = item["status"]["phase"] - #record["AccessMode"] = item["status"]["accessModes"][0] - #record["RequestSize"] = item["status"]["capacity"]["storage"] - - #record["PodUid"] = "" - #record["DiskId"] = "" - #record["ClusterName"] = KubernetesApiClient.getClusterName - #record["ClusterId"] = KubernetesApiClient.getClusterId - - #records.push(record.dup) + $log.info("PV inventory record: #{metricItem}") end + @pvCount += records.length + records.each do |record| if !record.nil? wrapper = { @@ -170,7 +194,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) - end #begin block end + end end def run_periodic From 4d5e22882eef5e534efe1a89fbe110e56714fe40 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 11:53:31 -0700 Subject: [PATCH 39/57] logging and pvcs in pod inventory --- source/plugins/ruby/in_kube_podinventory.rb | 24 ++++++++++++--------- source/plugins/ruby/in_kube_pvinventory.rb | 16 +++++++++++++- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 1149fbc25..9351b0f93 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -192,17 +192,21 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] - #pvInventoryRecords = [] - #if !items["spec"].nil? && !items["spec"]["volumes"].nil? - #items["spec"]["volumes"].each do |volume| - #if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? - #pvInventoryRecord = pvNameToInventoryHash[volume["persistentVolumeClaim"]["claimName"]] - #pvInventoryRecords.push(pvInventoryRecord) - #end - #end - #end + pvcs = [] + if !items["spec"].nil? && !items["spec"]["volumes"].nil? + items["spec"]["volumes"].each do |volume| + if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? + $log.info "pvc on the pod" + + pvc = podNamespace + "/" + volume["persistentVolumeClaim"]["claimName"] + + $log.info "pv: #{pvc}" + pvcs.push(pvc) + end + end + end - #record["pvInventories"] = pvInventoryRecords + $log.info "Pod PVCs: #{pvcs}" # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 8ea9517b7..9ac323d81 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -114,6 +114,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) @@istestvar = ENV["ISTEST"] begin + $log.info "pvInventory: #{pvInventory}" + records = [] pvInventory["items"].each do |item| @@ -133,6 +135,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) return records end + $log.info "hasPVC: #{hasPVC}" + # Check if the PV is an Azure Disk isAzureDisk = false if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? @@ -143,6 +147,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) @diskCount += 1 end + $log.info "isAzureDisk: #{isAzureDisk}" + metricItem = {} metricItem["CollectionTime"] = batchTime metricItem["Computer"] = @@hostName @@ -151,13 +157,15 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + $log.info "metricItem: #{metricItem}" + metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName metricTags["PVName"] = item["metadata"]["name"] metricTags["PVCName"] = pvcName metricTags["PodUID"] = "" - metricTags["Namespace"] = namespace + metricTags["PVCNamespace"] = namespace metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] metricTags["Kind"] = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] metricTags["StorageClassName"] = item["spec"]["storageClassName"] @@ -169,13 +177,19 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags["DiskURI"] = diskUri end + $log.info "metricTags: #{metricTags}" + metricItem["Tags"] = metricTags records.push(metricItem) $log.info("PV inventory record: #{metricItem}") end + $log.info "went through all pv's" + @pvCount += records.length + $log.info "pvCount: #{pvCount}" + records.each do |record| if !record.nil? wrapper = { From 8312caf3a59b6185624117ae7235e8bd032b4eda Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 12:15:40 -0700 Subject: [PATCH 40/57] parsing fixes --- source/plugins/ruby/in_kube_podinventory.rb | 2 +- source/plugins/ruby/in_kube_pvinventory.rb | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 9351b0f93..b28008cf9 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -198,7 +198,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? $log.info "pvc on the pod" - pvc = podNamespace + "/" + volume["persistentVolumeClaim"]["claimName"] + pvc = podNameSpace + "/" + volume["persistentVolumeClaim"]["claimName"] $log.info "pv: #{pvc}" pvcs.push(pvc) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 9ac323d81..b36800b26 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -119,10 +119,12 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) records = [] pvInventory["items"].each do |item| + $log.info "item: #{item}" + # Check if the PV has a PVC hasPVC = false if !item["spec"].nil? && !item["spec"]["claimRef"].nil? - item["spec"]["claimRef"].each do |claimRef| + claimRef = item["spec"]["claimRef"] if claimRef["kind"] == "PersistentVolumeClaim" hasPVC = true namespace = claimRef["namespace"] From 610cc717a847c8e74d0764958c13fcd1601e7b78 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 14:29:44 -0700 Subject: [PATCH 41/57] pv inventory in pod inventory and more telemetry --- source/plugins/ruby/in_kube_podinventory.rb | 101 +++++++++++++------- source/plugins/ruby/in_kube_pvinventory.rb | 34 ++++--- 2 files changed, 87 insertions(+), 48 deletions(-) diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index b28008cf9..1b75a52ca 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -88,36 +88,63 @@ def enumerate(podList = nil) end # Get PVCs first so that we dont need to make a call for every chunk - #pvInfo = nil - #$log.info("in_kube_podinventory::enumerate : Getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - #pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumeclaims") - #$log.info("in_kube_podinventory::enumerate : Done getting PVCs from Kube API @ #{Time.now.utc.iso8601}") - - #if !pvInfo.nil? - #$log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - #pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) - #$log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - #pvInfo = nil - #end - - #pvNameToInventoryHash = {} - #if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) - #pvInventory["items"].each do |item| - #pvRecord = {} - #pvRecord["Name"] = item["metadata"]["name"] - #pvRecord["Namespace"] = item["metadata"]["namespace"] - #pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - #pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - #pvRecord["VolumeName"] = item["spec"]["volumeName"] - #pvRecord["StorageClassName"] = item["spec"]["storageClassName"] - #pvRecord["Status"] = item["status"]["phase"] - #pvRecord["AccessMode"] = item["status"]["accessModes"][0] - #pvRecord["RequestSize"] = item["status"]["capacity"]["storage"] - - #pvNameToInventoryHash[pvRecord["Name"]] = pvRecord.dup - #else - #$log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" - #end + pvInfo = nil + $log.info("in_kube_podinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumes") + $log.info("in_kube_podinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + + if !pvInfo.nil? + $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) + $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInfo = nil + end + + pvcNameToPVInventoryHash = {} + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + pvInventory["items"].each do |item| + + # Check if the PV has a PVC + hasPVC = false + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + claimRef = item["spec"]["claimRef"] + if claimRef["kind"] == "PersistentVolumeClaim" + hasPVC = true + namespace = claimRef["namespace"] + pvcName = claimRef["name"] + end + end + + if hasPVC + # Check if the PV is an Azure Disk + isAzureDisk = false + if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? + isAzureDisk = true + azureDisk = item["spec"]["azureDisk"] + diskName = azureDisk["diskName"] + diskUri = azureDisk["diskURI"] + end + + pvRecord = {} + pvRecord["PVName"] = item["metadata"]["name"] + pvRecord["PVCName"] pvcName + pvRecord["Namespace"] = namespace + pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] + pvRecord["StorageClassName"] = item["spec"]["storageClassName"] + pvRecord["Status"] = item["status"]["phase"] + pvRecord["AccessMode"] = item["status"]["accessModes"] + pvRecord["RequestSize"] = item["spec"]["capacity"]["storage"] + + pvcNamespaceAndName = namespace + "/" + pvcName + pvcNameToPVInventoryHash[pvcNamespaceAndName] = pvRecord.dup + end + end + else + $log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" + end + + $log.info "pvcNameToPVInventoryHash #{pvcNameToPVInventoryHash}" # Initializing continuation token to nil continuationToken = nil @@ -125,7 +152,7 @@ def enumerate(podList = nil) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -134,7 +161,7 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -173,7 +200,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 @@ -192,7 +219,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] - pvcs = [] + pvInventories = [] if !items["spec"].nil? && !items["spec"]["volumes"].nil? items["spec"]["volumes"].each do |volume| if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? @@ -201,12 +228,12 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi pvc = podNameSpace + "/" + volume["persistentVolumeClaim"]["claimName"] $log.info "pv: #{pvc}" - pvcs.push(pvc) + pvInventories.push(pvcNameToPVInventoryHash[pvc]) end end end - $log.info "Pod PVCs: #{pvcs}" + $log.info "Pod PV inventories: #{pvInventories}" # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && @@ -220,7 +247,7 @@ def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTi next end record["PodUid"] = podUid - record["PodLabel"] = [items["metadata"]["labels"]] + record["PodLabel"] = pvInventories record["Namespace"] = podNameSpace record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] #for unscheduled (non-started) pods startTime does NOT exist diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index b36800b26..9fee8bcea 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -11,7 +11,6 @@ def initialize require "yajl/json_gem" require "yajl" require "time" - require_relative "KubernetesApiClient" require_relative "ApplicationInsightsUtility" require_relative "oms_common" @@ -21,6 +20,7 @@ def initialize @PV_CHUNK_SIZE = "1500" @pvCount = 0 @diskCount = 0 + @pvKindToCountHash = {} end config_param :run_interval, :time, :default => 60 @@ -56,6 +56,7 @@ def enumerate telemetryFlush = false @pvCount = 0 @diskCount = 0 + @pvKindToCountHash = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -94,6 +95,7 @@ def enumerate if telemetryFlush == true telemetryProperties = {} telemetryProperties["Computer"] = @@hostName + telemetryProperties["CountsOfPVKinds"] = @pvKindToCountHash ApplicationInsightsUtility.sendCustomEvent("KubePVInventoryHeartBeatEvent", telemetryProperties) ApplicationInsightsUtility.sendMetricTelemetry("PVCount", @pvCount, {}) ApplicationInsightsUtility.sendMetricTelemetry("DiskCount", @diskCount, {}) @@ -124,12 +126,11 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) # Check if the PV has a PVC hasPVC = false if !item["spec"].nil? && !item["spec"]["claimRef"].nil? - claimRef = item["spec"]["claimRef"] - if claimRef["kind"] == "PersistentVolumeClaim" - hasPVC = true - namespace = claimRef["namespace"] - pvcName = claimRef["name"] - end + claimRef = item["spec"]["claimRef"] + if claimRef["kind"] == "PersistentVolumeClaim" + hasPVC = true + namespace = claimRef["namespace"] + pvcName = claimRef["name"] end end # Return if no PVC @@ -151,6 +152,15 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) $log.info "isAzureDisk: #{isAzureDisk}" + if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil + kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] + if (@pvKindToCountHash.has_key? kind) + @pvKindToCountHash[kind] += 1 + else + @pvKindToCountHash[kind] = 1 + end + end + metricItem = {} metricItem["CollectionTime"] = batchTime metricItem["Computer"] = @@hostName @@ -169,10 +179,10 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags["PodUID"] = "" metricTags["PVCNamespace"] = namespace metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - metricTags["Kind"] = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] + metricTags["Kind"] = kind metricTags["StorageClassName"] = item["spec"]["storageClassName"] metricTags["Status"] = item["status"]["phase"] - metricTags["AccessMode"] = item["spec"]["accessModes"][0] + metricTags["AccessMode"] = item["spec"]["accessModes"] metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] if isAzureDisk metricTags["DiskName"] = diskName @@ -190,7 +200,9 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) @pvCount += records.length - $log.info "pvCount: #{pvCount}" + $log.info "pvCount: #{@pvCount}" + $log.info "diskCount: #{@diskCount}" + $log.info "pvKindToCountHash: #{@pvKindToCountHash}" records.each do |record| if !record.nil? @@ -203,7 +215,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - router.emit_stream(@tag, eventStream) if eventStream + #router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr From 7c4a547f2cf80aeffc3698c8c6fb907b6e16a73a Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 16:26:53 -0700 Subject: [PATCH 42/57] cleanup and add logging for kube api response size --- source/plugins/ruby/KubernetesApiClient.rb | 2 +- source/plugins/ruby/in_kube_podinventory.rb | 84 +-------------------- source/plugins/ruby/in_kube_pvinventory.rb | 33 +------- 3 files changed, 9 insertions(+), 110 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index 36dcdd8c6..b6844f659 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -52,7 +52,7 @@ def getKubeResourceInfo(resource, api_group: nil) kubeApiRequest["Authorization"] = "Bearer " + getTokenStr @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} with size #{response.content_length} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end end end diff --git a/source/plugins/ruby/in_kube_podinventory.rb b/source/plugins/ruby/in_kube_podinventory.rb index 1b75a52ca..bffa725ee 100644 --- a/source/plugins/ruby/in_kube_podinventory.rb +++ b/source/plugins/ruby/in_kube_podinventory.rb @@ -87,72 +87,13 @@ def enumerate(podList = nil) serviceInfo = nil end - # Get PVCs first so that we dont need to make a call for every chunk - pvInfo = nil - $log.info("in_kube_podinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") - pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumes") - $log.info("in_kube_podinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") - - if !pvInfo.nil? - $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInfo = nil - end - - pvcNameToPVInventoryHash = {} - if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) - pvInventory["items"].each do |item| - - # Check if the PV has a PVC - hasPVC = false - if !item["spec"].nil? && !item["spec"]["claimRef"].nil? - claimRef = item["spec"]["claimRef"] - if claimRef["kind"] == "PersistentVolumeClaim" - hasPVC = true - namespace = claimRef["namespace"] - pvcName = claimRef["name"] - end - end - - if hasPVC - # Check if the PV is an Azure Disk - isAzureDisk = false - if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? - isAzureDisk = true - azureDisk = item["spec"]["azureDisk"] - diskName = azureDisk["diskName"] - diskUri = azureDisk["diskURI"] - end - - pvRecord = {} - pvRecord["PVName"] = item["metadata"]["name"] - pvRecord["PVCName"] pvcName - pvRecord["Namespace"] = namespace - pvRecord["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - pvRecord["Kind"] = item["metadata"]["annotations"]["volume.beta.kubernetes.io/storage-provisioner"] - pvRecord["StorageClassName"] = item["spec"]["storageClassName"] - pvRecord["Status"] = item["status"]["phase"] - pvRecord["AccessMode"] = item["status"]["accessModes"] - pvRecord["RequestSize"] = item["spec"]["capacity"]["storage"] - - pvcNamespaceAndName = namespace + "/" + pvcName - pvcNameToPVInventoryHash[pvcNamespaceAndName] = pvRecord.dup - end - end - else - $log.warn "in_kube_podinventory::enumerate:Received empty pvInventory" - end - - $log.info "pvcNameToPVInventoryHash #{pvcNameToPVInventoryHash}" - # Initializing continuation token to nil continuationToken = nil $log.info("in_kube_podinventory::enumerate : Getting pods from Kube API @ #{Time.now.utc.iso8601}") continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}") $log.info("in_kube_podinventory::enumerate : Done getting pods from Kube API @ #{Time.now.utc.iso8601}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end @@ -161,14 +102,13 @@ def enumerate(podList = nil) while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, podInventory = KubernetesApiClient.getResourcesAndContinuationToken("pods?limit=#{@PODS_CHUNK_SIZE}&continue=#{continuationToken}") if (!podInventory.nil? && !podInventory.empty? && podInventory.key?("items") && !podInventory["items"].nil? && !podInventory["items"].empty?) - parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime) + parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime) else $log.warn "in_kube_podinventory::enumerate:Received empty podInventory" end end # Setting these to nil so that we dont hold memory until GC kicks in - #pvInventory = nil podInventory = nil serviceList = nil @@ -200,7 +140,7 @@ def enumerate(podList = nil) end end - def parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, continuationToken, batchTime = Time.utc.iso8601) + def parse_and_emit_records(podInventory, serviceList, continuationToken, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f #batchTime = currentTime.utc.iso8601 @@ -219,22 +159,6 @@ def parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, record["Name"] = items["metadata"]["name"] podNameSpace = items["metadata"]["namespace"] - pvInventories = [] - if !items["spec"].nil? && !items["spec"]["volumes"].nil? - items["spec"]["volumes"].each do |volume| - if !volume["persistentVolumeClaim"].nil? && !volume["persistentVolumeClaim"]["claimName"].nil? - $log.info "pvc on the pod" - - pvc = podNameSpace + "/" + volume["persistentVolumeClaim"]["claimName"] - - $log.info "pv: #{pvc}" - pvInventories.push(pvcNameToPVInventoryHash[pvc]) - end - end - end - - $log.info "Pod PV inventories: #{pvInventories}" - # For ARO v3 cluster, skip the pods scheduled on to master or infra nodes if KubernetesApiClient.isAROV3Cluster && !items["spec"].nil? && !items["spec"]["nodeName"].nil? && (items["spec"]["nodeName"].downcase.start_with?("infra-") || @@ -247,7 +171,7 @@ def parse_and_emit_records(podInventory, serviceList, pvcNameToPVInventoryHash, next end record["PodUid"] = podUid - record["PodLabel"] = pvInventories + record["PodLabel"] = [items["metadata"]["labels"]] record["Namespace"] = podNameSpace record["PodCreationTimeStamp"] = items["metadata"]["creationTimestamp"] #for unscheduled (non-started) pods startTime does NOT exist diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 9fee8bcea..d08186c72 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -18,8 +18,6 @@ def initialize require_relative "constants" @PV_CHUNK_SIZE = "1500" - @pvCount = 0 - @diskCount = 0 @pvKindToCountHash = {} end @@ -54,8 +52,6 @@ def enumerate begin pvInventory = nil telemetryFlush = false - @pvCount = 0 - @diskCount = 0 @pvKindToCountHash = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -97,8 +93,6 @@ def enumerate telemetryProperties["Computer"] = @@hostName telemetryProperties["CountsOfPVKinds"] = @pvKindToCountHash ApplicationInsightsUtility.sendCustomEvent("KubePVInventoryHeartBeatEvent", telemetryProperties) - ApplicationInsightsUtility.sendMetricTelemetry("PVCount", @pvCount, {}) - ApplicationInsightsUtility.sendMetricTelemetry("DiskCount", @diskCount, {}) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end @@ -116,13 +110,9 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) @@istestvar = ENV["ISTEST"] begin - $log.info "pvInventory: #{pvInventory}" - records = [] pvInventory["items"].each do |item| - $log.info "item: #{item}" - # Check if the PV has a PVC hasPVC = false if !item["spec"].nil? && !item["spec"]["claimRef"].nil? @@ -138,8 +128,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) return records end - $log.info "hasPVC: #{hasPVC}" - # Check if the PV is an Azure Disk isAzureDisk = false if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? @@ -147,12 +135,10 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) azureDisk = item["spec"]["azureDisk"] diskName = azureDisk["diskName"] diskUri = azureDisk["diskURI"] - @diskCount += 1 end - $log.info "isAzureDisk: #{isAzureDisk}" - - if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil + # Get telemetry on PV kind + if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil? kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] if (@pvKindToCountHash.has_key? kind) @pvKindToCountHash[kind] += 1 @@ -169,14 +155,11 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - $log.info "metricItem: #{metricItem}" - metricTags = {} metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName metricTags["PVName"] = item["metadata"]["name"] metricTags["PVCName"] = pvcName - metricTags["PodUID"] = "" metricTags["PVCNamespace"] = namespace metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] metricTags["Kind"] = kind @@ -184,24 +167,16 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags["Status"] = item["status"]["phase"] metricTags["AccessMode"] = item["spec"]["accessModes"] metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] + if isAzureDisk metricTags["DiskName"] = diskName metricTags["DiskURI"] = diskUri end - $log.info "metricTags: #{metricTags}" - metricItem["Tags"] = metricTags records.push(metricItem) - $log.info("PV inventory record: #{metricItem}") end - $log.info "went through all pv's" - - @pvCount += records.length - - $log.info "pvCount: #{@pvCount}" - $log.info "diskCount: #{@diskCount}" $log.info "pvKindToCountHash: #{@pvKindToCountHash}" records.each do |record| @@ -215,7 +190,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - #router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr From a578ba3487c1b78401bb879d2c43f58092de2583 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 17:00:21 -0700 Subject: [PATCH 43/57] payload investigation --- source/plugins/ruby/in_kube_pvinventory.rb | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index d08186c72..1edb37563 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -56,10 +56,23 @@ def enumerate currentTime = Time.now batchTime = currentTime.utc.iso8601 - continuationToken = nil - $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") - continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") - $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + #continuationToken = nil + #$log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + #continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") + #$log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + + pvInfo = nil + $log.info("in_kube_podinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumes") + $log.info("in_kube_podinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + + if !pvInfo.nil? + $log.info("in_kube_podinventory::enumerate : Request body size of #{pvInfo.body.size}") + $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) + $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + pvInfo = nil + end if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) parse_and_emit_records(pvInventory, batchTime) @@ -139,7 +152,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) # Get telemetry on PV kind if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil? - kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"] + kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].downcase if (@pvKindToCountHash.has_key? kind) @pvKindToCountHash[kind] += 1 else @@ -167,7 +180,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) metricTags["Status"] = item["status"]["phase"] metricTags["AccessMode"] = item["spec"]["accessModes"] metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] - + if isAzureDisk metricTags["DiskName"] = diskName metricTags["DiskURI"] = diskUri From 4578b807796d9535301dff3afb6d009c8c0e3a4e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 9 Sep 2020 17:29:35 -0700 Subject: [PATCH 44/57] getting more resposne size info --- source/plugins/ruby/KubernetesApiClient.rb | 2 +- source/plugins/ruby/in_kube_pvinventory.rb | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/source/plugins/ruby/KubernetesApiClient.rb b/source/plugins/ruby/KubernetesApiClient.rb index b6844f659..36dcdd8c6 100644 --- a/source/plugins/ruby/KubernetesApiClient.rb +++ b/source/plugins/ruby/KubernetesApiClient.rb @@ -52,7 +52,7 @@ def getKubeResourceInfo(resource, api_group: nil) kubeApiRequest["Authorization"] = "Bearer " + getTokenStr @Log.info "KubernetesAPIClient::getKubeResourceInfo : Making request to #{uri.request_uri} @ #{Time.now.utc.iso8601}" response = http.request(kubeApiRequest) - @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} with size #{response.content_length} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" + @Log.info "KubernetesAPIClient::getKubeResourceInfo : Got response of #{response.code} for #{uri.request_uri} @ #{Time.now.utc.iso8601}" end end end diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 1edb37563..22dc2942a 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -17,7 +17,7 @@ def initialize require_relative "omslog" require_relative "constants" - @PV_CHUNK_SIZE = "1500" + @PV_CHUNK_SIZE = "5000" @pvKindToCountHash = {} end @@ -62,15 +62,17 @@ def enumerate #$log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") pvInfo = nil - $log.info("in_kube_podinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumes") - $log.info("in_kube_podinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") if !pvInfo.nil? - $log.info("in_kube_podinventory::enumerate : Request body size of #{pvInfo.body.size}") - $log.info("in_kube_podinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + $log.info("in_kube_pvinventory::enumerate : Response size of #{pvInfo.size}") + $log.info("in_kube_pvinventory::enumerate : Response header size of #{pvInfo.header.size}") + $log.info("in_kube_pvinventory::enumerate : Response body size of #{pvInfo.body.size}") + $log.info("in_kube_pvinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) - $log.info("in_kube_podinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") + $log.info("in_kube_pvinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") pvInfo = nil end @@ -203,7 +205,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - router.emit_stream(@tag, eventStream) if eventStream + #router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr From bc6b8ccd4b79e7188f749a57471620d66b10c74b Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 10 Sep 2020 11:43:36 -0700 Subject: [PATCH 45/57] use continuation token, get rid of mdm path --- build/linux/installer/conf/kube.conf | 2 +- source/plugins/ruby/in_kube_pvinventory.rb | 25 ++-------------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/build/linux/installer/conf/kube.conf b/build/linux/installer/conf/kube.conf index 52ebffa54..cd7e996cb 100644 --- a/build/linux/installer/conf/kube.conf +++ b/build/linux/installer/conf/kube.conf @@ -195,7 +195,7 @@ max_retry_wait 5m - + type out_mdm log_level debug num_threads 5 diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 22dc2942a..525ae61b8 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -56,32 +56,11 @@ def enumerate currentTime = Time.now batchTime = currentTime.utc.iso8601 - #continuationToken = nil - #$log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") - #continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") - #$log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") - - pvInfo = nil + continuationToken = nil $log.info("in_kube_pvinventory::enumerate : Getting PVs from Kube API @ #{Time.now.utc.iso8601}") - pvInfo = KubernetesApiClient.getKubeResourceInfo("persistentvolumes") + continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") - if !pvInfo.nil? - $log.info("in_kube_pvinventory::enumerate : Response size of #{pvInfo.size}") - $log.info("in_kube_pvinventory::enumerate : Response header size of #{pvInfo.header.size}") - $log.info("in_kube_pvinventory::enumerate : Response body size of #{pvInfo.body.size}") - $log.info("in_kube_pvinventory::enumerate:Start:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInventory = Yajl::Parser.parse(StringIO.new(pvInfo.body)) - $log.info("in_kube_pvinventory::enumerate:End:Parsing pvc data using yajl @ #{Time.now.utc.iso8601}") - pvInfo = nil - end - - if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) - parse_and_emit_records(pvInventory, batchTime) - else - $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" - end - # If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}&continue=#{continuationToken}") From 75452144905fd282b345ff62026bc3ca91607588 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 10 Sep 2020 13:05:41 -0700 Subject: [PATCH 46/57] use kubepvinventory path --- source/plugins/ruby/constants.rb | 1 + source/plugins/ruby/in_kube_pvinventory.rb | 61 +++++++++++----------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 5409a21b6..188b2b0c7 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -83,6 +83,7 @@ class Constants CONTAINER_RESOURCE_UTIL_HEART_BEAT_EVENT = "ContainerResourceUtilMdmHeartBeatEvent" PV_USAGE_HEART_BEAT_EVENT = "PVUsageMdmHeartBeatEvent" PV_KUBE_SYSTEM_METRICS_ENABLED_EVENT = "CollectPVKubeSystemMetricsEnabled" + PV_INVENTORY_HEART_BEAT_EVENT = "KubePVInventoryHeartBeatEvent" TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 10 KUBE_STATE_TELEMETRY_FLUSH_INTERVAL_IN_MINUTES = 15 MDM_TIME_SERIES_FLUSHED_IN_LAST_HOUR = "MdmTimeSeriesFlushedInLastHour" diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 525ae61b8..8b63a2eda 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -2,7 +2,6 @@ module Fluent class Kube_PVInventory_Input < Input Plugin.register_input("kubepvinventory", self) - @@MDMKubePVInventoryTag = "mdm.kubepvinventory" @@hostName = (OMS::Common.get_hostname) def initialize @@ -84,9 +83,8 @@ def enumerate # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} - telemetryProperties["Computer"] = @@hostName telemetryProperties["CountsOfPVKinds"] = @pvKindToCountHash - ApplicationInsightsUtility.sendCustomEvent("KubePVInventoryHeartBeatEvent", telemetryProperties) + ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end @@ -141,34 +139,36 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - metricItem = {} - metricItem["CollectionTime"] = batchTime - metricItem["Computer"] = @@hostName - metricItem["Name"] = "pvInventory" - metricItem["Value"] = 0 - metricItem["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - metricItem["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - - metricTags = {} - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERID] = KubernetesApiClient.getClusterId - metricTags[Constants::INSIGHTSMETRICS_TAGS_CLUSTERNAME] = KubernetesApiClient.getClusterName - metricTags["PVName"] = item["metadata"]["name"] - metricTags["PVCName"] = pvcName - metricTags["PVCNamespace"] = namespace - metricTags["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - metricTags["Kind"] = kind - metricTags["StorageClassName"] = item["spec"]["storageClassName"] - metricTags["Status"] = item["status"]["phase"] - metricTags["AccessMode"] = item["spec"]["accessModes"] - metricTags["RequestSize"] = item["spec"]["capacity"]["storage"] - + # Node and Pod info can be found by joining with pvUsedBytes metric using namespace/PVCName + record = {} + record["CollectionTime"] = batchTime + record["ClusterId"] = KubernetesApiClient.getClusterId + record["ClusterName"] = KubernetesApiClient.getClusterName + # Name or PVName + record["Name"] = item["metadata"]["name"] + record["PVCName"] = pvcName + # Namespace, PodNamespace, or PVNamespace + record["Namespace"] = namespace + record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] + # Should kubernetes.io/ be removed? + record["Kind"] = kind + # This is the storage class name rather than type. Would require another api call to get more storage class info + record["StorageClassName"] = item["spec"]["storageClassName"] + # Available, Bound, Released, Failed + record["Status"] = item["status"]["phase"] + # RWO for azure disks; azure files can have multiple in the spec: RWO, ROX, and/or RWX + record["AccessModes"] = item["spec"]["accessModes"] + # This is a string + record["RequestSize"] = item["spec"]["capacity"]["storage"] + # Should these be their own columns or tags for PV Kind + kindTags = {} if isAzureDisk - metricTags["DiskName"] = diskName - metricTags["DiskURI"] = diskUri + kindTags["DiskName"] = diskName + kindTags["DiskURI"] = diskUri end + record["KindInfo"] = kindTags - metricItem["Tags"] = metricTags - records.push(metricItem) + records.push(record) end $log.info "pvKindToCountHash: #{@pvKindToCountHash}" @@ -176,7 +176,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) records.each do |record| if !record.nil? wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", + "DataType" => "KUBE_PV_INVENTORY_BLOB", "IPName" => "ContainerInsights", "DataItems" => [record.each { |k, v| record[k] = v }], } @@ -184,8 +184,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - #router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream + router.emit_stream(@tag, eventStream) if eventStream rescue => errorStr $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" From 4ae76a1b60ad3046e3da9b1ac85f8af244e2a839 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Thu, 10 Sep 2020 13:31:42 -0700 Subject: [PATCH 47/57] add back in parse_and_emit --- source/plugins/ruby/in_kube_pvinventory.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 8b63a2eda..0b837b650 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -60,6 +60,12 @@ def enumerate continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}") $log.info("in_kube_pvinventory::enumerate : Done getting PVs from Kube API @ #{Time.now.utc.iso8601}") + if (!pvInventory.nil? && !pvInventory.empty? && pvInventory.key?("items") && !pvInventory["items"].nil? && !pvInventory["items"].empty?) + parse_and_emit_records(pvInventory, batchTime) + else + $log.warn "in_kube_pvinventory::enumerate:Received empty pvInventory" + end + # If we receive a continuation token, make calls, process and flush data until we have processed all data while (!continuationToken.nil? && !continuationToken.empty?) continuationToken, pvInventory = KubernetesApiClient.getResourcesAndContinuationToken("persistentvolumes?limit=#{@PV_CHUNK_SIZE}&continue=#{continuationToken}") @@ -158,7 +164,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["Status"] = item["status"]["phase"] # RWO for azure disks; azure files can have multiple in the spec: RWO, ROX, and/or RWX record["AccessModes"] = item["spec"]["accessModes"] - # This is a string + # This is a string record["RequestSize"] = item["spec"]["capacity"]["storage"] # Should these be their own columns or tags for PV Kind kindTags = {} From a9218a1edf58c791b359eaefa41de329ba9a376e Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 14 Sep 2020 10:53:39 -0700 Subject: [PATCH 48/57] additions for PV Type --- source/plugins/ruby/in_kube_pvinventory.rb | 48 ++++++++++++++-------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 0b837b650..138eb4d22 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -79,17 +79,17 @@ def enumerate # Setting this to nil so that we dont hold memory until GC kicks in pvInventory = nil - # Adding telemetry to send pod telemetry every 5 minutes + # Adding telemetry to send pod telemetry every 10 minutes timeDifference = (DateTime.now.to_time.to_i - @@pvTelemetryTimeTracker).abs timeDifferenceInMinutes = timeDifference / 60 - if (timeDifferenceInMinutes >= 5) + if (timeDifferenceInMinutes >= Constants::TELEMETRY_FLUSH_INTERVAL_IN_MINUTES) telemetryFlush = true end # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} - telemetryProperties["CountsOfPVKinds"] = @pvKindToCountHash + telemetryProperties["CountsOfPVTypes"] = @pvKindToCountHash ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end @@ -105,7 +105,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) currentTime = Time.now emitTime = currentTime.to_f eventStream = MultiEventStream.new - @@istestvar = ENV["ISTEST"] begin records = [] @@ -126,26 +125,37 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) return records end - # Check if the PV is an Azure Disk + # Check if the PV is an Azure Disk or Azure File isAzureDisk = false + isAzureFile = false if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? isAzureDisk = true azureDisk = item["spec"]["azureDisk"] diskName = azureDisk["diskName"] diskUri = azureDisk["diskURI"] + elsif !item["spec"].nil? && !item["spec"]["azureFile"].nil? + isAzureFile = true + azureFileShareName = item["spec"]["azureFile"]["shareName"] end - # Get telemetry on PV kind + # Get telemetry on PV Type - if statically provisioned, type not in annotations if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil? kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].downcase - if (@pvKindToCountHash.has_key? kind) - @pvKindToCountHash[kind] += 1 - else - @pvKindToCountHash[kind] = 1 - end + elsif isAzureDisk + kind = "kubernetes.io/azure-disk" + elsif isAzureFile + kind = "kubernetes.io/azure-file" + else + kind = "other" + end + if (@pvKindToCountHash.has_key? kind) + @pvKindToCountHash[kind] += 1 + else + @pvKindToCountHash[kind] = 1 end # Node and Pod info can be found by joining with pvUsedBytes metric using namespace/PVCName + # Kube events can also be found using namespace/PVCName record = {} record["CollectionTime"] = batchTime record["ClusterId"] = KubernetesApiClient.getClusterId @@ -153,26 +163,28 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) # Name or PVName record["Name"] = item["metadata"]["name"] record["PVCName"] = pvcName - # Namespace, PodNamespace, or PVNamespace + # Namespace, PodNamespace, or PVCNamespace record["Namespace"] = namespace record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - # Should kubernetes.io/ be removed? - record["Kind"] = kind - # This is the storage class name rather than type. Would require another api call to get more storage class info + # kubernetes.io/azure-disk, kubernetes.io/azure-file + record["Type"] = kind + # This is the storage class name rather than type (standard / premium). Would require another api call to get more storage class info record["StorageClassName"] = item["spec"]["storageClassName"] # Available, Bound, Released, Failed record["Status"] = item["status"]["phase"] # RWO for azure disks; azure files can have multiple in the spec: RWO, ROX, and/or RWX record["AccessModes"] = item["spec"]["accessModes"] - # This is a string + # This is a string i.e 5Gi, should it be numeric? - This can be different from the PVC request size record["RequestSize"] = item["spec"]["capacity"]["storage"] - # Should these be their own columns or tags for PV Kind + # Should these be their own columns or tags for PV Type kindTags = {} if isAzureDisk kindTags["DiskName"] = diskName kindTags["DiskURI"] = diskUri + elsif isAzureFile + kindTags["FileShareName"] = azureFileShareName end - record["KindInfo"] = kindTags + record["TypeInfo"] = kindTags records.push(record) end From bc4a9430b8a338eff42919c0b626b73b00d4eeaa Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 14 Sep 2020 15:28:43 -0700 Subject: [PATCH 49/57] updated schema, sending to insights metrics for testing --- .../plugins/ruby/CAdvisorMetricsAPIClient.rb | 3 +- source/plugins/ruby/constants.rb | 5 +- source/plugins/ruby/in_kube_pvinventory.rb | 116 +++++++++--------- 3 files changed, 67 insertions(+), 57 deletions(-) diff --git a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb index bd1cd1000..daadc1e21 100644 --- a/source/plugins/ruby/CAdvisorMetricsAPIClient.rb +++ b/source/plugins/ruby/CAdvisorMetricsAPIClient.rb @@ -342,6 +342,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric podUid = pod["podRef"]["uid"] podName = pod["podRef"]["name"] pvcName = pvcRef["name"] + pvcNamespace = pvcRef["namespace"] volumeName = volume["name"] metricItem = {} @@ -359,7 +360,7 @@ def getPersistentVolumeMetrics(metricJSON, hostName, metricNameToCollect, metric metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAME] = podName metricTags[Constants::INSIGHTSMETRICS_TAGS_VOLUME_NAME] = volumeName metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAME] = pvcName - metricTags[Constants::INSIGHTSMETRICS_TAGS_POD_NAMESPACE] = podNamespace + metricTags[Constants::INSIGHTSMETRICS_TAGS_PVC_NAMESPACE] = pvcNamespace metricTags[Constants::INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES] = volume["capacityBytes"] metricItem["Tags"] = metricTags diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 188b2b0c7..2b8cff008 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -17,8 +17,8 @@ class Constants INSIGTHTSMETRICS_TAGS_PV_NAMESPACE = "container.azm.ms/pv" INSIGHTSMETRICS_TAGS_VOLUME_NAME = "volumeName" INSIGHTSMETRICS_TAGS_PVC_NAME = "pvcName" + INSIGHTSMETRICS_TAGS_PVC_NAMESPACE = "pvcNamespace" INSIGHTSMETRICS_TAGS_POD_NAME = "podName" - INSIGHTSMETRICS_TAGS_POD_NAMESPACE = "podNamespace" INSIGHTSMETRICS_TAGS_PV_CAPACITY_BYTES = "pvCapacityBytes" INSIGHTSMETRICS_FLUENT_TAG = "oms.api.InsightsMetrics" REASON_OOM_KILLED = "oomkilled" @@ -76,6 +76,9 @@ class Constants TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" + PV_TYPES =["azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quaobyte", "rbd", "scaleIO", "storageOS", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 138eb4d22..5f6c871c7 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -51,7 +51,7 @@ def enumerate begin pvInventory = nil telemetryFlush = false - @pvKindToCountHash = {} + @pvTypeToCountHash = {} currentTime = Time.now batchTime = currentTime.utc.iso8601 @@ -116,85 +116,91 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) claimRef = item["spec"]["claimRef"] if claimRef["kind"] == "PersistentVolumeClaim" hasPVC = true - namespace = claimRef["namespace"] + pvcNamespace = claimRef["namespace"] pvcName = claimRef["name"] end end - # Return if no PVC - if !hasPVC - return records - end - # Check if the PV is an Azure Disk or Azure File + # Determine PV Type + type = "empty" + hasType = false isAzureDisk = false isAzureFile = false - if !item["spec"].nil? && !item["spec"]["azureDisk"].nil? - isAzureDisk = true - azureDisk = item["spec"]["azureDisk"] - diskName = azureDisk["diskName"] - diskUri = azureDisk["diskURI"] - elsif !item["spec"].nil? && !item["spec"]["azureFile"].nil? - isAzureFile = true - azureFileShareName = item["spec"]["azureFile"]["shareName"] + if !item["spec"].nil? + Constants::PV_TYPE.each do |pvType| + + # PV is this type + if !item["spec"][pvType].nil? + type = pvType + hasType = true + + # Get additional info if azure disk/file + if pvType == "azureDisk" + isAzureDisk = true + azureDisk = item["spec"]["azureDisk"] + diskName = azureDisk["diskName"] + diskUri = azureDisk["diskURI"] + elsif pvType == "azureFile" + isAzureFile = true + azureFileShareName = item["spec"]["azureFile"]["shareName"] + end + + end + end end - # Get telemetry on PV Type - if statically provisioned, type not in annotations - if !item["metadata"].nil? && !item["metadata"]["annotations"].nil? && !item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].nil? - kind = item["metadata"]["annotations"]["pv.kubernetes.io/provisioned-by"].downcase - elsif isAzureDisk - kind = "kubernetes.io/azure-disk" - elsif isAzureFile - kind = "kubernetes.io/azure-file" - else - kind = "other" - end - if (@pvKindToCountHash.has_key? kind) - @pvKindToCountHash[kind] += 1 + # Record telemetry + if (@pvTypeToCountHash.has_key? type) + @pvTypeToCountHash[type] += 1 else - @pvKindToCountHash[kind] = 1 + @pvTypeToCountHash[type] = 1 end - # Node and Pod info can be found by joining with pvUsedBytes metric using namespace/PVCName - # Kube events can also be found using namespace/PVCName + # Node and Pod info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName record = {} record["CollectionTime"] = batchTime record["ClusterId"] = KubernetesApiClient.getClusterId record["ClusterName"] = KubernetesApiClient.getClusterName - # Name or PVName - record["Name"] = item["metadata"]["name"] - record["PVCName"] = pvcName - # Namespace, PodNamespace, or PVCNamespace - record["Namespace"] = namespace - record["CreationTimeStamp"] = item["metadata"]["creationTimestamp"] - # kubernetes.io/azure-disk, kubernetes.io/azure-file - record["Type"] = kind - # This is the storage class name rather than type (standard / premium). Would require another api call to get more storage class info - record["StorageClassName"] = item["spec"]["storageClassName"] - # Available, Bound, Released, Failed - record["Status"] = item["status"]["phase"] - # RWO for azure disks; azure files can have multiple in the spec: RWO, ROX, and/or RWX - record["AccessModes"] = item["spec"]["accessModes"] - # This is a string i.e 5Gi, should it be numeric? - This can be different from the PVC request size - record["RequestSize"] = item["spec"]["capacity"]["storage"] - # Should these be their own columns or tags for PV Type - kindTags = {} + + record["Name"] = "pvInventory" + record["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN + record["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE + record["Computer"] = @@hostName + + recordTags = {} + recordTags["PVName"] = item["metadata"]["name"] + recordTags["PVStatus"] = item["status"]["phase"] + recordTags["PVAccessModes"] = item["spec"]["accessModes"].join(', ') + recordTags["PVStorageClassName"] = item["spec"]["storageClassName"] + recordTags["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) + recordTags["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + + # Optional values + if hasPVC + recordTags["PVCName"] = pvcName + recordTags["PVCNamespace"] = pvcNamespace + end + if hasType + recordTags["PVType"] = type + end + typeInfo = {} if isAzureDisk - kindTags["DiskName"] = diskName - kindTags["DiskURI"] = diskUri + typeInfo["DiskName"] = diskName + typeInfo["DiskURI"] = diskUri elsif isAzureFile - kindTags["FileShareName"] = azureFileShareName + typeInfo["FileShareName"] = azureFileShareName end - record["TypeInfo"] = kindTags + recordTags["PVTypeInfo"] = typeInfo records.push(record) end - $log.info "pvKindToCountHash: #{@pvKindToCountHash}" + $log.info "pvTypeToCountHash: #{@pvTypeToCountHash}" records.each do |record| if !record.nil? wrapper = { - "DataType" => "KUBE_PV_INVENTORY_BLOB", + "DataType" => "INSIGHTS_METRICS_BLOB", "IPName" => "ContainerInsights", "DataItems" => [record.each { |k, v| record[k] = v }], } @@ -202,7 +208,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end - router.emit_stream(@tag, eventStream) if eventStream + router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" From 948bf9a2cabbc3aa81dfeed9961645d18bb1aec9 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Fri, 18 Sep 2020 13:22:21 -0700 Subject: [PATCH 50/57] bug fixes --- source/plugins/ruby/constants.rb | 4 ++-- source/plugins/ruby/in_kube_pvinventory.rb | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index 2b8cff008..c4fbfd4aa 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -76,9 +76,9 @@ class Constants TELEGRAF_DISK_METRICS = "container.azm.ms/disk" OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" - PV_TYPES =["azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", - "photonPersistentDisk", "portworxVolume", "quaobyte", "rbd", "scaleIO", "storageOS", "vsphereVolume"] + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 5f6c871c7..a49ddec9c 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -124,10 +124,13 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) # Determine PV Type type = "empty" hasType = false + diskName = "" + diskUri = "" isAzureDisk = false + azureFileShareName = "" isAzureFile = false if !item["spec"].nil? - Constants::PV_TYPE.each do |pvType| + (Constants::PV_TYPES).each do |pvType| # PV is this type if !item["spec"][pvType].nil? @@ -191,6 +194,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) typeInfo["FileShareName"] = azureFileShareName end recordTags["PVTypeInfo"] = typeInfo + + record["Tags"] = recordTags records.push(record) end @@ -208,6 +213,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end + router.emit_stream(@tag, eventStream) if eventStream router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr From 9016f14ff984b21296c77cae5854379a5472e0f0 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 5 Oct 2020 14:41:02 -0700 Subject: [PATCH 51/57] route to new LA table --- kubernetes/omsagent.yaml | 24 +++++++++++++ source/plugins/ruby/in_kube_pvinventory.rb | 40 ++++++++-------------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/kubernetes/omsagent.yaml b/kubernetes/omsagent.yaml index e8352e020..6a7466f44 100644 --- a/kubernetes/omsagent.yaml +++ b/kubernetes/omsagent.yaml @@ -21,6 +21,7 @@ rules: "nodes/proxy", "namespaces", "services", + "persistentvolumes" ] verbs: ["list", "get", "watch"] - apiGroups: ["apps", "extensions", "autoscaling"] @@ -67,6 +68,14 @@ data: custom_metrics_azure_regions eastus,southcentralus,westcentralus,westus2,southeastasia,northeurope,westeurope,southafricanorth,centralus,northcentralus,eastus2,koreacentral,eastasia,centralindia,uksouth,canadacentral,francecentral,japaneast,australiaeast,eastus2,westus,australiasoutheast,brazilsouth,germanywestcentral,northcentralus,switzerlandnorth + #Kubernetes Persistent Volume inventory + + type kubepvinventory + tag oms.containerinsights.KubePVInventory + run_interval 60 + log_level debug + + #Kubernetes events type kubeevents @@ -149,6 +158,21 @@ data: max_retry_wait 5m + + type out_oms + log_level debug + num_threads 5 + buffer_chunk_limit 4m + buffer_type file + buffer_path %STATE_DIR_WS%/state/out_oms_kubepv*.buffer + buffer_queue_limit 20 + buffer_queue_full_action drop_oldest_chunk + flush_interval 20s + retry_limit 10 + retry_wait 5s + max_retry_wait 5m + + type out_oms log_level debug diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index a49ddec9c..e87656401 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -122,13 +122,13 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end # Determine PV Type - type = "empty" hasType = false + isAzureDisk = false + isAzureFile = false + type = "empty" diskName = "" diskUri = "" - isAzureDisk = false azureFileShareName = "" - isAzureFile = false if !item["spec"].nil? (Constants::PV_TYPES).each do |pvType| @@ -164,27 +164,20 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["CollectionTime"] = batchTime record["ClusterId"] = KubernetesApiClient.getClusterId record["ClusterName"] = KubernetesApiClient.getClusterName - - record["Name"] = "pvInventory" - record["Origin"] = Constants::INSIGHTSMETRICS_TAGS_ORIGIN - record["Namespace"] = Constants::INSIGTHTSMETRICS_TAGS_PV_NAMESPACE - record["Computer"] = @@hostName - - recordTags = {} - recordTags["PVName"] = item["metadata"]["name"] - recordTags["PVStatus"] = item["status"]["phase"] - recordTags["PVAccessModes"] = item["spec"]["accessModes"].join(', ') - recordTags["PVStorageClassName"] = item["spec"]["storageClassName"] - recordTags["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) - recordTags["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] + record["PVName"] = item["metadata"]["name"] + record["PVStatus"] = item["status"]["phase"] + record["PVAccessModes"] = item["spec"]["accessModes"].join(', ') + record["PVStorageClassName"] = item["spec"]["storageClassName"] + record["PVCapacityBytes"] = KubernetesApiClient.getMetricNumericValue("memory", item["spec"]["capacity"]["storage"]) + record["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] # Optional values if hasPVC - recordTags["PVCName"] = pvcName - recordTags["PVCNamespace"] = pvcNamespace + record["PVCName"] = pvcName + record["PVCNamespace"] = pvcNamespace end if hasType - recordTags["PVType"] = type + record["PVType"] = type end typeInfo = {} if isAzureDisk @@ -193,19 +186,15 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) elsif isAzureFile typeInfo["FileShareName"] = azureFileShareName end - recordTags["PVTypeInfo"] = typeInfo - - record["Tags"] = recordTags + record["PVTypeInfo"] = typeInfo records.push(record) end - $log.info "pvTypeToCountHash: #{@pvTypeToCountHash}" - records.each do |record| if !record.nil? wrapper = { - "DataType" => "INSIGHTS_METRICS_BLOB", + "DataType" => "KUBE_PV_INVENTORY_BLOB", "IPName" => "ContainerInsights", "DataItems" => [record.each { |k, v| record[k] = v }], } @@ -214,7 +203,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end router.emit_stream(@tag, eventStream) if eventStream - router.emit_stream(Constants::INSIGHTSMETRICS_FLUENT_TAG, eventStream) if eventStream rescue => errorStr $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" From cc8c0235cc5b3a9ce4a14f813b6ad99218dac5e5 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Mon, 5 Oct 2020 17:00:21 -0700 Subject: [PATCH 52/57] refactoring --- source/plugins/ruby/in_kube_pvinventory.rb | 108 ++++++++++----------- 1 file changed, 52 insertions(+), 56 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index e87656401..c8eddf50a 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -110,55 +110,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) records = [] pvInventory["items"].each do |item| - # Check if the PV has a PVC - hasPVC = false - if !item["spec"].nil? && !item["spec"]["claimRef"].nil? - claimRef = item["spec"]["claimRef"] - if claimRef["kind"] == "PersistentVolumeClaim" - hasPVC = true - pvcNamespace = claimRef["namespace"] - pvcName = claimRef["name"] - end - end - - # Determine PV Type - hasType = false - isAzureDisk = false - isAzureFile = false - type = "empty" - diskName = "" - diskUri = "" - azureFileShareName = "" - if !item["spec"].nil? - (Constants::PV_TYPES).each do |pvType| - - # PV is this type - if !item["spec"][pvType].nil? - type = pvType - hasType = true - - # Get additional info if azure disk/file - if pvType == "azureDisk" - isAzureDisk = true - azureDisk = item["spec"]["azureDisk"] - diskName = azureDisk["diskName"] - diskUri = azureDisk["diskURI"] - elsif pvType == "azureFile" - isAzureFile = true - azureFileShareName = item["spec"]["azureFile"]["shareName"] - end - - end - end - end - - # Record telemetry - if (@pvTypeToCountHash.has_key? type) - @pvTypeToCountHash[type] += 1 - else - @pvTypeToCountHash[type] = 1 - end - # Node and Pod info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName record = {} record["CollectionTime"] = batchTime @@ -172,13 +123,12 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["PVCreationTimeStamp"] = item["metadata"]["creationTimestamp"] # Optional values - if hasPVC - record["PVCName"] = pvcName - record["PVCNamespace"] = pvcNamespace - end - if hasType - record["PVType"] = type - end + pvcNamespace, pvcName = getPVCInfo(item) + type, typeInfo = getTypeInfo(item) + + record["PVCNamespace"] = pvcNamespace + record["PVCName"] = pvcName + record["PVType"] = (type != "empty" ? type : nil) typeInfo = {} if isAzureDisk typeInfo["DiskName"] = diskName @@ -189,6 +139,14 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["PVTypeInfo"] = typeInfo records.push(record) + $log.info("in_kube_pvinventory: record #{record}") + + # Record telemetry + if (@pvTypeToCountHash.has_key? type) + @pvTypeToCountHash[type] += 1 + else + @pvTypeToCountHash[type] = 1 + end end records.each do |record| @@ -211,6 +169,44 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) end end + def getPVCInfo(item) + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + claimRef = item["spec"]["claimRef"] + pvcNamespace = claimRef["namespace"] + pvcName = claimRef["name"] + end + end + return pvcNamespace, pvcName + end + + def getTypeInfo(item) + if !item["spec"].nil? + (Constants::PV_TYPES).each do |pvType| + + # PV is this type + if !item["spec"][pvType].nil? + type = pvType + + # Get additional info if azure disk/file + typeInfo = {} + if pvType == "azureDisk" + azureDisk = item["spec"]["azureDisk"] + typeInfo["DiskName"] = azureDisk["diskName"] + typeInfo["DiskUri"] = azureDisk["diskURI"] + elsif pvType == "azureFile" + typeInfo["FileShareName"] = item["spec"]["azureFile"]["shareName"] + end + + return type, typeInfo + + end + end + end + + return "empty", {} + end + + def run_periodic @mutex.lock done = @finished From 062306a7aa9a89470d3bc3c859c2fb3e15017a8c Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Tue, 6 Oct 2020 17:07:11 -0700 Subject: [PATCH 53/57] add back in pv type list --- source/plugins/ruby/constants.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/plugins/ruby/constants.rb b/source/plugins/ruby/constants.rb index db930dddd..375d1a029 100644 --- a/source/plugins/ruby/constants.rb +++ b/source/plugins/ruby/constants.rb @@ -77,6 +77,9 @@ class Constants OMSAGENT_ZERO_FILL = "omsagent" KUBESYSTEM_NAMESPACE_ZERO_FILL = "kube-system" VOLUME_NAME_ZERO_FILL = "-" + PV_TYPES =["awsElasticBlockStore", "azureDisk", "azureFile", "cephfs", "cinder", "csi", "fc", "flexVolume", + "flocker", "gcePersistentDisk", "glusterfs", "hostPath", "iscsi", "local", "nfs", + "photonPersistentDisk", "portworxVolume", "quobyte", "rbd", "scaleIO", "storageos", "vsphereVolume"] #Telemetry constants CONTAINER_METRICS_HEART_BEAT_EVENT = "ContainerMetricsMdmHeartBeatEvent" From b1ff023cb5e7a7d084276a2bc3757fffe3709e65 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 7 Oct 2020 11:34:17 -0700 Subject: [PATCH 54/57] after testing fixes --- source/plugins/ruby/in_kube_pvinventory.rb | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index c8eddf50a..c0220d053 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -129,13 +129,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["PVCNamespace"] = pvcNamespace record["PVCName"] = pvcName record["PVType"] = (type != "empty" ? type : nil) - typeInfo = {} - if isAzureDisk - typeInfo["DiskName"] = diskName - typeInfo["DiskURI"] = diskUri - elsif isAzureFile - typeInfo["FileShareName"] = azureFileShareName - end record["PVTypeInfo"] = typeInfo records.push(record) @@ -172,9 +165,8 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) def getPVCInfo(item) if !item["spec"].nil? && !item["spec"]["claimRef"].nil? claimRef = item["spec"]["claimRef"] - pvcNamespace = claimRef["namespace"] - pvcName = claimRef["name"] - end + pvcNamespace = claimRef["namespace"] + pvcName = claimRef["name"] end return pvcNamespace, pvcName end From 59108b4f9b6075b88ebf0a5d7c1807679de47906 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 7 Oct 2020 12:10:47 -0700 Subject: [PATCH 55/57] comments and rescues --- source/plugins/ruby/in_kube_pvinventory.rb | 73 ++++++++++++++-------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index c0220d053..98854286f 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -16,6 +16,7 @@ def initialize require_relative "omslog" require_relative "constants" + # Response size is around 1500 bytes per PV @PV_CHUNK_SIZE = "5000" @pvKindToCountHash = {} end @@ -110,7 +111,7 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) records = [] pvInventory["items"].each do |item| - # Node and Pod info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName + # Node, pod, & usage info can be found by joining with pvUsedBytes metric using PVCNamespace/PVCName record = {} record["CollectionTime"] = batchTime record["ClusterId"] = KubernetesApiClient.getClusterId @@ -125,16 +126,18 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) # Optional values pvcNamespace, pvcName = getPVCInfo(item) type, typeInfo = getTypeInfo(item) - record["PVCNamespace"] = pvcNamespace record["PVCName"] = pvcName - record["PVType"] = (type != "empty" ? type : nil) + record["PVType"] = type record["PVTypeInfo"] = typeInfo records.push(record) $log.info("in_kube_pvinventory: record #{record}") # Record telemetry + if type == nil + type = "empty" + end if (@pvTypeToCountHash.has_key? type) @pvTypeToCountHash[type] += 1 else @@ -156,46 +159,62 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) router.emit_stream(@tag, eventStream) if eventStream rescue => errorStr - $log.warn "Failed in parse_and_emit_record pv inventory: #{errorStr}" + $log.warn "Failed in parse_and_emit_record for in_kube_pvinventory: #{errorStr}" $log.debug_backtrace(errorStr.backtrace) ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end end def getPVCInfo(item) - if !item["spec"].nil? && !item["spec"]["claimRef"].nil? - claimRef = item["spec"]["claimRef"] - pvcNamespace = claimRef["namespace"] - pvcName = claimRef["name"] + begin + if !item["spec"].nil? && !item["spec"]["claimRef"].nil? + claimRef = item["spec"]["claimRef"] + pvcNamespace = claimRef["namespace"] + pvcName = claimRef["name"] + return pvcNamespace, pvcName + end + rescue => errorStr + $log.warn "Failed in getPVCInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - return pvcNamespace, pvcName + + # No PVC or an error + return nil, nil end def getTypeInfo(item) - if !item["spec"].nil? - (Constants::PV_TYPES).each do |pvType| - - # PV is this type - if !item["spec"][pvType].nil? - type = pvType + begin + if !item["spec"].nil? + (Constants::PV_TYPES).each do |pvType| - # Get additional info if azure disk/file - typeInfo = {} - if pvType == "azureDisk" - azureDisk = item["spec"]["azureDisk"] - typeInfo["DiskName"] = azureDisk["diskName"] - typeInfo["DiskUri"] = azureDisk["diskURI"] - elsif pvType == "azureFile" - typeInfo["FileShareName"] = item["spec"]["azureFile"]["shareName"] - end + # PV is this type + if !item["spec"][pvType].nil? + + # Get additional info if azure disk/file + typeInfo = {} + if pvType == "azureDisk" + azureDisk = item["spec"]["azureDisk"] + typeInfo["DiskName"] = azureDisk["diskName"] + typeInfo["DiskUri"] = azureDisk["diskURI"] + elsif pvType == "azureFile" + typeInfo["FileShareName"] = item["spec"]["azureFile"]["shareName"] + end + + # Can only have one type: return right away when found + return pvType, typeInfo - return type, typeInfo - + end end end + rescue => errorStr + $log.warn "Failed in getTypeInfo for in_kube_pvinventory: #{errorStr}" + $log.debug_backtrace(errorStr.backtrace) + ApplicationInsightsUtility.sendExceptionTelemetry(errorStr) end - return "empty", {} + # No matches from list of types or an error + return nil, {} end From 677d93863a1ad7346b461bd0b0ddd43512359288 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 7 Oct 2020 12:23:39 -0700 Subject: [PATCH 56/57] remove extra logging --- source/plugins/ruby/in_kube_pvinventory.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index 98854286f..d38287f57 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -132,7 +132,6 @@ def parse_and_emit_records(pvInventory, batchTime = Time.utc.iso8601) record["PVTypeInfo"] = typeInfo records.push(record) - $log.info("in_kube_pvinventory: record #{record}") # Record telemetry if type == nil From db842806a7c35ff1f934c5b83935b789e2fffc71 Mon Sep 17 00:00:00 2001 From: Grace Wehner Date: Wed, 7 Oct 2020 12:30:35 -0700 Subject: [PATCH 57/57] fix variable naming --- source/plugins/ruby/in_kube_pvinventory.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/plugins/ruby/in_kube_pvinventory.rb b/source/plugins/ruby/in_kube_pvinventory.rb index d38287f57..b0e09c85b 100644 --- a/source/plugins/ruby/in_kube_pvinventory.rb +++ b/source/plugins/ruby/in_kube_pvinventory.rb @@ -18,7 +18,7 @@ def initialize # Response size is around 1500 bytes per PV @PV_CHUNK_SIZE = "5000" - @pvKindToCountHash = {} + @pvTypeToCountHash = {} end config_param :run_interval, :time, :default => 60 @@ -90,7 +90,7 @@ def enumerate # Flush AppInsights telemetry once all the processing is done if telemetryFlush == true telemetryProperties = {} - telemetryProperties["CountsOfPVTypes"] = @pvKindToCountHash + telemetryProperties["CountsOfPVTypes"] = @pvTypeToCountHash ApplicationInsightsUtility.sendCustomEvent(Constants::PV_INVENTORY_HEART_BEAT_EVENT, telemetryProperties) @@pvTelemetryTimeTracker = DateTime.now.to_time.to_i end