From 2e5071a6c4da1dd2b5815919ec13ebb186d1928f Mon Sep 17 00:00:00 2001 From: Vishwanath Narasimhan Date: Wed, 1 Aug 2018 16:47:22 -0700 Subject: [PATCH 1/9] Adding release history --- README.md | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 18e50ebe3..454d4dc93 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,37 @@ -# Docker Monitoring Agent for OMI Server +# AKS Container Health monitoring -### Code of Conduct +## Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Release History + +### 7/31/2018 - Version microsoft/oms:ciprod07312018 +- Changes for node lost scenario (roll-up pod & container statuses as Unknown) +- Discover unscheduled pods +- KubeNodeInventory - delimit multiple true node conditions for node status +- UTF Encoding support for container logs +- Container environment variable truncated to 200K +- Handle json parsing errors for OMI provider for docker +- Test mode enablement for ACS-engine testing +- Latest OMS agent (1.6.0-163) +- Latest OMI (1.4.2.5) + + +### 6/7/2018 - Version microsoft/oms:ciprod06072018 +- Remove node-0 dependency +- Remove passing WSID & Key as environment variables and pass them as kubernetes secret (for non-AKS; we already pass them as secret for AKS) +- Please note that if you are manually deploying thru yaml you need to - +- Provide workspaceid & key as base64 encoded strings with in double quotes (.yaml has comments to do so as well) +- Provide cluster name twice (for each container – daemonset & replicaset) + +### 5/8/2018 - Version microsoft/oms:ciprod05082018 +- Kubernetes RBAC enablement +- Latest released omsagent (1.6.0-42) +- Bug fix so that we do not collect kube-system namespace container logs when kube api calls fail occasionally (Bug #215107) +- .yaml changes (for RBAC) + + From afc981d504c3f44fd3232892e4823d5d09503d14 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Thu, 11 Oct 2018 21:37:09 -0700 Subject: [PATCH 2/9] Container Log Telemetry --- .gitignore | 3 + installer/conf/td-agent-bit.conf | 7 +- source/code/go/src/plugins/glide.lock | 10 +- source/code/go/src/plugins/glide.yaml | 8 +- source/code/go/src/plugins/oms.go | 9 +- source/code/go/src/plugins/out_oms.go | 10 ++ source/code/go/src/plugins/telemetry.go | 151 ++++++++++++++++++++++++ 7 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 source/code/go/src/plugins/telemetry.go diff --git a/.gitignore b/.gitignore index 92c8c0cf2..e58d69f7b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ /test/code/providers/TestScriptPath.h /test/code/providers/providertestutils.cpp +source/code/go/src/plugins/profiling +.vscode/launch.json +source/code/go/src/plugins/vendor/ \ No newline at end of file diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b5d2309e1..5a1c105bf 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -17,5 +17,8 @@ Skip_Long_Lines On [OUTPUT] - Name oms - Match oms.container.log.* \ No newline at end of file + Name oms + EnableTelemetry true + TelemetryPushInterval 300 + Match oms.container.log.* + AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/glide.lock b/source/code/go/src/plugins/glide.lock index 4597b594a..fc147fe74 100644 --- a/source/code/go/src/plugins/glide.lock +++ b/source/code/go/src/plugins/glide.lock @@ -1,5 +1,5 @@ -hash: bb32415f402ab29751f29b8e394bc974cbc31861453d817aaeb94ef83dacc488 -updated: 2018-09-14T18:14:28.748047598Z +hash: a6a873d09ed9c3d890a70122e61efba992ead9850fe48f6fcb020d86800d4ade +updated: 2018-10-10T13:37:51.9703908-07:00 imports: - name: github.com/fluent/fluent-bit-go version: c4a158a6e3a793166c6ecfa2d5c80d71eada8959 @@ -38,8 +38,10 @@ imports: - diskcache - name: github.com/json-iterator/go version: f2b4162afba35581b6d4a50d3b8f34e33c144682 -- name: github.com/mitchellh/mapstructure - version: fa473d140ef3c6adf42d6b391fe76707f1f243c8 +- name: github.com/Microsoft/ApplicationInsights-Go + version: d2df5d440eda5372f24fcac03839a64d6cb5f7e5 + subpackages: + - appinsights - name: github.com/modern-go/concurrent version: bacd9c7ef1dd9b15be4a9909b8ac7a4e313eec94 - name: github.com/modern-go/reflect2 diff --git a/source/code/go/src/plugins/glide.yaml b/source/code/go/src/plugins/glide.yaml index 403e1efc4..b2829391b 100644 --- a/source/code/go/src/plugins/glide.yaml +++ b/source/code/go/src/plugins/glide.yaml @@ -1,10 +1,8 @@ -package: plugins +package: . import: - package: github.com/fluent/fluent-bit-go subpackages: - output -- package: github.com/mitchellh/mapstructure - version: ^1.0.0 - package: gopkg.in/natefinch/lumberjack.v2 version: ^2.1.0 - package: k8s.io/apimachinery @@ -15,3 +13,7 @@ import: subpackages: - kubernetes - rest +- package: github.com/Microsoft/ApplicationInsights-Go + version: ^0.4.2 + subpackages: + - appinsights diff --git a/source/code/go/src/plugins/oms.go b/source/code/go/src/plugins/oms.go index d20f11d57..807e00937 100644 --- a/source/code/go/src/plugins/oms.go +++ b/source/code/go/src/plugins/oms.go @@ -42,6 +42,8 @@ var ( OMSEndpoint string // Computer (Hostname) when ingesting into ContainerLog table Computer string + // WorkspaceID log analytics workspace id + WorkspaceID string ) var ( @@ -170,6 +172,7 @@ func updateKubeSystemContainerIDs() { pods, err := ClientSet.CoreV1().Pods("kube-system").List(metav1.ListOptions{}) if err != nil { Log("Error getting pods %s\nIt is ok to log here and continue. Kube-system logs will be collected", err.Error()) + continue } _ignoreIDSet := make(map[string]bool) @@ -269,7 +272,10 @@ func PostDataHelper(tailPluginRecords []map[interface{}]interface{}) int { return output.FLB_RETRY } - Log("Successfully flushed %d records in %s", len(dataItems), elapsed) + numRecords := len(dataItems) + Log("Successfully flushed %d records in %s", numRecords, elapsed) + FlushedRecordsCount += float64(numRecords) + FlushedRecordsTimeTaken += float64(elapsed / time.Millisecond) } return output.FLB_OK @@ -322,6 +328,7 @@ func InitializePlugin(pluginConfPath string) { log.Fatalf("Error Reading omsadmin configuration %s\n", err.Error()) } OMSEndpoint = omsadminConf["OMS_ENDPOINT"] + WorkspaceID = omsadminConf["WORKSPACE_ID"] Log("OMSEndpoint %s", OMSEndpoint) // Initialize image,name map refresh ticker diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 0efc1242d..37c9eb12b 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -5,6 +5,7 @@ import ( ) import ( "C" + "strings" "unsafe" ) @@ -19,6 +20,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) + enablePlugin := output.FLBPluginConfigKey(ctx, "EnableTelemetry") + telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") + agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") + + if strings.Compare(strings.ToLower(enablePlugin), "true") == 0 { + go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) + SendEvent(EventNameContainerLogInit, make(map[string]string)) + } return output.FLB_OK } @@ -48,6 +57,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { + defer TelemetryShutdown() KubeSystemContainersRefreshTicker.Stop() ContainerImageNameRefreshTicker.Stop() return output.FLB_OK diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go new file mode 100644 index 000000000..4d4ab2371 --- /dev/null +++ b/source/code/go/src/plugins/telemetry.go @@ -0,0 +1,151 @@ +package main + +import ( + "encoding/base64" + "errors" + "os" + "strconv" + "strings" + "time" + + "github.com/Microsoft/ApplicationInsights-Go/appinsights" +) + +var ( + // FlushedRecordsCount indicates the number of flushed records in the current period + FlushedRecordsCount float64 + // FlushedRecordsTimeTaken indicates the cumulative time taken to flush the records for the current period + FlushedRecordsTimeTaken float64 + // CommonProperties indicates the dimensions that are sent with every event/metric + CommonProperties map[string]string + // TelemetryClient is the client used to send the telemetry + TelemetryClient appinsights.TelemetryClient + // ContainerLogTelemetryTicker sends telemetry periodically + ContainerLogTelemetryTicker *time.Ticker +) + +const ( + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsInstrumentationKey = "APPLICATIONINSIGHTS_INSTRUMENTATIONKEY" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushInterval = 300 + + // EventNameContainerLogInit name of the event + EventNameContainerLogInit = "ContainerLogPluginInitialized" +) + +// Initialize initializes the telemetry artifacts +func initialize(telemetryIntervalProperty string, agentVersion string) (int, error) { + + telemetryInterval, err := strconv.Atoi(telemetryIntervalProperty) + if err != nil { + telemetryInterval = defaultTelemetryPushInterval + } + + ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryInterval)) + + encodedIkey := os.Getenv(envAppInsightsInstrumentationKey) + if encodedIkey == "" { + Log("App Insights IKey missing in Environment Variables \n") + return -1, errors.New("Missing App Insights Instrumentation Key Environment Variable") + } + + decIkey, err := base64.StdEncoding.DecodeString(encodedIkey) + if err != nil { + Log("Error Decoding encoded Instrumentation key %s", err.Error()) + return -1, err + } + + TelemetryClient = appinsights.NewTelemetryClient(string(decIkey)) + + CommonProperties = make(map[string]string) + CommonProperties["Computer"] = Computer + CommonProperties["WorkspaceID"] = WorkspaceID + CommonProperties["ControllerType"] = controllerTypeDaemonSet + CommonProperties["AgentVersion"] = agentVersion + + aksResourceID := os.Getenv(envAKSResourceID) + // if the aks resource id is not defined, it is most likely an ACS Cluster + if aksResourceID == "" { + CommonProperties["ACSResourceName"] = os.Getenv(envACSResourceName) + CommonProperties["ClusterType"] = clusterTypeACS + + CommonProperties["SubscriptionID"] = "" + CommonProperties["ResourceGroupName"] = "" + CommonProperties["ClusterName"] = "" + CommonProperties["Region"] = "" + + } else { + CommonProperties["ACSResourceName"] = "" + splitStrings := strings.Split(aksResourceID, "/") + CommonProperties["SubscriptionID"] = splitStrings[2] + CommonProperties["ResourceGroupName"] = splitStrings[4] + CommonProperties["ClusterName"] = splitStrings[8] + CommonProperties["ClusterType"] = clusterTypeAKS + + region := os.Getenv("AKS_REGION") + if region != "" { + CommonProperties["Region"] = region + } + } + + TelemetryClient.Context().CommonProperties = CommonProperties + return 0, nil +} + +// SendContainerLogFlushRateMetric is a go-routine that flushes the data periodically (every 5 mins to App Insights) +func SendContainerLogFlushRateMetric(telemetryIntervalProperty string, agentVersion string) { + + ret, err := initialize(telemetryIntervalProperty, agentVersion) + if ret != 0 || err != nil { + Log("Error During Telemetry Initialization :%s", err.Error()) + return + } + + for ; true; <-ContainerLogTelemetryTicker.C { + flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 + metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) + TelemetryClient.Track(metric) + FlushedRecordsCount = 0.0 + FlushedRecordsTimeTaken = 0.0 + } +} + +// TelemetryShutdown stops the ticker that sends data to App Insights periodically +func TelemetryShutdown() { + Log("Shutting down ContainerLog Telemetry\n") + ContainerLogTelemetryTicker.Stop() +} + +// SendEvent sends an event to App Insights +func SendEvent(eventName string, dimensions map[string]string) { + // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. Any subsequent Event should work just fine + for TelemetryClient == nil { + Log("Waiting for Telemetry Client to be initialized") + time.Sleep(1 * time.Second) + } + + // take a copy so the CommonProperties can be restored later + _commonProps := make(map[string]string) + for k, v := range TelemetryClient.Context().CommonProperties { + _commonProps[k] = v + } + + // add any extra dimensions + for k, v := range dimensions { + TelemetryClient.Context().CommonProperties[k] = v + } + + Log("Sending Event : %s\n", eventName) + event := appinsights.NewEventTelemetry(eventName) + TelemetryClient.Track(event) + + // restore original CommonProperties + TelemetryClient.Context().CommonProperties = _commonProps +} From 4b958dde94450e96d6d46351756c83500df7935f Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 09:18:10 -0700 Subject: [PATCH 3/9] Fixing an issue with Send Init Event if Telemetry is not initialized properly, tab to whitespace in conf file --- installer/conf/td-agent-bit.conf | 2 +- source/code/go/src/plugins/out_oms.go | 7 ++-- source/code/go/src/plugins/telemetry.go | 44 ++++++++++++++----------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 5a1c105bf..6849a3744 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,6 +19,6 @@ [OUTPUT] Name oms EnableTelemetry true - TelemetryPushInterval 300 + TelemetryPushInterval 300 Match oms.container.log.* AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 37c9eb12b..2603368ab 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -20,13 +20,14 @@ func FLBPluginRegister(ctx unsafe.Pointer) int { func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) - enablePlugin := output.FLBPluginConfigKey(ctx, "EnableTelemetry") + enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - if strings.Compare(strings.ToLower(enablePlugin), "true") == 0 { + if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) - SendEvent(EventNameContainerLogInit, make(map[string]string)) + } else { + Log("Telemetry is not enabled for the plugin %s \n", output.FLBPluginConfigKey(ctx, "Name")) } return output.FLB_OK } diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 4d4ab2371..c2f565a45 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -4,6 +4,7 @@ import ( "encoding/base64" "errors" "os" + "runtime" "strconv" "strings" "time" @@ -25,39 +26,40 @@ var ( ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsInstrumentationKey = "APPLICATIONINSIGHTS_INSTRUMENTATIONKEY" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - defaultTelemetryPushInterval = 300 + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushInterval = 300 // EventNameContainerLogInit name of the event EventNameContainerLogInit = "ContainerLogPluginInitialized" ) // Initialize initializes the telemetry artifacts -func initialize(telemetryIntervalProperty string, agentVersion string) (int, error) { +func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, error) { - telemetryInterval, err := strconv.Atoi(telemetryIntervalProperty) + telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) if err != nil { - telemetryInterval = defaultTelemetryPushInterval + Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... %d \n", telemetryPushIntervalProperty, defaultTelemetryPushInterval) + telemetryPushInterval = defaultTelemetryPushInterval } - ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryInterval)) + ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryPushInterval)) - encodedIkey := os.Getenv(envAppInsightsInstrumentationKey) + encodedIkey := os.Getenv(envAppInsightsAuth) if encodedIkey == "" { - Log("App Insights IKey missing in Environment Variables \n") - return -1, errors.New("Missing App Insights Instrumentation Key Environment Variable") + Log("Environment Variable Missing \n") + return -1, errors.New("Missing Environment Variable") } decIkey, err := base64.StdEncoding.DecodeString(encodedIkey) if err != nil { - Log("Error Decoding encoded Instrumentation key %s", err.Error()) + Log("Decoding Error %s", err.Error()) return -1, err } @@ -99,14 +101,16 @@ func initialize(telemetryIntervalProperty string, agentVersion string) (int, err } // SendContainerLogFlushRateMetric is a go-routine that flushes the data periodically (every 5 mins to App Insights) -func SendContainerLogFlushRateMetric(telemetryIntervalProperty string, agentVersion string) { +func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agentVersion string) { - ret, err := initialize(telemetryIntervalProperty, agentVersion) + ret, err := initialize(telemetryPushIntervalProperty, agentVersion) if ret != 0 || err != nil { Log("Error During Telemetry Initialization :%s", err.Error()) - return + runtime.Goexit() } + SendEvent(EventNameContainerLogInit, make(map[string]string)) + for ; true; <-ContainerLogTelemetryTicker.C { flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) From 510ef9f95b8e5de04e7b5952e24458374d6cbf6b Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 10:45:14 -0700 Subject: [PATCH 4/9] PR feedback --- installer/conf/td-agent-bit.conf | 10 ++++----- source/code/go/src/plugins/out_oms.go | 8 +++---- source/code/go/src/plugins/telemetry.go | 30 +++++++++++-------------- 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 6849a3744..b01b3a352 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -17,8 +17,8 @@ Skip_Long_Lines On [OUTPUT] - Name oms - EnableTelemetry true - TelemetryPushInterval 300 - Match oms.container.log.* - AgentVersion internaltest1004-2 \ No newline at end of file + Name oms + EnableTelemetry true + TelemetryPushIntervalSeconds 300 + Match oms.container.log.* + AgentVersion internaltest1004-2 \ No newline at end of file diff --git a/source/code/go/src/plugins/out_oms.go b/source/code/go/src/plugins/out_oms.go index 2603368ab..732ae5216 100644 --- a/source/code/go/src/plugins/out_oms.go +++ b/source/code/go/src/plugins/out_oms.go @@ -21,13 +21,13 @@ func FLBPluginInit(ctx unsafe.Pointer) int { Log("Initializing out_oms go plugin for fluentbit") InitializePlugin(ContainerLogPluginConfFilePath) enableTelemetry := output.FLBPluginConfigKey(ctx, "EnableTelemetry") - telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushInterval") - agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") - if strings.Compare(strings.ToLower(enableTelemetry), "true") == 0 { + telemetryPushInterval := output.FLBPluginConfigKey(ctx, "TelemetryPushIntervalSeconds") + agentVersion := output.FLBPluginConfigKey(ctx, "AgentVersion") go SendContainerLogFlushRateMetric(telemetryPushInterval, agentVersion) } else { Log("Telemetry is not enabled for the plugin %s \n", output.FLBPluginConfigKey(ctx, "Name")) + return output.FLB_OK } return output.FLB_OK } @@ -58,7 +58,7 @@ func FLBPluginFlush(data unsafe.Pointer, length C.int, tag *C.char) int { // FLBPluginExit exits the plugin func FLBPluginExit() int { - defer TelemetryShutdown() + ContainerLogTelemetryTicker.Stop() KubeSystemContainersRefreshTicker.Stop() ContainerImageNameRefreshTicker.Stop() return output.FLB_OK diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index c2f565a45..4396ea655 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -26,15 +26,15 @@ var ( ) const ( - clusterTypeACS = "ACS" - clusterTypeAKS = "AKS" - controllerTypeDaemonSet = "DaemonSet" - controllerTypeReplicaSet = "ReplicaSet" - envAKSResourceID = "AKS_RESOURCE_ID" - envACSResourceName = "ACS_RESOURCE_NAME" - envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" - metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" - defaultTelemetryPushInterval = 300 + clusterTypeACS = "ACS" + clusterTypeAKS = "AKS" + controllerTypeDaemonSet = "DaemonSet" + controllerTypeReplicaSet = "ReplicaSet" + envAKSResourceID = "AKS_RESOURCE_ID" + envACSResourceName = "ACS_RESOURCE_NAME" + envAppInsightsAuth = "APPLICATIONINSIGHTS_AUTH" + metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" + defaultTelemetryPushIntervalSeconds = 300 // EventNameContainerLogInit name of the event EventNameContainerLogInit = "ContainerLogPluginInitialized" @@ -45,8 +45,8 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, telemetryPushInterval, err := strconv.Atoi(telemetryPushIntervalProperty) if err != nil { - Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... %d \n", telemetryPushIntervalProperty, defaultTelemetryPushInterval) - telemetryPushInterval = defaultTelemetryPushInterval + Log("Error Converting telemetryPushIntervalProperty %s. Using Default Interval... %d \n", telemetryPushIntervalProperty, defaultTelemetryPushIntervalSeconds) + telemetryPushInterval = defaultTelemetryPushIntervalSeconds } ContainerLogTelemetryTicker = time.NewTicker(time.Second * time.Duration(telemetryPushInterval)) @@ -116,17 +116,13 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) TelemetryClient.Track(metric) + DataUpdateMutex.Lock() FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 + DataUpdateMutex.Unlock() } } -// TelemetryShutdown stops the ticker that sends data to App Insights periodically -func TelemetryShutdown() { - Log("Shutting down ContainerLog Telemetry\n") - ContainerLogTelemetryTicker.Stop() -} - // SendEvent sends an event to App Insights func SendEvent(eventName string, dimensions map[string]string) { // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. Any subsequent Event should work just fine From 684c39b63581fab69595885ec2c98942098be4f6 Mon Sep 17 00:00:00 2001 From: r-dilip Date: Fri, 12 Oct 2018 15:44:25 -0700 Subject: [PATCH 5/9] PR feedback --- source/code/go/src/plugins/telemetry.go | 42 +++++++++---------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 4396ea655..621d88eec 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -81,19 +81,21 @@ func initialize(telemetryPushIntervalProperty string, agentVersion string) (int, CommonProperties["ResourceGroupName"] = "" CommonProperties["ClusterName"] = "" CommonProperties["Region"] = "" + CommonProperties["AKS_RESOURCE_ID"] = "" } else { CommonProperties["ACSResourceName"] = "" + CommonProperties["AKS_RESOURCE_ID"] = aksResourceID splitStrings := strings.Split(aksResourceID, "/") - CommonProperties["SubscriptionID"] = splitStrings[2] - CommonProperties["ResourceGroupName"] = splitStrings[4] - CommonProperties["ClusterName"] = splitStrings[8] + if len(aksResourceID) > 0 && len(aksResourceID) < 10 { + CommonProperties["SubscriptionID"] = splitStrings[2] + CommonProperties["ResourceGroupName"] = splitStrings[4] + CommonProperties["ClusterName"] = splitStrings[8] + } CommonProperties["ClusterType"] = clusterTypeAKS region := os.Getenv("AKS_REGION") - if region != "" { - CommonProperties["Region"] = region - } + CommonProperties["Region"] = region } TelemetryClient.Context().CommonProperties = CommonProperties @@ -112,40 +114,26 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent SendEvent(EventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { + DataUpdateMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 - metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) - TelemetryClient.Track(metric) - DataUpdateMutex.Lock() FlushedRecordsCount = 0.0 FlushedRecordsTimeTaken = 0.0 DataUpdateMutex.Unlock() + metric := appinsights.NewMetricTelemetry(metricNameAvgFlushRate, flushRate) + TelemetryClient.Track(metric) } } // SendEvent sends an event to App Insights func SendEvent(eventName string, dimensions map[string]string) { - // this is because the TelemetryClient is initialized in a different goroutine. A simple wait loop here is just waiting for it to be initialized. This will happen only for the init event. Any subsequent Event should work just fine - for TelemetryClient == nil { - Log("Waiting for Telemetry Client to be initialized") - time.Sleep(1 * time.Second) - } - - // take a copy so the CommonProperties can be restored later - _commonProps := make(map[string]string) - for k, v := range TelemetryClient.Context().CommonProperties { - _commonProps[k] = v - } + Log("Sending Event : %s\n", eventName) + event := appinsights.NewEventTelemetry(eventName) - // add any extra dimensions + // add any extra Properties for k, v := range dimensions { - TelemetryClient.Context().CommonProperties[k] = v + event.Properties[k] = v } - Log("Sending Event : %s\n", eventName) - event := appinsights.NewEventTelemetry(eventName) TelemetryClient.Track(event) - - // restore original CommonProperties - TelemetryClient.Context().CommonProperties = _commonProps } From e165275bb8c346051cf851fb36dbb91ad7cf8afc Mon Sep 17 00:00:00 2001 From: Dilip Raghunathan Date: Mon, 15 Oct 2018 15:14:41 -0700 Subject: [PATCH 6/9] Sending an event every 5 mins(Heartbeat) (#146) --- installer/conf/td-agent-bit.conf | 2 -- source/code/go/src/plugins/telemetry.go | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index b01b3a352..2553f405f 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -12,8 +12,6 @@ Parser docker Mem_Buf_Limit 30m Path_Key filepath - Buffer_Chunk_Size 1m - Buffer_Max_Size 1m Skip_Long_Lines On [OUTPUT] diff --git a/source/code/go/src/plugins/telemetry.go b/source/code/go/src/plugins/telemetry.go index 621d88eec..b1bc4439b 100644 --- a/source/code/go/src/plugins/telemetry.go +++ b/source/code/go/src/plugins/telemetry.go @@ -36,8 +36,8 @@ const ( metricNameAvgFlushRate = "ContainerLogAvgRecordsFlushedPerSec" defaultTelemetryPushIntervalSeconds = 300 - // EventNameContainerLogInit name of the event - EventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameContainerLogInit = "ContainerLogPluginInitialized" + eventNameDaemonSetHeartbeat = "ContainerLogDaemonSetHeartbeatEvent" ) // Initialize initializes the telemetry artifacts @@ -111,9 +111,10 @@ func SendContainerLogFlushRateMetric(telemetryPushIntervalProperty string, agent runtime.Goexit() } - SendEvent(EventNameContainerLogInit, make(map[string]string)) + SendEvent(eventNameContainerLogInit, make(map[string]string)) for ; true; <-ContainerLogTelemetryTicker.C { + SendEvent(eventNameDaemonSetHeartbeat, make(map[string]string)) DataUpdateMutex.Lock() flushRate := FlushedRecordsCount / FlushedRecordsTimeTaken * 1000 Log("Flushed Records : %f Time Taken : %f flush Rate : %f", FlushedRecordsCount, FlushedRecordsTimeTaken, flushRate) From cfe1ca94c259c533a938834a54f1279e703d7e4b Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 13:03:30 -0700 Subject: [PATCH 7/9] PR feedback to cleanup removed workflows --- installer/conf/container.conf | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/installer/conf/container.conf b/installer/conf/container.conf index 1916300cb..17317871c 100755 --- a/installer/conf/container.conf +++ b/installer/conf/container.conf @@ -46,7 +46,7 @@ # Filter for correct format to endpoint - + type filter_container @@ -63,19 +63,6 @@ max_retry_wait 9m - - type out_oms_api - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_containerprocess*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level debug @@ -102,19 +89,6 @@ max_retry_wait 9m - - type out_oms - log_level debug - buffer_chunk_limit 20m - buffer_type file - buffer_path %STATE_DIR_WS%/out_oms_servicelog*.buffer - buffer_queue_limit 20 - flush_interval 20s - retry_limit 10 - retry_wait 15s - max_retry_wait 9m - - type out_oms log_level debug From 892b51c6b166cf10424bf5b6768633f44aa4cfa7 Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 13:04:55 -0700 Subject: [PATCH 8/9] updating agent version for telemetry --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 2553f405f..667f2edc2 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,4 +19,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion internaltest1004-2 \ No newline at end of file + AgentVersion ciprod10162018 From 9c83160dfa92a4f9ae1ab2b010678148aab4fc4d Mon Sep 17 00:00:00 2001 From: Vishwanath Date: Tue, 16 Oct 2018 19:33:43 -0700 Subject: [PATCH 9/9] updating agent version --- installer/conf/td-agent-bit.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/installer/conf/td-agent-bit.conf b/installer/conf/td-agent-bit.conf index 667f2edc2..b39587a97 100644 --- a/installer/conf/td-agent-bit.conf +++ b/installer/conf/td-agent-bit.conf @@ -19,4 +19,4 @@ EnableTelemetry true TelemetryPushIntervalSeconds 300 Match oms.container.log.* - AgentVersion ciprod10162018 + AgentVersion ciprod10162018-2