Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions cmd/agent/dist/conf.d/agentprofiling.d/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@

init_config: {}
instances:
-
-

## @param memory_threshold - string - optional - default: "0"
## Set to either a size (e.g., 10MB) or an exact byte value (e.g., 10485760).
## When this check runs, it will check if the Core Agent's RSS memory usage is above this threshold.
## If the threshold is exceeded, the check will generate a flare with memory and CPU profiles.
## Flare generation can only be triggered once per Agent lifecycle.
##
## If this value is set to "0", the check will not run.
##
## If this value is set to "0", the check will not run.
#
# memory_threshold: "0"

Expand All @@ -21,7 +21,7 @@ instances:
## a flare with memory and CPU profiles will be generated.
## Flare generation can only be triggered once per Agent lifecycle.
##
## If this value is set to 0, the check will not run.
## If this value is set to 0, the check will not run.
#
# cpu_threshold: 0

Expand All @@ -32,7 +32,19 @@ instances:
# ticket_id: ""

## @param user_email - string - required - default: ""
## Set to the email address associated with the ticket.
## Set to the email address associated with the ticket.
## If not specified, the Agent will be unable to associate the flare with the ticket.
#
# user_email: ""

## @param terminate_agent_on_threshold - boolean - optional - default: false
## When set to true, the Agent process is terminated after it successfully generates a flare
## because the memory or CPU threshold was exceeded.
##
## WARNING: This will cause the agent to exit. Ensure your process manager is configured to restart
## the agent automatically. Use with caution.
##
## The Agent requests a graceful shutdown through its internal stop mechanism, allowing cleanup
## before exit. If flare generation fails, the Agent is not terminated.
#
# terminate_agent_on_threshold: false
41 changes: 37 additions & 4 deletions pkg/collector/corechecks/agentprofiling/agentprofiling.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ package agentprofiling
import (
"fmt"
"os"
"testing"
"time"

"gopkg.in/yaml.v3"

"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/process"

"github.com/DataDog/datadog-agent/cmd/agent/common/signals"
"github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration"
"github.com/DataDog/datadog-agent/comp/core/config"

Expand All @@ -39,10 +41,11 @@ const (

// Config is the configuration for the agentprofiling check
type Config struct {
MemoryThreshold string `yaml:"memory_threshold"`
CPUThreshold int `yaml:"cpu_threshold"`
TicketID string `yaml:"ticket_id"`
UserEmail string `yaml:"user_email"`
MemoryThreshold string `yaml:"memory_threshold"`
CPUThreshold int `yaml:"cpu_threshold"`
TicketID string `yaml:"ticket_id"`
UserEmail string `yaml:"user_email"`
TerminateAgentOnThreshold bool `yaml:"terminate_agent_on_threshold"`
}

// Check is the check that generates a flare with profiles when the core agent's memory or CPU usage exceeds a certain threshold
Expand Down Expand Up @@ -192,6 +195,31 @@ func (m *Check) Run() error {
return nil
}

// terminateAgent requests a graceful shutdown of the agent process after flare generation completes.
// The request is made by sending on signals.Stopper, the agent's established shutdown channel, so
// that cleanup happens through the agent's normal stop path. Termination is skipped when running
// in test mode to avoid stopping the test process.
//
// signals.Stopper is unbuffered, so a plain send blocks until something receives from it. If no
// receiver is available (for example because a shutdown is already in progress), an unbounded send
// would pin this check forever. The send is therefore bounded by a timeout; when it elapses the
// failure is logged and the method returns without terminating the agent.
func (m *Check) terminateAgent() {
	// Maximum time to wait for the agent's stop listener to accept the request.
	const stopRequestTimeout = 5 * time.Second

	// Skip termination when running in test mode.
	if testing.Testing() {
		log.Info("Skipping agent termination: running in test mode")
		return
	}

	log.Warnf("Terminating agent process due to threshold exceeded (terminate_agent_on_threshold is enabled)")

	// Flush logs so the termination message is written before shutdown is triggered.
	log.Flush()

	timer := time.NewTimer(stopRequestTimeout)
	defer timer.Stop()

	select {
	case signals.Stopper <- true:
		log.Info("Agent Profiling check: Graceful shutdown requested. Agent will exit after cleanup.")
	case <-timer.C:
		// Nobody received the request in time; do not block the check indefinitely.
		log.Warnf("Agent Profiling check: shutdown request was not accepted within %s; the Agent was not terminated", stopRequestTimeout)
	}
	log.Flush()
}

// generateFlare generates a flare and sends it to Zendesk if ticketID is specified, otherwise generates it locally
func (m *Check) generateFlare() error {
// Skip flare generation if flareComponent is not available
Expand Down Expand Up @@ -232,5 +260,10 @@ func (m *Check) generateFlare() error {
m.flareGenerated = true
log.Info("Flare generation complete. No more flares will be generated until the Agent is restarted.")

// Terminate agent if configured to do so
if m.instance.TerminateAgentOnThreshold {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function can terminate early if generating a flare causes an error. Under this scenario, the agent has exceeded the resource threshold but has not terminated itself. Is this behavior what you intend?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. In the case that terminate_agent_on_threshold: true is set, I think customers will find it more important that the Agent is shutdown because they do not want the Agent to continue hogging resources.

Additionally, most flare generation errors happen because the flare was unable to send to our intake, but it will still be available locally in a temp folder. I almost never see the flare fail to generate completely.

m.terminateAgent()
}

return nil
}
59 changes: 50 additions & 9 deletions pkg/collector/corechecks/agentprofiling/agentprofiling_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,21 @@ import (

// testConfig represents a test configuration for the agentprofiling check
type testConfig struct {
memoryThreshold string
cpuThreshold int
ticketID string
userEmail string
memoryThreshold string
cpuThreshold int
ticketID string
userEmail string
terminateAgentOnThreshold bool
}

// defaultTestConfig returns a default test configuration
func defaultTestConfig() testConfig {
return testConfig{
memoryThreshold: "0",
cpuThreshold: 0,
ticketID: "",
userEmail: "",
memoryThreshold: "0",
cpuThreshold: 0,
ticketID: "",
userEmail: "",
terminateAgentOnThreshold: false,
}
}

Expand All @@ -48,7 +50,8 @@ func createTestCheck(t *testing.T, cfg testConfig) *Check {
configData := []byte(fmt.Sprintf(`memory_threshold: "%s"
cpu_threshold: %d
ticket_id: "%s"
user_email: "%s"`, cfg.memoryThreshold, cfg.cpuThreshold, cfg.ticketID, cfg.userEmail))
user_email: "%s"
terminate_agent_on_threshold: %t`, cfg.memoryThreshold, cfg.cpuThreshold, cfg.ticketID, cfg.userEmail, cfg.terminateAgentOnThreshold))

initConfig := []byte("")
senderManager := mocksender.CreateDefaultDemultiplexer()
Expand Down Expand Up @@ -132,3 +135,41 @@ func TestGenerateFlareTicket(t *testing.T) {
require.NoError(t, err)
assert.True(t, check.flareGenerated)
}

// TestTerminateAgentOnThresholdConfig checks that terminate_agent_on_threshold: true is parsed
// into the check instance and that a flare is still generated when the threshold is exceeded.
func TestTerminateAgentOnThresholdConfig(t *testing.T) {
	settings := defaultTestConfig()
	settings.terminateAgentOnThreshold = true
	settings.memoryThreshold = "1B" // Force trigger

	profilingCheck := createTestCheck(t, settings)

	// The parsed instance must reflect both the termination flag and the byte threshold.
	assert.True(t, profilingCheck.instance.TerminateAgentOnThreshold)
	assert.Equal(t, uint(1), profilingCheck.memoryThreshold)

	// Running the check with the threshold exceeded must generate the flare.
	// The actual shutdown cannot be exercised here: terminateAgent returns early when
	// testing.Testing() reports test mode, so only parsing and flare generation are asserted.
	require.NoError(t, profilingCheck.Run())
	assert.True(t, profilingCheck.flareGenerated)
}

// TestTerminateAgentOnThresholdDisabled checks that terminate_agent_on_threshold: false is parsed
// into the check instance and that flare generation is unaffected by the flag being off.
func TestTerminateAgentOnThresholdDisabled(t *testing.T) {
	settings := defaultTestConfig()
	settings.terminateAgentOnThreshold = false
	settings.memoryThreshold = "1B" // Force trigger

	profilingCheck := createTestCheck(t, settings)

	// The parsed instance must have the termination flag turned off.
	assert.False(t, profilingCheck.instance.TerminateAgentOnThreshold)

	// A flare is still expected once the threshold is exceeded.
	require.NoError(t, profilingCheck.Run())
	assert.True(t, profilingCheck.flareGenerated)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Each section from every release note are combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
features:
- |
The Agent Profiling check now supports automatic Agent termination after flare generation when memory or CPU thresholds are exceeded. This feature is useful in resource-constrained environments where the Agent needs to be restarted after generating diagnostic information.

Enable this feature by setting ``terminate_agent_on_threshold: true`` in the Agent Profiling check configuration. When enabled, the Agent uses its established shutdown mechanism to trigger graceful shutdown after successfully generating a flare, ensuring proper cleanup before exit.

**Warning**: This feature will cause the Agent to exit. This feature is disabled by default and should be used with caution.
Loading