Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
b9e66d5
Handle proxy payload
kathiehuang Oct 1, 2025
49fa6d3
Send proxy payload through mpsc channel to then get aggregated and fl…
kathiehuang Oct 15, 2025
39eab7d
Remove aggregator logic, just send payload through channel and then h…
kathiehuang Oct 16, 2025
9c1f4e9
cleanup
kathiehuang Oct 16, 2025
156b8ec
Fix error handling
kathiehuang Oct 17, 2025
3870258
Fix reqwest dependency
kathiehuang Oct 17, 2025
7dbb84a
Fix license
kathiehuang Oct 17, 2025
5744931
nit: formatting
kathiehuang Oct 17, 2025
15ddf9f
Renaming for clarity
kathiehuang Oct 17, 2025
c5d51af
Minor refactoring
kathiehuang Oct 17, 2025
5eeefb3
fix
kathiehuang Oct 17, 2025
f1ec0b8
Reduce request timeout
kathiehuang Oct 27, 2025
a17218b
Add rustls-tls
kathiehuang Oct 27, 2025
9818997
Revert to original dogstatsd build_client function
kathiehuang Nov 5, 2025
c23f3eb
Add additional tags
kathiehuang Nov 7, 2025
a678f23
Formatting
kathiehuang Nov 7, 2025
082e21c
Avoid panicking
kathiehuang Nov 7, 2025
bf61041
Add aas tags
kathiehuang Nov 10, 2025
6049999
Fix aas tags
kathiehuang Nov 10, 2025
66d97a8
Fix additional tags in profiles
kathiehuang Nov 10, 2025
d846870
Remove unnecessary debug logs
kathiehuang Nov 10, 2025
299488d
Refactor
kathiehuang Nov 12, 2025
b58e2d7
Comment out aas.* tags for testing
kathiehuang Nov 24, 2025
e5fec36
Update license
kathiehuang Dec 18, 2025
92655f4
Update tags
kathiehuang Dec 18, 2025
8fcf166
Respect DD_APM_PROFILING_DD_URL for profiling intake url override
kathiehuang Dec 27, 2025
9330986
Detect environment when setting _dd.origin tag
kathiehuang Dec 27, 2025
9c17914
Enable default features for datadog-fips
kathiehuang Dec 29, 2025
ebf4733
Check for 202 success code
kathiehuang Dec 29, 2025
0f671f7
Make config variable names clearer
kathiehuang Jan 6, 2026
4e2a124
Update comment about DD_APM_PROFILING_DD_URL so that it is clearer it…
kathiehuang Jan 6, 2026
b749b9d
Add codeowners file
kathiehuang Jan 13, 2026
b33a25b
Fix license
kathiehuang Jan 13, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,169 changes: 526 additions & 643 deletions Cargo.lock

Large diffs are not rendered by default.

86 changes: 36 additions & 50 deletions LICENSE-3rdparty.csv

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion crates/datadog-serverless-compat/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use zstd::zstd_safe::CompressionLevel;

use datadog_trace_agent::{
aggregator::TraceAggregator,
config, env_verifier, mini_agent, stats_flusher, stats_processor,
config, env_verifier, mini_agent, proxy_flusher, stats_flusher, stats_processor,
trace_flusher::{self, TraceFlusher},
trace_processor,
};
Expand Down Expand Up @@ -124,13 +124,16 @@ pub async fn main() {
Arc::clone(&config),
));

let proxy_flusher = Arc::new(proxy_flusher::ProxyFlusher::new(Arc::clone(&config)));

let mini_agent = Box::new(mini_agent::MiniAgent {
config: Arc::clone(&config),
env_verifier,
trace_processor,
trace_flusher,
stats_processor,
stats_flusher,
proxy_flusher,
});

tokio::spawn(async move {
Expand Down
3 changes: 3 additions & 0 deletions crates/datadog-trace-agent/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ libdd-trace-protobuf = { git = "https://github.com/DataDog/libdatadog", rev = "4
libdd-trace-utils = { git = "https://github.com/DataDog/libdatadog", rev = "435107c245112397914935c0f7148a18b91cafc6", features = [
"mini_agent",
] }
datadog-fips = { path = "../datadog-fips" }
reqwest = { version = "0.12.23", features = ["json", "http2"], default-features = false }
bytes = "1.10.1"

[dev-dependencies]
rmp-serde = "1.1.1"
Expand Down
36 changes: 30 additions & 6 deletions crates/datadog-trace-agent/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,21 @@ pub struct Config {
pub os: String,
pub tags: Tags,
/// how often to flush stats, in seconds
pub stats_flush_interval: u64,
pub stats_flush_interval_secs: u64,
/// how often to flush traces, in seconds
pub trace_flush_interval: u64,
pub trace_flush_interval_secs: u64,
pub trace_intake: Endpoint,
pub trace_stats_intake: Endpoint,
/// Profiling intake endpoint (for proxying profiling data to Datadog)
pub profiling_intake: Endpoint,
/// Timeout for each proxy request, in seconds
pub proxy_request_timeout_secs: u64,
/// Maximum number of retry attempts for failed proxy requests
pub proxy_request_max_retries: u32,
/// Base backoff duration for proxy request retries, in milliseconds
pub proxy_request_retry_backoff_base_ms: u64,
/// timeout for environment verification, in milliseconds
pub verify_env_timeout: u64,
pub verify_env_timeout_ms: u64,
pub proxy_url: Option<String>,
}

Expand Down Expand Up @@ -119,6 +127,14 @@ impl Config {
trace_stats_intake_url = trace_stats_url_prefixed(&endpoint_prefix);
};

// TODO: Create helper functions for this in libdatadog
let mut profiling_intake_url = format!("https://intake.profile.{}/api/v2/profile", dd_site);
// DD_APM_PROFILING_DD_URL overrides the prefix of the profiling intake URL, i.e. everything
// before `/api/v2/profile`. It is primarily intended for integration tests.
if let Ok(endpoint_prefix) = env::var("DD_APM_PROFILING_DD_URL") {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is DD_APM_PROFILING_DD_URL a standard config name? If not, can we add "prefix" somewhere in the name to make it clear that it's the prefix (presumably including the scheme, host, and port)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datadog-agent uses DD_APM_PROFILING_DD_URL: https://github.com/DataDog/datadog-agent/blob/main/pkg/config/setup/apm.go#L141

But good point, I made the comment clearer!

profiling_intake_url = format!("{endpoint_prefix}/api/v2/profile");
};

let obfuscation_config = obfuscation_config::ObfuscationConfig::new().map_err(|err| {
anyhow::anyhow!(
"Error creating obfuscation config, Mini Agent will not start. Error: {err}",
Expand All @@ -137,9 +153,12 @@ impl Config {
env_type,
os: env::consts::OS.to_string(),
max_request_content_length: 10 * 1024 * 1024, // 10MB in Bytes
trace_flush_interval: 3,
stats_flush_interval: 3,
verify_env_timeout: 100,
trace_flush_interval_secs: 3,
stats_flush_interval_secs: 3,
proxy_request_timeout_secs: 30,
proxy_request_max_retries: 3,
proxy_request_retry_backoff_base_ms: 100,
verify_env_timeout_ms: 100,
dd_dogstatsd_port,
dd_site,
trace_intake: Endpoint {
Expand All @@ -149,6 +168,11 @@ impl Config {
},
trace_stats_intake: Endpoint {
url: hyper::Uri::from_str(&trace_stats_intake_url).unwrap(),
api_key: Some(api_key.clone()),
..Default::default()
},
profiling_intake: Endpoint {
url: hyper::Uri::from_str(&profiling_intake_url).unwrap(),
api_key: Some(api_key),
..Default::default()
},
Expand Down
16 changes: 16 additions & 0 deletions crates/datadog-trace-agent/src/http_utils.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

use core::time::Duration;
use datadog_fips::reqwest_adapter::create_reqwest_client_builder;
use hyper::{
header,
http::{self, HeaderMap},
Response, StatusCode,
};
use libdd_common::hyper_migration;
use serde_json::json;
use std::error::Error;
use tracing::{debug, error};

/// Does two things:
Expand Down Expand Up @@ -111,6 +114,19 @@ pub fn verify_request_content_length(
None
}

/// Builds a reqwest client with the given request timeout and, if `proxy_url` is set, an HTTPS proxy.
/// Uses rustls TLS by default. FIPS-compliant TLS is available via the `fips` feature.
pub fn build_client(
proxy_url: Option<&str>,
timeout: Duration,
) -> Result<reqwest::Client, Box<dyn Error>> {
let mut builder = create_reqwest_client_builder()?.timeout(timeout);
Comment thread
duncanpharvey marked this conversation as resolved.
if let Some(proxy) = proxy_url {
builder = builder.proxy(reqwest::Proxy::https(proxy)?);
}
Ok(builder.build()?)
}

#[cfg(test)]
mod tests {
use http_body_util::BodyExt;
Expand Down
1 change: 1 addition & 0 deletions crates/datadog-trace-agent/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub mod config;
pub mod env_verifier;
pub mod http_utils;
pub mod mini_agent;
pub mod proxy_flusher;
pub mod stats_flusher;
pub mod stats_processor;
pub mod trace_flusher;
Expand Down
99 changes: 91 additions & 8 deletions crates/datadog-trace-agent/src/mini_agent.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

use http_body_util::BodyExt;
use hyper::service::service_fn;
use hyper::{http, Method, Response, StatusCode};
use libdd_common::hyper_migration;
Expand All @@ -12,7 +13,8 @@ use std::time::Instant;
use tokio::sync::mpsc::{self, Receiver, Sender};
use tracing::{debug, error};

use crate::http_utils::log_and_create_http_response;
use crate::http_utils::{log_and_create_http_response, verify_request_content_length};
use crate::proxy_flusher::{ProxyFlusher, ProxyRequest};
use crate::{config, env_verifier, stats_flusher, stats_processor, trace_flusher, trace_processor};
use libdd_trace_protobuf::pb;
use libdd_trace_utils::trace_utils;
Expand All @@ -22,8 +24,10 @@ const MINI_AGENT_PORT: usize = 8126;
const TRACE_ENDPOINT_PATH: &str = "/v0.4/traces";
const STATS_ENDPOINT_PATH: &str = "/v0.6/stats";
const INFO_ENDPOINT_PATH: &str = "/info";
const PROFILING_ENDPOINT_PATH: &str = "/profiling/v1/input";
const TRACER_PAYLOAD_CHANNEL_BUFFER_SIZE: usize = 10;
const STATS_PAYLOAD_CHANNEL_BUFFER_SIZE: usize = 10;
const PROXY_PAYLOAD_CHANNEL_BUFFER_SIZE: usize = 10;

pub struct MiniAgent {
pub config: Arc<config::Config>,
Expand All @@ -32,6 +36,7 @@ pub struct MiniAgent {
pub stats_processor: Arc<dyn stats_processor::StatsProcessor + Send + Sync>,
pub stats_flusher: Arc<dyn stats_flusher::StatsFlusher + Send + Sync>,
pub env_verifier: Arc<dyn env_verifier::EnvVerifier + Send + Sync>,
pub proxy_flusher: Arc<ProxyFlusher>,
}

impl MiniAgent {
Expand All @@ -42,7 +47,7 @@ impl MiniAgent {
let mini_agent_metadata = Arc::new(
self.env_verifier
.verify_environment(
self.config.verify_env_timeout,
self.config.verify_env_timeout_ms,
&self.config.env_type,
&self.config.os,
)
Expand Down Expand Up @@ -82,6 +87,17 @@ impl MiniAgent {
.await;
});

// channels to send processed profiling requests to our proxy flusher
let (proxy_tx, proxy_rx): (Sender<ProxyRequest>, Receiver<ProxyRequest>) =
mpsc::channel(PROXY_PAYLOAD_CHANNEL_BUFFER_SIZE);

// start our proxy flusher for profiling requests
let proxy_flusher = self.proxy_flusher.clone();
tokio::spawn(async move {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a way to gracefully shut down this flusher? Do we need one?

Copy link
Copy Markdown
Contributor Author

@kathiehuang kathiehuang Jan 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now we don't, but that's a good point: if this background task is still completing a request when the function handler returns, that could result in data loss. I looked at my notes from when I initially wrote this code, and I had this from talking to Darcy: for Azure Functions, there are ~20s after an invocation before the instance starts to freeze. If the flush period is < 20s, then the chance of data loss is low.

I also have this from Azure support with some information about the lifecycle of Azure functions (TLDR they didn't have a documented answer):

The “idle” time between a Function's completed execution and the instance shutdown/deallocation is not explicitly documented (from my initial review); however, you are correct that the general behavior there varies by the SKU.

The actual time-out for the Function defaults to 30-minutes (with an “unbounded” maximum) on the Flex Consumption, Premium, Dedicated, and Container Apps plans; meanwhile, the Consumption plan defaults to 5-minutes (with a maximum of 10). Regardless of the configured time-out, HTTP trigger functions are limited to a 230-second response time. 

More relevant to your use-case, Premium, Dedicated, and Flex Consumption Plans allow you to configure the number of always ready instances, meaning scaling to zero (deallocating all instances) would not be an issue compared to the Consumption Plan where the app can scale to zero instances during idle periods.

There is unfortunately not a documented duration for the shutdown post-completion of Function executions; performing operations during this timeframe would not be recommended, as there’s no guarantee that the timing would be consistent and predictable.

Alternatively, there is a grace period during drain mode for Functions currently executing for Apps which utilize event-driven scaling (Consumption, Flex Consumption, & Premium). For Consumption plan apps, that grace period can extend up to ten-minutes, while on Flex Consumption & Premium plan apps it can extend up to 60-minutes. When event-driven scaling determines reduced demand, it drains instances of their current Function executions prior to removing the instances, hence the grace period and its extension.

Looking at this log, the flusher consistently sends the request in <200 ms, so the chance of data loss seems fairly low:

DEBUG Proxy Flusher | Successfully sent request in 30 ms to https://intake.profile.datadoghq.com/api/v2/profile
...
DEBUG Proxy Flusher | Successfully sent request in 117 ms to https://intake.profile.datadoghq.com/api/v2/profile

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair point. I guess if we don't need it here, we can leave this as it is. If we ever extract this into a shared component that is used in environments that do have some sort of shutdown detection, we can add something then.

let proxy_flusher = proxy_flusher.clone();
proxy_flusher.start_proxy_flusher(proxy_rx).await;
});

// setup our hyper http server, where the endpoint_handler handles incoming requests
let trace_processor = self.trace_processor.clone();
let stats_processor = self.stats_processor.clone();
Expand All @@ -96,14 +112,17 @@ impl MiniAgent {
let endpoint_config = endpoint_config.clone();
let mini_agent_metadata = Arc::clone(&mini_agent_metadata);

let proxy_tx = proxy_tx.clone();

MiniAgent::trace_endpoint_handler(
endpoint_config,
req.map(hyper_migration::Body::incoming),
trace_processor,
trace_tx,
stats_processor,
stats_tx,
mini_agent_metadata,
trace_processor.clone(),
trace_tx.clone(),
stats_processor.clone(),
stats_tx.clone(),
Arc::clone(&mini_agent_metadata),
proxy_tx.clone(),
)
});

Expand Down Expand Up @@ -167,6 +186,7 @@ impl MiniAgent {
stats_processor: Arc<dyn stats_processor::StatsProcessor + Send + Sync>,
stats_tx: Sender<pb::ClientStatsPayload>,
mini_agent_metadata: Arc<trace_utils::MiniAgentMetadata>,
proxy_tx: Sender<ProxyRequest>,
) -> http::Result<hyper_migration::HttpResponse> {
match (req.method(), req.uri().path()) {
(&Method::PUT | &Method::POST, TRACE_ENDPOINT_PATH) => {
Expand All @@ -190,6 +210,15 @@ impl MiniAgent {
),
}
}
(&Method::POST, PROFILING_ENDPOINT_PATH) => {
match Self::profiling_proxy_handler(config, req, proxy_tx).await {
Ok(res) => Ok(res),
Err(err) => log_and_create_http_response(
&format!("Error processing profiling request: {err}"),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
(_, INFO_ENDPOINT_PATH) => match Self::info_handler(config.dd_dogstatsd_port) {
Ok(res) => Ok(res),
Err(err) => log_and_create_http_response(
Expand All @@ -205,13 +234,67 @@ impl MiniAgent {
}
}

/// Handles incoming proxy requests for profiling. This could be abstracted into a generic proxy handler for other proxied endpoints in the future.
async fn profiling_proxy_handler(
config: Arc<config::Config>,
request: hyper_migration::HttpRequest,
proxy_tx: Sender<ProxyRequest>,
) -> http::Result<hyper_migration::HttpResponse> {
debug!("Trace Agent | Received profiling request");

// Extract headers and body
let (parts, body) = request.into_parts();
if let Some(response) = verify_request_content_length(
&parts.headers,
config.max_request_content_length,
"Error processing profiling request",
) {
return response;
}

let body_bytes = match body.collect().await {
Ok(collected) => collected.to_bytes(),
Err(e) => {
return log_and_create_http_response(
&format!("Error reading profiling request body: {e}"),
StatusCode::BAD_REQUEST,
);
Comment on lines +237 to +261
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enforce profiling payload size before buffering

The new profiling proxy endpoint reads the entire request body into memory (body.collect().await) without validating the Content-Length. The trace and stats handlers call verify_request_content_length to reject payloads larger than Config::max_request_content_length before allocation, but the profiling handler omits this guard. A client can therefore POST an arbitrarily large profiling payload and the mini agent will buffer it in RAM before forwarding, allowing a single request to exhaust memory and crash the function. Consider applying the same content length check as the other endpoints before collecting the body.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

@kathiehuang kathiehuang Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Lambda extension also doesn't validate the Content-Length, probably because profiling payloads are typically much larger and they come from the Datadog profiler.

}
};

// Create proxy request
let proxy_request = ProxyRequest {
headers: parts.headers,
body: body_bytes,
target_url: config.profiling_intake.url.to_string(),
Comment thread
kathiehuang marked this conversation as resolved.
};
Comment on lines +245 to +270
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Forward profiling query string to Datadog

The profiling proxy handler builds the outgoing request URL from the static profiling_intake endpoint and ignores parts.uri.query(). Profiling clients pass service, environment, and other metadata via the query string when posting to /profiling/v1/input; dropping these parameters means the flusher forwards payloads without required metadata, causing ingestion failures or misattribution. Append the incoming query parameters to ProxyRequest.target_url before sending to the proxy flusher.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


debug!(
"Trace Agent | Sending profiling request to channel, target: {}",
proxy_request.target_url
);

// Send to channel
match proxy_tx.send(proxy_request).await {
Ok(_) => log_and_create_http_response(
"Successfully buffered profiling request to be flushed",
StatusCode::OK,
),
Err(err) => log_and_create_http_response(
&format!("Error sending profiling request to the proxy flusher: {err}"),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}

fn info_handler(dd_dogstatsd_port: u16) -> http::Result<hyper_migration::HttpResponse> {
let response_json = json!(
{
"endpoints": [
TRACE_ENDPOINT_PATH,
STATS_ENDPOINT_PATH,
INFO_ENDPOINT_PATH
INFO_ENDPOINT_PATH,
PROFILING_ENDPOINT_PATH
],
"client_drop_p0s": true,
"config": {
Expand Down
Loading
Loading