diff --git a/.github/workflows/deploy-profiling.yaml b/.github/workflows/deploy-profiling.yaml
new file mode 100644
index 0000000000..ed218909ad
--- /dev/null
+++ b/.github/workflows/deploy-profiling.yaml
@@ -0,0 +1,82 @@
+name: deploy-profiling
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Tag suffix for the profiling image (e.g., "profiling" or "profiling-v1.2.3")'
+        required: false
+        default: 'profiling'
+
+jobs:
+  deploy-profiling:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          # Without this the fetch depth defaults to 1, which only includes the most recent commit. We want to know the full history so that `git describe` can give more information when it is invoked in the orderbook's crate build script.
+          fetch-depth: '0'
+          persist-credentials: false
+
+      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set image tag
+        id: tag
+        # Pass the user-supplied input through an environment variable instead of
+        # interpolating it into the script, so it cannot be used for shell injection.
+        env:
+          INPUT_TAG: ${{ github.event.inputs.tag }}
+        run: |
+          echo "tag=${INPUT_TAG:-profiling}" >> "$GITHUB_OUTPUT"
+          echo "sha_tag=profiling-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
+
+      - name: Profiling services image metadata
+        id: meta_services
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0
+        with:
+          images: ghcr.io/${{ github.repository }}
+          tags: |
+            type=raw,value=${{ steps.tag.outputs.tag }}
+            type=raw,value=${{ steps.tag.outputs.sha_tag }}
+          labels: |
+            org.opencontainers.image.licenses=GPL-3.0-or-later
+            org.opencontainers.image.description=CoW Protocol Services with jemalloc profiling enabled
+
+      - name: Profiling services image build
+        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0
+        with:
+          context: .
+ file: Dockerfile + push: true + tags: ${{ steps.meta_services.outputs.tags }} + labels: ${{ steps.meta_services.outputs.labels }} + build-args: | + CARGO_BUILD_FEATURES=--features jemalloc-profiling + + - name: Profiling migration image metadata + id: meta_migration + uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804 # v5.7.0 + with: + images: ghcr.io/${{ github.repository }}-migration + tags: | + type=raw,value=${{ steps.tag.outputs.tag }} + type=raw,value=${{ steps.tag.outputs.sha_tag }} + labels: | + org.opencontainers.image.licenses=GPL-3.0-or-later + + - name: Profiling migration image build + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + context: . + file: Dockerfile + target: migrations + push: true + tags: ${{ steps.meta_migration.outputs.tags }} + labels: ${{ steps.meta_migration.outputs.labels }} diff --git a/Cargo.lock b/Cargo.lock index b09a995840..566699c907 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,6 +68,7 @@ dependencies = [ "anyhow", "clap", "humantime", + "jemalloc_pprof", "mimalloc", "model", "number", @@ -77,6 +78,7 @@ dependencies = [ "serde", "serde_with", "shared", + "tikv-jemallocator", "tokio", "tracing", "url", @@ -1215,6 +1217,7 @@ dependencies = [ "humantime", "indexmap 2.10.0", "itertools 0.14.0", + "jemalloc_pprof", "maplit", "mimalloc", "mockall 0.12.1", @@ -1237,6 +1240,7 @@ dependencies = [ "sqlx", "strum", "thiserror 1.0.61", + "tikv-jemallocator", "tokio", "tracing", "url", @@ -2721,6 +2725,7 @@ dependencies = [ "humantime-serde", "hyper 0.14.29", "itertools 0.14.0", + "jemalloc_pprof", "maplit", "mimalloc", "model", @@ -2743,6 +2748,7 @@ dependencies = [ "solvers-dto", "tempfile", "thiserror 1.0.61", + "tikv-jemallocator", "tokio", "toml", "tower 0.4.13", @@ -2895,12 +2901,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.14" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3100,9 +3106,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fastrlp" @@ -4053,6 +4059,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74ff642505c7ce8d31c0d43ec0e235c6fd4585d9b8172d8f9dd04d36590200b5" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -4167,9 +4190,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "lock_api" @@ -4235,6 +4258,19 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "mappings" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4d277bb50d4508057e7bddd7fcd19ef4a4cc38051b6a5a36868d75ae2cbeb9" 
+dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -4656,6 +4692,7 @@ dependencies = [ "chrono", "console-subscriber", "futures", + "jemalloc_pprof", "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", @@ -4833,6 +4870,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.29", + "jemalloc_pprof", "mimalloc", "mockall 0.12.1", "model", @@ -4852,6 +4890,7 @@ dependencies = [ "sqlx", "strum", "thiserror 1.0.61", + "tikv-jemallocator", "tokio", "tracing", "url", @@ -5049,6 +5088,20 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof_util" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4429d44e5e2c8a69399fc0070379201eed018e3df61e04eb7432811df073c224" +dependencies = [ + "anyhow", + "backtrace", + "flate2", + "num", + "paste", + "prost 0.13.5", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5486,6 +5539,7 @@ dependencies = [ "futures", "gas-estimation", "humantime", + "jemalloc_pprof", "mimalloc", "number", "observe", @@ -5493,6 +5547,7 @@ dependencies = [ "prometheus-metric-storage", "shared", "sqlx", + "tikv-jemallocator", "tokio", "tracing", "url", @@ -5825,15 +5880,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.34" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags 2.9.1", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6459,6 +6514,7 @@ dependencies = [ "hex-literal", "hyper 0.14.29", "itertools 0.14.0", + "jemalloc_pprof", "mimalloc", "model", "num", @@ -6473,6 +6529,7 @@ 
dependencies = [ "solver", "solvers-dto", "tempfile", + "tikv-jemallocator", "tokio", "toml", "tower 0.4.13", @@ -6882,14 +6939,15 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" -version = "3.10.1" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ - "cfg-if", "fastrand", + "getrandom 0.3.3", + "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6968,6 +7026,37 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "time" version = "0.3.37" diff --git a/Cargo.toml b/Cargo.toml index 1c3fd432ef..66b43d6fb0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ derivative = "2.2.0" derive_more = { version = "1.0.0", features = ["full"] } ethcontract = { git = "https://github.com/cowprotocol/ethcontract-rs", rev = "8e112a88988040cde6110379ee6d1be768a13244", default-features = false, features = ["aws-kms"] } mimalloc = "0.1.43" +tikv-jemallocator = { 
version = "0.6", features = ["unprefixed_malloc_on_supported_platforms", "profiling"] } +jemalloc_pprof = { version = "0.8", features = ["symbolize"] } ethcontract-generate = { git = "https://github.com/cowprotocol/ethcontract-rs", rev = "8e112a88988040cde6110379ee6d1be768a13244", default-features = false } ethcontract-mock = { git = "https://github.com/cowprotocol/ethcontract-rs", rev = "8e112a88988040cde6110379ee6d1be768a13244", default-features = false } ethereum-types = "0.14.1" diff --git a/Dockerfile b/Dockerfile index e5a8a0d470..47e1b50a9c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,16 +5,22 @@ CMD ["migrate"] FROM docker.io/rust:1-slim-bookworm AS cargo-build WORKDIR /src/ +# Accept build arguments for enabling features +ARG CARGO_BUILD_FEATURES="" + # Install dependencies RUN --mount=type=cache,target=/var/cache/apt,sharing=locked apt-get update && \ - apt-get install -y git libssl-dev pkg-config + apt-get install -y git libssl-dev pkg-config && \ + if echo "${CARGO_BUILD_FEATURES}" | grep -q "jemalloc-profiling"; then \ + apt-get install -y build-essential; \ + fi # Install Rust toolchain RUN rustup install stable && rustup default stable # Copy and Build Code COPY . . 
RUN --mount=type=cache,target=/usr/local/cargo/registry --mount=type=cache,target=/src/target \ - CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release && \ + CARGO_PROFILE_RELEASE_DEBUG=1 cargo build --release ${CARGO_BUILD_FEATURES} && \ cp target/release/alerter / && \ cp target/release/autopilot / && \ cp target/release/driver / && \ @@ -54,8 +60,9 @@ ENTRYPOINT [ "solvers" ] # Extract Binary FROM intermediate + RUN apt-get update && \ - apt-get install -y build-essential cmake git zlib1g-dev libelf-dev libdw-dev libboost-dev libboost-iostreams-dev libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libunwind-dev libzstd-dev git + apt-get install -y build-essential cmake git zlib1g-dev libelf-dev libdw-dev libboost-dev libboost-iostreams-dev libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libunwind-dev libzstd-dev git netcat-openbsd RUN git clone https://invent.kde.org/sdk/heaptrack.git /heaptrack && \ mkdir /heaptrack/build && cd /heaptrack/build && \ cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_GUI=OFF .. && \ diff --git a/README.md b/README.md index a426961198..08f54b679b 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,54 @@ cargo install --locked tokio-console tokio-console ``` +## Heap Profiling + +All binaries support opt-in heap profiling using jemalloc's profiling capabilities. This allows you to analyze memory usage in production environments without restarting services. + +### Building with Heap Profiling + +Build with the `jemalloc-profiling` feature: +```bash +cargo build --release --features jemalloc-profiling +``` + +Or with Docker: +```bash +docker build --build-arg CARGO_BUILD_FEATURES="--features jemalloc-profiling" . +``` + +### Generating Heap Dumps + +When running with the profiling feature enabled, each binary opens a UNIX socket at `/tmp/heap_dump_<binary_name>.sock`. 
To generate a heap dump, connect to the socket and send the "dump" command: + +**Note:** Services must be run with the `MALLOC_CONF` environment variable set: +```bash +MALLOC_CONF="prof:true,prof_active:true,lg_prof_sample:19" +``` + +```bash +# From Kubernetes +kubectl exec -n <namespace> <pod> -- sh -c "echo dump | nc -U /tmp/heap_dump_orderbook.sock" > heap.pprof + +# From Docker +docker exec <container> sh -c "echo dump | nc -U /tmp/heap_dump_orderbook.sock" > heap.pprof +``` + +### Analyzing Heap Dumps + +The dumps are in pprof format and can be analyzed using Google's pprof tool: + +```bash +# Install pprof +go install github.com/google/pprof@latest + +# Interactive web UI +pprof -http=:8080 heap.pprof + +# Command-line analysis +pprof -top heap.pprof +``` + ## Changing Log Filters It's possible to change the tracing log filter while the process is running. This can be useful to debug an error that requires more verbose logs but which might no longer appear after restarting the system. diff --git a/crates/alerter/Cargo.toml b/crates/alerter/Cargo.toml index b7b1f89a69..86689a15c2 100644 --- a/crates/alerter/Cargo.toml +++ b/crates/alerter/Cargo.toml @@ -12,6 +12,8 @@ clap = { workspace = true } humantime = { workspace = true } observe = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } model = { workspace = true } number = { workspace = true } prometheus = { workspace = true } @@ -26,3 +28,6 @@ warp = { workspace = true } [lints] workspace = true + +[features] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/alerter/src/lib.rs b/crates/alerter/src/lib.rs index 76cf64a6b5..2cf1f6d8d6 100644 --- a/crates/alerter/src/lib.rs +++ b/crates/alerter/src/lib.rs @@ -392,6 +392,8 @@ pub async fn start(args: impl Iterator) { ); observe::tracing::initialize(&obs_config); observe::panic_hook::install(); + 
#[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); observe::metrics::setup_registry(Some("gp_v2_alerter".to_string()), None); tracing::info!("running alerter with {:#?}", args); run(args).await; diff --git a/crates/alerter/src/main.rs b/crates/alerter/src/main.rs index 73a70ddf51..ecc2d72ca7 100644 --- a/crates/alerter/src/main.rs +++ b/crates/alerter/src/main.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/autopilot/Cargo.toml b/crates/autopilot/Cargo.toml index 6471f4d7a0..510c3ba404 100644 --- a/crates/autopilot/Cargo.toml +++ b/crates/autopilot/Cargo.toml @@ -40,6 +40,8 @@ indexmap = { workspace = true } itertools = { workspace = true } maplit = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } model = { workspace = true } num = { workspace = true } number = { workspace = true } @@ -74,3 +76,6 @@ vergen = { workspace = true, features = ["git", "gitcl"] } [lints] workspace = true + +[features] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/autopilot/src/main.rs b/crates/autopilot/src/main.rs index d3b40b5970..9a05ca22bb 100644 --- a/crates/autopilot/src/main.rs +++ b/crates/autopilot/src/main.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/autopilot/src/run.rs b/crates/autopilot/src/run.rs index 72cd5bd12a..25a21e66a8 100644 --- 
a/crates/autopilot/src/run.rs +++ b/crates/autopilot/src/run.rs @@ -146,6 +146,8 @@ pub async fn start(args: impl Iterator) { ); observe::tracing::initialize(&obs_config); observe::panic_hook::install(); + #[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); let commit_hash = option_env!("VERGEN_GIT_SHA").unwrap_or("COMMIT_INFO_NOT_FOUND"); diff --git a/crates/driver/Cargo.toml b/crates/driver/Cargo.toml index 0f4b880741..89f59e8e53 100644 --- a/crates/driver/Cargo.toml +++ b/crates/driver/Cargo.toml @@ -36,6 +36,8 @@ humantime-serde = { workspace = true } hyper = { workspace = true } itertools = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } moka = { workspace = true, features = ["future"] } num = { workspace = true } number = { workspace = true } @@ -89,3 +91,6 @@ vergen = { workspace = true, features = ["git", "gitcl"] } [lints] workspace = true + +[features] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/driver/src/infra/observe/mod.rs b/crates/driver/src/infra/observe/mod.rs index f6dbab6fec..a2e5d69b61 100644 --- a/crates/driver/src/infra/observe/mod.rs +++ b/crates/driver/src/infra/observe/mod.rs @@ -38,6 +38,8 @@ pub mod metrics; pub fn init(obs_config: observe::Config) { observe::tracing::initialize_reentrant(&obs_config); metrics::init(); + #[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); } /// Observe a received auction. 
diff --git a/crates/driver/src/main.rs b/crates/driver/src/main.rs index 13064f8724..bf43d356b4 100644 --- a/crates/driver/src/main.rs +++ b/crates/driver/src/main.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/observe/Cargo.toml b/crates/observe/Cargo.toml index 0448d9ec3e..ed98c00503 100644 --- a/crates/observe/Cargo.toml +++ b/crates/observe/Cargo.toml @@ -28,6 +28,7 @@ tracing-opentelemetry = { workspace = true } tracing-subscriber = { workspace = true, features = ["env-filter", "fmt", "time"] } warp = { workspace = true } tracing-serde = { workspace = true } +jemalloc_pprof = { workspace = true, optional = true } [lints] workspace = true @@ -35,3 +36,4 @@ workspace = true [features] default = [] axum-tracing = ["axum"] +jemalloc-profiling = ["dep:jemalloc_pprof"]
diff --git a/crates/observe/src/heap_dump_handler.rs b/crates/observe/src/heap_dump_handler.rs
new file mode 100644
index 0000000000..da3b6992d7
--- /dev/null
+++ b/crates/observe/src/heap_dump_handler.rs
@@ -0,0 +1,194 @@
+use {
+    std::time::Duration,
+    tokio::{
+        io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader},
+        net::{UnixListener, UnixStream},
+    },
+};
+
+/// Maximum time a single heap dump request may run before it is aborted, so
+/// that a stuck dump cannot block future requests.
+const DUMP_TIMEOUT: Duration = Duration::from_secs(60);
+
+/// Pause after a failed `accept`, so that a persistent error (e.g. running out
+/// of file descriptors) does not turn the accept loop into a busy loop.
+const ACCEPT_RETRY_DELAY: Duration = Duration::from_millis(100);
+
+/// Maximum number of bytes read from a client while waiting for a command.
+/// Commands are short ("dump"), so this only bounds memory spent on bogus input.
+const MAX_COMMAND_LEN: u64 = 64;
+
+/// Spawns a new async task that listens for connections to a UNIX socket
+/// at "/tmp/heap_dump_<binary_name>.sock".
+/// When the "dump" command is sent, it generates a heap profile using
+/// jemalloc_pprof and streams the binary protobuf data back through the socket.
+///
+/// If jemalloc profiling is not available, a warning is logged and no task is
+/// spawned.
+///
+/// Usage:
+/// ```bash
+/// # From your local machine (one-liner):
+/// kubectl exec -n <namespace> <pod> -- sh -c "echo dump | nc -U /tmp/heap_dump_<binary_name>.sock" > heap.pprof
+///
+/// # Analyze with pprof:
+/// go tool pprof -http=:8080 heap.pprof
+/// ```
+#[cfg(all(unix, feature = "jemalloc-profiling"))]
+pub fn spawn_heap_dump_handler() {
+    // Check if jemalloc profiling is available before spawning the handler
+    // This prevents panics that would crash the entire process
+    let profiling_available =
+        std::panic::catch_unwind(|| jemalloc_pprof::PROF_CTL.as_ref().is_some()).unwrap_or(false);
+
+    if !profiling_available {
+        tracing::warn!(
+            "jemalloc profiling not available - heap dump handler not started. Ensure service is \
+             built with jemalloc-profiling feature and MALLOC_CONF is set."
+        );
+        return;
+    }
+
+    tokio::spawn(serve_heap_dumps());
+}
+
+/// Binds the heap dump socket and serves requests one at a time.
+///
+/// Returns (after logging the error) only if the socket cannot be bound.
+async fn serve_heap_dumps() {
+    let name = binary_name().unwrap_or_else(|| "unknown".to_owned());
+    let socket_path = format!("/tmp/heap_dump_{name}.sock");
+
+    // A socket file left behind by a previous run would make `bind` fail.
+    let _ = tokio::fs::remove_file(&socket_path).await;
+    let listener = match UnixListener::bind(&socket_path) {
+        Ok(listener) => listener,
+        Err(err) => {
+            tracing::error!(
+                ?err,
+                socket = socket_path,
+                "failed to bind heap dump socket"
+            );
+            return;
+        }
+    };
+    // Only report the handler as started once the socket actually exists.
+    tracing::info!(socket = socket_path, "heap dump handler started");
+    let server = SocketHandle {
+        listener,
+        socket_path,
+    };
+
+    loop {
+        let socket = match server.listener.accept().await {
+            Ok((socket, _addr)) => socket,
+            Err(err) => {
+                tracing::debug!(?err, "failed to accept connection");
+                tokio::time::sleep(ACCEPT_RETRY_DELAY).await;
+                continue;
+            }
+        };
+
+        // Each request runs in its own task so that it can be aborted on timeout,
+        // but it is awaited before the next connection is accepted: sequential
+        // processing prevents multiple simultaneous expensive heap dumps.
+        let mut request = tokio::spawn(handle_connection_with_socket(socket));
+        match tokio::time::timeout(DUMP_TIMEOUT, &mut request).await {
+            Ok(Ok(())) => {}
+            Ok(Err(err)) => {
+                tracing::error!(?err, "panic in heap dump connection handler");
+            }
+            Err(elapsed) => {
+                request.abort();
+                tracing::error!(?elapsed, "heap dump request timed out");
+            }
+        }
+    }
+}
+
+/// Owns the listening socket and removes its file from disk when dropped.
+struct SocketHandle {
+    socket_path: String,
+    listener: UnixListener,
+}
+
+impl Drop for SocketHandle {
+    fn drop(&mut self) {
+        let _ = std::fs::remove_file(&self.socket_path);
+    }
+}
+
+/// Returns the file name of the currently running executable, if it can be
+/// determined and is valid UTF-8.
+fn binary_name() -> Option<String> {
+    let exe = std::env::current_exe().ok()?;
+    Some(exe.file_name()?.to_str()?.to_owned())
+}
+
+/// Reads a single command from the client and executes it.
+///
+/// Only "dump" is supported; anything else is logged and ignored.
+async fn handle_connection_with_socket(mut socket: UnixStream) {
+    let message = read_line(&mut socket).await;
+    match message.as_deref() {
+        Some("dump") => {
+            generate_and_stream_dump(&mut socket).await;
+        }
+        Some("") => {
+            tracing::debug!("client disconnected");
+        }
+        None => {
+            tracing::debug!("failed to read message");
+        }
+        Some(unknown) => {
+            tracing::debug!(command = unknown, "unknown command");
+        }
+    }
+}
+
+/// Generates a heap profile in pprof format and writes it to `socket`.
+///
+/// Failures are logged; nothing is reported back to the client.
+async fn generate_and_stream_dump(socket: &mut UnixStream) {
+    tracing::info!("generating heap dump");
+
+    // Availability was already verified in `spawn_heap_dump_handler`, but a
+    // panic could crash the entire process, so fail gracefully instead.
+    let prof_ctl = match jemalloc_pprof::PROF_CTL.as_ref() {
+        Some(prof_ctl) => prof_ctl,
+        None => {
+            tracing::error!("jemalloc profiling not available - cannot generate heap dump");
+            return;
+        }
+    };
+
+    let pprof_data = {
+        let mut lock = prof_ctl.lock().await;
+        lock.dump_pprof()
+    };
+
+    match pprof_data {
+        Ok(pprof_data) => {
+            tracing::info!(size_bytes = pprof_data.len(), "heap dump generated");
+
+            if let Err(err) = socket.write_all(&pprof_data).await {
+                tracing::warn!(?err, "failed to write heap dump to socket");
+            }
+        }
+        Err(err) => {
+            tracing::error!(?err, "failed to generate heap dump");
+        }
+    }
+}
+
+/// Reads one line (at most `MAX_COMMAND_LEN` bytes) from the client and returns
+/// it with surrounding whitespace removed.
+///
+/// Returns `None` if reading fails, e.g. on an I/O error or invalid UTF-8. A
+/// client that disconnects without sending anything yields `Some("")`.
+async fn read_line(socket: &mut UnixStream) -> Option<String> {
+    let mut reader = BufReader::new(socket.take(MAX_COMMAND_LEN));
+    let mut buffer = String::new();
+    reader.read_line(&mut buffer).await.ok()?;
+    Some(buffer.trim().to_owned())
+}
diff --git a/crates/observe/src/lib.rs b/crates/observe/src/lib.rs index 290f3a2ceb..3ec2831204 100644 --- a/crates/observe/src/lib.rs +++ b/crates/observe/src/lib.rs @@ -4,6 +4,8 @@ pub mod config; pub mod distributed_tracing; pub mod future; +#[cfg(all(unix, feature = "jemalloc-profiling"))] +pub mod heap_dump_handler; pub mod metrics; pub mod panic_hook; pub mod tracing; diff --git a/crates/orderbook/Cargo.toml b/crates/orderbook/Cargo.toml index 10411832f4..33b09e7ae1 100644 --- a/crates/orderbook/Cargo.toml +++ b/crates/orderbook/Cargo.toml @@ -35,6 +35,8 @@ hex-literal = { workspace = true } humantime = { workspace = true } hyper = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } model = { workspace = true } multibase = { workspace = true } num = { workspace = true } @@ -71,3 +73,4 @@ workspace = true [features] e2e = [] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/orderbook/src/main.rs b/crates/orderbook/src/main.rs index b2f14d5ff2..0ed6dd3a83 100644 --- a/crates/orderbook/src/main.rs +++ b/crates/orderbook/src/main.rs 
@@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/orderbook/src/run.rs b/crates/orderbook/src/run.rs index e5052391f0..5b4d1bf795 100644 --- a/crates/orderbook/src/run.rs +++ b/crates/orderbook/src/run.rs @@ -71,6 +71,8 @@ pub async fn start(args: impl Iterator) { tracing::info!("running order book with validated arguments:\n{}", args); observe::panic_hook::install(); observe::metrics::setup_registry(Some("gp_v2_api".into()), None); + #[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); run(args).await; } diff --git a/crates/refunder/Cargo.toml b/crates/refunder/Cargo.toml index a4dcdca595..90468633a0 100644 --- a/crates/refunder/Cargo.toml +++ b/crates/refunder/Cargo.toml @@ -18,6 +18,8 @@ gas-estimation = { workspace = true } const-hex = { workspace = true } humantime = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } number = { workspace = true } observe = { workspace = true } prometheus = { workspace = true } @@ -30,3 +32,6 @@ url = { workspace = true } [lints] workspace = true + +[features] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/refunder/src/lib.rs b/crates/refunder/src/lib.rs index ed4f010696..2da3460276 100644 --- a/crates/refunder/src/lib.rs +++ b/crates/refunder/src/lib.rs @@ -30,6 +30,8 @@ pub async fn start(args: impl Iterator) { ); observe::tracing::initialize(&obs_config); observe::panic_hook::install(); + #[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); tracing::info!("running refunder with validated 
arguments:\n{}", args); observe::metrics::setup_registry(Some("refunder".into()), None); run(args).await; diff --git a/crates/refunder/src/main.rs b/crates/refunder/src/main.rs index be0bb41fa5..492ed93ce9 100644 --- a/crates/refunder/src/main.rs +++ b/crates/refunder/src/main.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/solvers/Cargo.toml b/crates/solvers/Cargo.toml index 184331fce1..6de8f230aa 100644 --- a/crates/solvers/Cargo.toml +++ b/crates/solvers/Cargo.toml @@ -29,6 +29,8 @@ hyper = { workspace = true } ethcontract = { workspace = true } itertools = { workspace = true } mimalloc = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, optional = true } num = { workspace = true } prometheus = { workspace = true } prometheus-metric-storage = { workspace = true } @@ -64,3 +66,6 @@ vergen = { workspace = true, features = ["git", "gitcl"] } [lints] workspace = true + +[features] +jemalloc-profiling = ["dep:tikv-jemallocator", "dep:jemalloc_pprof", "observe/jemalloc-profiling"] diff --git a/crates/solvers/src/main.rs b/crates/solvers/src/main.rs index 09c582b3eb..dc93d24e3a 100644 --- a/crates/solvers/src/main.rs +++ b/crates/solvers/src/main.rs @@ -1,3 +1,8 @@ +#[cfg(feature = "jemalloc-profiling")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[cfg(not(feature = "jemalloc-profiling"))] #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/solvers/src/run.rs b/crates/solvers/src/run.rs index b3a0f803e7..3d5d48a016 100644 --- a/crates/solvers/src/run.rs +++ b/crates/solvers/src/run.rs @@ -32,6 +32,8 @@ async fn run_with(args: cli::Args, bind: Option>) { None, 
); observe::tracing::initialize_reentrant(&obs_config); + #[cfg(all(unix, feature = "jemalloc-profiling"))] + observe::heap_dump_handler::spawn_heap_dump_handler(); let commit_hash = option_env!("VERGEN_GIT_SHA").unwrap_or("COMMIT_INFO_NOT_FOUND");