From 8aa6e9e279d7e5b6e27b668d807a97917df80300 Mon Sep 17 00:00:00 2001 From: Anthony Whalley Date: Sun, 6 Nov 2022 16:55:19 +0000 Subject: [PATCH 1/5] fix: Agent clippy and FAQ Signed-off-by: Anthony Whalley --- FAQ.md | 18 ++++++++++++++++++ core-dump-agent/src/main.rs | 8 ++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/FAQ.md b/FAQ.md index 629dbef..9048134 100644 --- a/FAQ.md +++ b/FAQ.md @@ -4,6 +4,8 @@ - [Why is my core dump truncated?](#why-is-my-core-dump-truncated) +- [Why is my zip file corrupted?](#why-is-my-zip-file-corrupted) + - [Why is my log file exactly half of my configured line count?](#why-is-my-log-file-exactly-half-of-my-configured-line-count) - [Can I force an upload?](#can-i-force-an-upload) @@ -12,6 +14,8 @@ - [How do I use the custom endpoint?](#how-do-i-use-the-custom-endpoint) +- [Why am I getting the wrong container info?](#why-am-i-getting-the-wrong-container-info) + ## How should I integrate my own uploader? The core dump handler is designed to quickly move the cores *"off-box"* to an object storage environment with as much additional runtime information as possible. @@ -73,6 +77,14 @@ terminationGracePeriodSeconds: 120 ``` Also see [Kubernetes best practices: terminating with grace](https://cloud.google.com/blog/products/containers-kubernetes/kubernetes-best-practices-terminating-with-grace) +## Why is my zip file corrupted? + +As of v8.7.0 there is now have a timer on the core dump to prevent repeated hanging core dumps taking down the system. +For very large core dumps this means the process can be truncated and the zipfile incomplete. + +In v8.8.0 We have added the nocompression option to zip process to improve performance and you can increase the timeout default which is currently set to 10 minutes. + + ## Why is my log file exactly half of my configured line count? This appears to be a bug in some kubernetes services. @@ -134,3 +146,9 @@ extraEnvVars: | - name: S3_ENDPOINT value: https://the-endpoint ``` + +## Why am I getting the wrong container info? + +Core dump handler trys to find the container information for the crashing process based on the hostname of the pod. This works fine in most scenarios but when pods are created directly in multiple namespaces or the same Statefulsets are created in the same namespaces. + +The current recommendation is to create a unique name in both of those scenarios. [See issue 115](https://github.com/IBM/core-dump-handler/issues/115) \ No newline at end of file diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index ca986c5..3296632 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -288,7 +288,7 @@ async fn main() -> Result<(), anyhow::Error> { async fn process_file(zip_path: &Path, bucket: &Bucket) { info!("Uploading: {}", zip_path.display()); - let f = File::open(&zip_path).expect("no file found"); + let f = File::open(zip_path).expect("no file found"); match f.try_lock(FileLockMode::Shared) { Ok(_) => { /* If we can lock then we are ok */ } @@ -305,7 +305,7 @@ async fn process_file(zip_path: &Path, bucket: &Bucket) { } } - let metadata = fs::metadata(&zip_path).expect("unable to read metadata"); + let metadata = fs::metadata(zip_path).expect("unable to read metadata"); info!("zip size is {}", metadata.len()); let path_str = match zip_path.to_str() { Some(v) => v, @@ -496,7 +496,7 @@ fn get_sysctl(name: &str) -> Result { info!("Getting sysctl for {}", name); let output = Command::new("sysctl") .env("PATH", get_path()) - .args(&["-n", name]) + .args(["-n", name]) .output()?; let lines = String::from_utf8(output.stdout)?; let line = lines.lines().take(1).next().unwrap_or(""); @@ -522,7 +522,7 @@ fn overwrite_sysctl(name: &str, value: &str) -> Result<(), anyhow::Error> { let s = format!("{}={}", name, value); let output = Command::new("sysctl") .env("PATH", get_path()) - .args(&["-w", s.as_str()]) + .args(["-w", s.as_str()]) .status()?; if !output.success() { let e = Error::InvalidOverWrite { From 0f688bdbb075dedb286bde6ce7a4f7949c75a748 Mon Sep 17 00:00:00 2001 From: Anthony Whalley Date: Sun, 6 Nov 2022 16:55:57 +0000 Subject: [PATCH 2/5] fix: Update log4rs to latest Signed-off-by: Anthony Whalley --- core-dump-composer/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-dump-composer/Cargo.toml b/core-dump-composer/Cargo.toml index 922ad96..d7cb29c 100644 --- a/core-dump-composer/Cargo.toml +++ b/core-dump-composer/Cargo.toml @@ -13,7 +13,7 @@ uuid = { version = "1.1.0", features = ["serde", "v4"] } zip = "0.6.2" dotenv = "0.15.0" log = "0.4.14" -log4rs = { git = "https://github.com/No9/log4rs/", branch = "typemap-ors-fix" } +log4rs = "1.2.0" anyhow = "1.0.53" serde_json = "1.0.76" serde = { version = "1.0.134", features = ["derive"] } From 867b4a797a9517aefc062839be48cf97214446c6 Mon Sep 17 00:00:00 2001 From: Anthony Whalley Date: Sun, 6 Nov 2022 19:18:17 +0000 Subject: [PATCH 3/5] Parameterise changes Signed-off-by: Anthony Whalley --- Cargo.lock | 5 ++-- .../templates/daemonset.yaml | 4 +++ charts/core-dump-handler/values.schema.json | 15 ++++++++-- charts/core-dump-handler/values.yaml | 2 ++ core-dump-agent/src/main.rs | 8 ++++-- core-dump-agent/tests/basic.rs | 2 +- core-dump-composer/src/config.rs | 28 ++++++++++++------- core-dump-composer/src/main.rs | 11 ++++---- core-dump-composer/tests/timeout.rs | 3 +- 9 files changed, 52 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f3dab1b..1b925a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -859,8 +859,9 @@ checksum = "a94d21414c1f4a51209ad204c1776a3d0765002c76c6abcb602a6f09f1e881c7" [[package]] name = "log4rs" -version = "1.1.1" -source = "git+https://github.com/No9/log4rs/?branch=typemap-ors-fix#6e3af6876a8178ddd6c0d43440ff22f08d6d0d77" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d36ca1786d9e79b8193a68d480a0907b612f109537115c6ff655a3a1967533fd" dependencies = [ "anyhow", "arc-swap", diff --git a/charts/core-dump-handler/templates/daemonset.yaml b/charts/core-dump-handler/templates/daemonset.yaml index b109c67..aa9a1e9 100644 --- a/charts/core-dump-handler/templates/daemonset.yaml +++ b/charts/core-dump-handler/templates/daemonset.yaml @@ -47,6 +47,10 @@ spec: value: {{ .Values.composer.crioImageCmd }} - name: COMP_POD_SELECTOR_LABEL value: {{ .Values.composer.podSelectorLabel }} + - name: COMP_TIMEOUT + value: {{ .Values.composer.timeout | quote }} + - name: COMP_COMPRESSION + value: {{ .Values.composer.compression | quote }} - name: DEPLOY_CRIO_CONFIG value: {{ .Values.daemonset.deployCrioConfig | quote }} - name: CRIO_ENDPOINT diff --git a/charts/core-dump-handler/values.schema.json b/charts/core-dump-handler/values.schema.json index 9b6e7c6..acd5cb6 100644 --- a/charts/core-dump-handler/values.schema.json +++ b/charts/core-dump-handler/values.schema.json @@ -115,6 +115,13 @@ }, "podSelectorLabel": { "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 120 + }, + "compression": { + "type": "boolean" } }, "required": [ @@ -122,7 +129,9 @@ "ignoreCrio", "logLevel", "logLength", - "filenameTemplate" + "filenameTemplate", + "timeout", + "compression" ], "title": "Composer" }, @@ -183,7 +192,7 @@ "hostContainerRuntimeEndpoint" ] } - } + } ], "properties": { "name": { @@ -316,4 +325,4 @@ "title": "ServiceAccount" } } -} +} \ No newline at end of file diff --git a/charts/core-dump-handler/values.yaml b/charts/core-dump-handler/values.yaml index 437b68d..b4986fe 100644 --- a/charts/core-dump-handler/values.yaml +++ b/charts/core-dump-handler/values.yaml @@ -27,6 +27,8 @@ composer: filenameTemplate: "{uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}" logLength: 500 podSelectorLabel: "" + timeout: 600 + compression: true daemonset: name: "core-dump-handler" diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index 3296632..5cb51ac 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -473,11 +473,15 @@ fn create_env_file(host_location: &str) -> Result<(), std::io::Error> { }); let log_length = env::var("LOG_LENGTH").unwrap_or_else(|_| "500".to_string()); let pod_selector_label = env::var("COMP_POD_SELECTOR_LABEL").unwrap_or_default(); + let timeout = env::var("COMP_TIMEOUT").unwrap_or_else(|_| "600".to_string()); + let compression = env::var("COMP_COMPRESSION") + .unwrap_or_else(|_| "true".to_string()) + .to_lowercase(); info!("Creating {} file with LOG_LEVEL={}", destination, loglevel); let mut env_file = File::create(destination)?; let text = format!( - "LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\n", - loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label + "LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\nTIMEOUT={}\nCOMPRESSION={}\n", + loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label, timeout, compression ); info!("Writing composer .env \n{}", text); env_file.write_all(text.as_bytes())?; diff --git a/core-dump-agent/tests/basic.rs b/core-dump-agent/tests/basic.rs index c9ce656..9fd6473 100644 --- a/core-dump-agent/tests/basic.rs +++ b/core-dump-agent/tests/basic.rs @@ -95,7 +95,7 @@ fn basic() -> Result<(), std::io::Error> { "FILENAME_TEMPLATE={uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}" )); assert!(env_content.contains("LOG_LENGTH=500")); - assert_eq!(env_content.lines().count(), 7); + assert_eq!(env_content.lines().count(), 9); //TODO: [No9] Test uploading of a corefile //TODO: [No9] Test remove option //TODO: [No9] Test sweep option diff --git a/core-dump-composer/src/config.rs b/core-dump-composer/src/config.rs index f555ecb..9bc76a7 100644 --- a/core-dump-composer/src/config.rs +++ b/core-dump-composer/src/config.rs @@ -21,12 +21,13 @@ pub struct CoreConfig { pub pod_selector_label: String, pub use_crio_config: bool, pub ignore_crio: bool, + pub timeout: u32, + pub compression: bool, pub image_command: ImageCommand, pub bin_path: String, pub os_hostname: String, pub filename_template: String, pub params: CoreParams, - pub disable_compression: bool, } #[derive(Serialize)] @@ -39,7 +40,6 @@ pub struct CoreParams { pub directory: String, pub hostname: String, pub pathname: String, - pub timeout: u64, pub namespace: Option, pub podname: Option, pub uuid: Uuid, @@ -58,12 +58,12 @@ impl CoreConfig { let directory = matches.value_of("directory").unwrap_or("").to_string(); let hostname = matches.value_of("hostname").unwrap_or("").to_string(); let pathname = matches.value_of("pathname").unwrap_or("").to_string(); - let timeout = matches - .value_of("timeout") - .unwrap_or("600") - .parse::() - .unwrap(); - let disable_compression = matches.contains_id("disable-compression"); + // let timeout = matches + // .value_of("timeout") + // .unwrap_or("600") + // .parse::() + // .unwrap(); + // let disable_compression = matches.contains_id("disable-compression"); let uuid = Uuid::new_v4(); @@ -76,7 +76,6 @@ impl CoreConfig { directory, hostname, pathname, - timeout, namespace: None, podname: None, uuid, @@ -112,6 +111,14 @@ impl CoreConfig { .unwrap_or_else(|_| "false".to_string().to_lowercase()) .parse::() .unwrap(); + let compression = env::var("COMPRESSION") + .unwrap_or_else(|_| "true".to_string().to_lowercase()) + .parse::() + .unwrap(); + let timeout = env::var("TIMEOUT") + .unwrap_or_else(|_| "600".to_string()) + .parse::() + .unwrap(); let os_hostname = hostname::get() .unwrap_or_else(|_| OsString::from_str("unknown").unwrap_or_default()) .into_string() @@ -146,7 +153,8 @@ impl CoreConfig { filename_template, log_length, params, - disable_compression, + compression, + timeout, }) } diff --git a/core-dump-composer/src/main.rs b/core-dump-composer/src/main.rs index 74d1f3a..374c440 100644 --- a/core-dump-composer/src/main.rs +++ b/core-dump-composer/src/main.rs @@ -21,14 +21,13 @@ mod logging; fn main() -> Result<(), anyhow::Error> { let (send, recv) = channel(); let cc = config::CoreConfig::new()?; - let timeout = cc.params.timeout; - + let recv_time: u64 = cc.timeout as u64; thread::spawn(move || { let result = handle(cc); send.send(result).unwrap(); }); - let result = recv.recv_timeout(Duration::from_secs(timeout)); + let result = recv.recv_timeout(Duration::from_secs(recv_time)); match result { Ok(inner_result) => inner_result, @@ -111,10 +110,10 @@ fn handle(mut cc: config::CoreConfig) -> Result<(), anyhow::Error> { cc.set_podname(podname.to_string()); // Create the base zip file that we are going to put everything into - let compression_method = if cc.disable_compression { - zip::CompressionMethod::Stored - } else { + let compression_method = if cc.compression { zip::CompressionMethod::Deflated + } else { + zip::CompressionMethod::Stored }; let options = FileOptions::default() .compression_method(compression_method) diff --git a/core-dump-composer/tests/timeout.rs b/core-dump-composer/tests/timeout.rs index 48cf596..4d2d8f3 100644 --- a/core-dump-composer/tests/timeout.rs +++ b/core-dump-composer/tests/timeout.rs @@ -44,6 +44,7 @@ fn timeout_scenario() -> Result<(), std::io::Error> { .unwrap(); let cdc = Command::new("../target/debug/core-dump-composer") + .env("TIMEOUT", "1") .arg("-c") .arg("1000000000") .arg("-e") @@ -60,8 +61,6 @@ fn timeout_scenario() -> Result<(), std::io::Error> { .arg("1588462466") .arg("-h") .arg("crashing-app-699c49b4ff-86wrh") - .arg("--timeout") - .arg("1") .stdin(cat) .output() .expect("Couldn't execute"); From 61890ff8feb525a9f0cd6b3bbd0772a0586d6ce5 Mon Sep 17 00:00:00 2001 From: Anthony Whalley Date: Sun, 6 Nov 2022 19:21:22 +0000 Subject: [PATCH 4/5] fix: include tag Signed-off-by: Anthony Whalley --- charts/core-dump-handler/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/core-dump-handler/values.yaml b/charts/core-dump-handler/values.yaml index b4986fe..896f8a2 100644 --- a/charts/core-dump-handler/values.yaml +++ b/charts/core-dump-handler/values.yaml @@ -3,7 +3,7 @@ replicaCount: 1 image: registry: quay.io repository: icdh/core-dump-handler - tag: v8.7.0 + tag: schema-updates pullPolicy: Always pullSecrets: [] request_mem: "64Mi" From 924aea3cc4ddf82b3993db4eac1c3f0728d5bb23 Mon Sep 17 00:00:00 2001 From: Anthony Whalley Date: Tue, 8 Nov 2022 07:49:05 +0000 Subject: [PATCH 5/5] docs: update Chart README with new values Signed-off-by: Anthony Whalley --- charts/core-dump-handler/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/charts/core-dump-handler/README.md b/charts/core-dump-handler/README.md index a55ac83..d026b97 100644 --- a/charts/core-dump-handler/README.md +++ b/charts/core-dump-handler/README.md @@ -182,6 +182,14 @@ The agent pod has the following environment variables and these are all set by t "img" (Default): This is the value most crictls expect. "images": Digital Ocean, Newer OpenShift require this value +* COMP_TIMEOUT - The timeout for the composer in seconds. Defaults to 600. + + In testing ~ 3 mins per 512Mb so we have set it to 10 mins. + +* COMP_COMPRESSION - Enable compression Default: true + + Given the amount of time compression there is an option to disable it. + * CRIO_ENDPOINT - The CRIO endpoint to use. "unix:///run/containerd/containerd.sock" (Default): This is the default for most containerd nodes @@ -252,7 +260,9 @@ Composer * logLevel: The log level for the composer (Default "Warn") * ignoreCrio: Maps to the COMP_IGNORE_CRIO enviroment variable (Default false) * crioImageCmd: Maps to the COMP_CRIO_IMAGE_CMD enviroment variable (Default "img") -* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable +* timeout: Maps to the COMP_TIMEOUT environment variable ("Default 600) +* compression: Maps to the COMP_COMPRESSION environment varable (Default "true") +* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable (Default {{uuid}}-dump-{{timestamp}}-{{hostname}}-{{exe_name}}-{{pid}}-{{signal}}) Possible Values: