diff --git a/Cargo.lock b/Cargo.lock index f3dab1b..1b925a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -859,8 +859,9 @@ checksum = "a94d21414c1f4a51209ad204c1776a3d0765002c76c6abcb602a6f09f1e881c7" [[package]] name = "log4rs" -version = "1.1.1" -source = "git+https://github.com/No9/log4rs/?branch=typemap-ors-fix#6e3af6876a8178ddd6c0d43440ff22f08d6d0d77" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d36ca1786d9e79b8193a68d480a0907b612f109537115c6ff655a3a1967533fd" dependencies = [ "anyhow", "arc-swap", diff --git a/FAQ.md b/FAQ.md index 629dbef..9048134 100644 --- a/FAQ.md +++ b/FAQ.md @@ -4,6 +4,8 @@ - [Why is my core dump truncated?](#why-is-my-core-dump-truncated) +- [Why is my zip file corrupted?](#why-is-my-zip-file-corrupted) + - [Why is my log file exactly half of my configured line count?](#why-is-my-log-file-exactly-half-of-my-configured-line-count) - [Can I force an upload?](#can-i-force-an-upload) @@ -12,6 +14,8 @@ - [How do I use the custom endpoint?](#how-do-i-use-the-custom-endpoint) +- [Why am I getting the wrong container info?](#why-am-i-getting-the-wrong-container-info) + ## How should I integrate my own uploader? The core dump handler is designed to quickly move the cores *"off-box"* to an object storage environment with as much additional runtime information as possible. @@ -73,6 +77,14 @@ terminationGracePeriodSeconds: 120 ``` Also see [Kubernetes best practices: terminating with grace](https://cloud.google.com/blog/products/containers-kubernetes/kubernetes-best-practices-terminating-with-grace) +## Why is my zip file corrupted? + +As of v8.7.0 there is now have a timer on the core dump to prevent repeated hanging core dumps taking down the system. +For very large core dumps this means the process can be truncated and the zipfile incomplete. + +In v8.8.0 We have added the nocompression option to zip process to improve performance and you can increase the timeout default which is currently set to 10 minutes. + + ## Why is my log file exactly half of my configured line count? This appears to be a bug in some kubernetes services. @@ -134,3 +146,9 @@ extraEnvVars: | - name: S3_ENDPOINT value: https://the-endpoint ``` + +## Why am I getting the wrong container info? + +Core dump handler trys to find the container information for the crashing process based on the hostname of the pod. This works fine in most scenarios but when pods are created directly in multiple namespaces or the same Statefulsets are created in the same namespaces. + +The current recommendation is to create a unique name in both of those scenarios. [See issue 115](https://github.com/IBM/core-dump-handler/issues/115) \ No newline at end of file diff --git a/charts/core-dump-handler/README.md b/charts/core-dump-handler/README.md index a55ac83..d026b97 100644 --- a/charts/core-dump-handler/README.md +++ b/charts/core-dump-handler/README.md @@ -182,6 +182,14 @@ The agent pod has the following environment variables and these are all set by t "img" (Default): This is the value most crictls expect. "images": Digital Ocean, Newer OpenShift require this value +* COMP_TIMEOUT - The timeout for the composer in seconds. Defaults to 600. + + In testing ~ 3 mins per 512Mb so we have set it to 10 mins. + +* COMP_COMPRESSION - Enable compression Default: true + + Given the amount of time compression there is an option to disable it. + * CRIO_ENDPOINT - The CRIO endpoint to use. "unix:///run/containerd/containerd.sock" (Default): This is the default for most containerd nodes @@ -252,7 +260,9 @@ Composer * logLevel: The log level for the composer (Default "Warn") * ignoreCrio: Maps to the COMP_IGNORE_CRIO enviroment variable (Default false) * crioImageCmd: Maps to the COMP_CRIO_IMAGE_CMD enviroment variable (Default "img") -* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable +* timeout: Maps to the COMP_TIMEOUT environment variable ("Default 600) +* compression: Maps to the COMP_COMPRESSION environment varable (Default "true") +* filenameTemplate: Maps to COMP_FILENAME_TEMPLATE environment variable (Default {{uuid}}-dump-{{timestamp}}-{{hostname}}-{{exe_name}}-{{pid}}-{{signal}}) Possible Values: diff --git a/charts/core-dump-handler/templates/daemonset.yaml b/charts/core-dump-handler/templates/daemonset.yaml index b109c67..aa9a1e9 100644 --- a/charts/core-dump-handler/templates/daemonset.yaml +++ b/charts/core-dump-handler/templates/daemonset.yaml @@ -47,6 +47,10 @@ spec: value: {{ .Values.composer.crioImageCmd }} - name: COMP_POD_SELECTOR_LABEL value: {{ .Values.composer.podSelectorLabel }} + - name: COMP_TIMEOUT + value: {{ .Values.composer.timeout | quote }} + - name: COMP_COMPRESSION + value: {{ .Values.composer.compression | quote }} - name: DEPLOY_CRIO_CONFIG value: {{ .Values.daemonset.deployCrioConfig | quote }} - name: CRIO_ENDPOINT diff --git a/charts/core-dump-handler/values.schema.json b/charts/core-dump-handler/values.schema.json index 9b6e7c6..acd5cb6 100644 --- a/charts/core-dump-handler/values.schema.json +++ b/charts/core-dump-handler/values.schema.json @@ -115,6 +115,13 @@ }, "podSelectorLabel": { "type": "string" + }, + "timeout": { + "type": "integer", + "minimum": 120 + }, + "compression": { + "type": "boolean" } }, "required": [ @@ -122,7 +129,9 @@ "ignoreCrio", "logLevel", "logLength", - "filenameTemplate" + "filenameTemplate", + "timeout", + "compression" ], "title": "Composer" }, @@ -183,7 +192,7 @@ "hostContainerRuntimeEndpoint" ] } - } + } ], "properties": { "name": { @@ -316,4 +325,4 @@ "title": "ServiceAccount" } } -} +} \ No newline at end of file diff --git a/charts/core-dump-handler/values.yaml b/charts/core-dump-handler/values.yaml index 437b68d..896f8a2 100644 --- a/charts/core-dump-handler/values.yaml +++ b/charts/core-dump-handler/values.yaml @@ -3,7 +3,7 @@ replicaCount: 1 image: registry: quay.io repository: icdh/core-dump-handler - tag: v8.7.0 + tag: schema-updates pullPolicy: Always pullSecrets: [] request_mem: "64Mi" @@ -27,6 +27,8 @@ composer: filenameTemplate: "{uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}" logLength: 500 podSelectorLabel: "" + timeout: 600 + compression: true daemonset: name: "core-dump-handler" diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index ca986c5..5cb51ac 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -288,7 +288,7 @@ async fn main() -> Result<(), anyhow::Error> { async fn process_file(zip_path: &Path, bucket: &Bucket) { info!("Uploading: {}", zip_path.display()); - let f = File::open(&zip_path).expect("no file found"); + let f = File::open(zip_path).expect("no file found"); match f.try_lock(FileLockMode::Shared) { Ok(_) => { /* If we can lock then we are ok */ } @@ -305,7 +305,7 @@ async fn process_file(zip_path: &Path, bucket: &Bucket) { } } - let metadata = fs::metadata(&zip_path).expect("unable to read metadata"); + let metadata = fs::metadata(zip_path).expect("unable to read metadata"); info!("zip size is {}", metadata.len()); let path_str = match zip_path.to_str() { Some(v) => v, @@ -473,11 +473,15 @@ fn create_env_file(host_location: &str) -> Result<(), std::io::Error> { }); let log_length = env::var("LOG_LENGTH").unwrap_or_else(|_| "500".to_string()); let pod_selector_label = env::var("COMP_POD_SELECTOR_LABEL").unwrap_or_default(); + let timeout = env::var("COMP_TIMEOUT").unwrap_or_else(|_| "600".to_string()); + let compression = env::var("COMP_COMPRESSION") + .unwrap_or_else(|_| "true".to_string()) + .to_lowercase(); info!("Creating {} file with LOG_LEVEL={}", destination, loglevel); let mut env_file = File::create(destination)?; let text = format!( - "LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\n", - loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label + "LOG_LEVEL={}\nIGNORE_CRIO={}\nCRIO_IMAGE_CMD={}\nUSE_CRIO_CONF={}\nFILENAME_TEMPLATE={}\nLOG_LENGTH={}\nPOD_SELECTOR_LABEL={}\nTIMEOUT={}\nCOMPRESSION={}\n", + loglevel, ignore_crio, crio_image, use_crio_config, filename_template, log_length, pod_selector_label, timeout, compression ); info!("Writing composer .env \n{}", text); env_file.write_all(text.as_bytes())?; @@ -496,7 +500,7 @@ fn get_sysctl(name: &str) -> Result { info!("Getting sysctl for {}", name); let output = Command::new("sysctl") .env("PATH", get_path()) - .args(&["-n", name]) + .args(["-n", name]) .output()?; let lines = String::from_utf8(output.stdout)?; let line = lines.lines().take(1).next().unwrap_or(""); @@ -522,7 +526,7 @@ fn overwrite_sysctl(name: &str, value: &str) -> Result<(), anyhow::Error> { let s = format!("{}={}", name, value); let output = Command::new("sysctl") .env("PATH", get_path()) - .args(&["-w", s.as_str()]) + .args(["-w", s.as_str()]) .status()?; if !output.success() { let e = Error::InvalidOverWrite { diff --git a/core-dump-agent/tests/basic.rs b/core-dump-agent/tests/basic.rs index c9ce656..9fd6473 100644 --- a/core-dump-agent/tests/basic.rs +++ b/core-dump-agent/tests/basic.rs @@ -95,7 +95,7 @@ fn basic() -> Result<(), std::io::Error> { "FILENAME_TEMPLATE={uuid}-dump-{timestamp}-{hostname}-{exe_name}-{pid}-{signal}" )); assert!(env_content.contains("LOG_LENGTH=500")); - assert_eq!(env_content.lines().count(), 7); + assert_eq!(env_content.lines().count(), 9); //TODO: [No9] Test uploading of a corefile //TODO: [No9] Test remove option //TODO: [No9] Test sweep option diff --git a/core-dump-composer/Cargo.toml b/core-dump-composer/Cargo.toml index 922ad96..d7cb29c 100644 --- a/core-dump-composer/Cargo.toml +++ b/core-dump-composer/Cargo.toml @@ -13,7 +13,7 @@ uuid = { version = "1.1.0", features = ["serde", "v4"] } zip = "0.6.2" dotenv = "0.15.0" log = "0.4.14" -log4rs = { git = "https://github.com/No9/log4rs/", branch = "typemap-ors-fix" } +log4rs = "1.2.0" anyhow = "1.0.53" serde_json = "1.0.76" serde = { version = "1.0.134", features = ["derive"] } diff --git a/core-dump-composer/src/config.rs b/core-dump-composer/src/config.rs index f555ecb..9bc76a7 100644 --- a/core-dump-composer/src/config.rs +++ b/core-dump-composer/src/config.rs @@ -21,12 +21,13 @@ pub struct CoreConfig { pub pod_selector_label: String, pub use_crio_config: bool, pub ignore_crio: bool, + pub timeout: u32, + pub compression: bool, pub image_command: ImageCommand, pub bin_path: String, pub os_hostname: String, pub filename_template: String, pub params: CoreParams, - pub disable_compression: bool, } #[derive(Serialize)] @@ -39,7 +40,6 @@ pub struct CoreParams { pub directory: String, pub hostname: String, pub pathname: String, - pub timeout: u64, pub namespace: Option, pub podname: Option, pub uuid: Uuid, @@ -58,12 +58,12 @@ impl CoreConfig { let directory = matches.value_of("directory").unwrap_or("").to_string(); let hostname = matches.value_of("hostname").unwrap_or("").to_string(); let pathname = matches.value_of("pathname").unwrap_or("").to_string(); - let timeout = matches - .value_of("timeout") - .unwrap_or("600") - .parse::() - .unwrap(); - let disable_compression = matches.contains_id("disable-compression"); + // let timeout = matches + // .value_of("timeout") + // .unwrap_or("600") + // .parse::() + // .unwrap(); + // let disable_compression = matches.contains_id("disable-compression"); let uuid = Uuid::new_v4(); @@ -76,7 +76,6 @@ impl CoreConfig { directory, hostname, pathname, - timeout, namespace: None, podname: None, uuid, @@ -112,6 +111,14 @@ impl CoreConfig { .unwrap_or_else(|_| "false".to_string().to_lowercase()) .parse::() .unwrap(); + let compression = env::var("COMPRESSION") + .unwrap_or_else(|_| "true".to_string().to_lowercase()) + .parse::() + .unwrap(); + let timeout = env::var("TIMEOUT") + .unwrap_or_else(|_| "600".to_string()) + .parse::() + .unwrap(); let os_hostname = hostname::get() .unwrap_or_else(|_| OsString::from_str("unknown").unwrap_or_default()) .into_string() @@ -146,7 +153,8 @@ impl CoreConfig { filename_template, log_length, params, - disable_compression, + compression, + timeout, }) } diff --git a/core-dump-composer/src/main.rs b/core-dump-composer/src/main.rs index 74d1f3a..374c440 100644 --- a/core-dump-composer/src/main.rs +++ b/core-dump-composer/src/main.rs @@ -21,14 +21,13 @@ mod logging; fn main() -> Result<(), anyhow::Error> { let (send, recv) = channel(); let cc = config::CoreConfig::new()?; - let timeout = cc.params.timeout; - + let recv_time: u64 = cc.timeout as u64; thread::spawn(move || { let result = handle(cc); send.send(result).unwrap(); }); - let result = recv.recv_timeout(Duration::from_secs(timeout)); + let result = recv.recv_timeout(Duration::from_secs(recv_time)); match result { Ok(inner_result) => inner_result, @@ -111,10 +110,10 @@ fn handle(mut cc: config::CoreConfig) -> Result<(), anyhow::Error> { cc.set_podname(podname.to_string()); // Create the base zip file that we are going to put everything into - let compression_method = if cc.disable_compression { - zip::CompressionMethod::Stored - } else { + let compression_method = if cc.compression { zip::CompressionMethod::Deflated + } else { + zip::CompressionMethod::Stored }; let options = FileOptions::default() .compression_method(compression_method) diff --git a/core-dump-composer/tests/timeout.rs b/core-dump-composer/tests/timeout.rs index 48cf596..4d2d8f3 100644 --- a/core-dump-composer/tests/timeout.rs +++ b/core-dump-composer/tests/timeout.rs @@ -44,6 +44,7 @@ fn timeout_scenario() -> Result<(), std::io::Error> { .unwrap(); let cdc = Command::new("../target/debug/core-dump-composer") + .env("TIMEOUT", "1") .arg("-c") .arg("1000000000") .arg("-e") @@ -60,8 +61,6 @@ fn timeout_scenario() -> Result<(), std::io::Error> { .arg("1588462466") .arg("-h") .arg("crashing-app-699c49b4ff-86wrh") - .arg("--timeout") - .arg("1") .stdin(cat) .output() .expect("Couldn't execute");