From 7aa30ff849e7e187a68585cf1ad529d65f0d077b Mon Sep 17 00:00:00 2001 From: Anton Whalley Date: Wed, 21 Dec 2022 08:52:56 +0000 Subject: [PATCH 1/5] fix: scheduler crash Signed-off-by: Anton Whalley --- Cargo.lock | 2 +- charts/core-dump-handler/values.yaml | 4 ++-- core-dump-agent/src/main.rs | 35 ++++++++++++---------------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ffaafd5..679867e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -269,7 +269,7 @@ checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" [[package]] name = "core-dump-agent" -version = "8.6.0" +version = "8.8.0" dependencies = [ "advisory-lock", "anyhow", diff --git a/charts/core-dump-handler/values.yaml b/charts/core-dump-handler/values.yaml index bd47904..5d490e9 100644 --- a/charts/core-dump-handler/values.yaml +++ b/charts/core-dump-handler/values.yaml @@ -41,8 +41,8 @@ daemonset: suidDumpable: 2 vendor: default # interval: 60000 - # schedule: "1/60 * * * * *" - useINotify: true + schedule: "1/1 * * * * *" + # useINotify: false deployCrioConfig: false includeCrioExe: false # S3 access diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index 4567be4..b43aff7 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -18,7 +18,6 @@ use std::process; use std::process::Command; use std::time::Duration; use thiserror::Error; -use tokio::runtime::Handle; use tokio_cron_scheduler::{Job, JobScheduler}; #[allow(dead_code)] @@ -59,7 +58,7 @@ async fn main() -> Result<(), anyhow::Error> { env_logger::Builder::from_env(Env::default().default_filter_or("info")).init(); let host_dir = env::var("HOST_DIR").unwrap_or_else(|_| DEFAULT_BASE_DIR.to_string()); - let core_dir = env::var("CORE_DIR").unwrap_or_else(|_| DEFAULT_CORE_DIR.to_string()); + let core_dir_command = env::var("CORE_DIR").unwrap_or_else(|_| DEFAULT_CORE_DIR.to_string()); let suid = env::var("SUID_DUMPABLE").unwrap_or_else(|_| DEFAULT_SUID_DUMPABLE.to_string()); let deploy_crio_config = env::var("DEPLOY_CRIO_CONFIG") .unwrap_or_else(|_| "false".to_string()) @@ -94,9 +93,8 @@ async fn main() -> Result<(), anyhow::Error> { info!("Uploading {}", file); process_file(p, &bucket).await; } else { - let core_store = core_dir.clone(); - info!("Uploading all content in {}", core_store); - run_polling_agent(core_store.as_str()).await; + info!("Uploading all content in {}", core_dir_command); + run_polling_agent().await; } process::exit(0); } @@ -119,7 +117,7 @@ async fn main() -> Result<(), anyhow::Error> { format!("{}/core_pattern.bak", host_location).as_str(), format!( "|{}/{} -c=%c -e=%e -p=%p -s=%s -t=%t -d={} -h=%h -E=%E", - host_location, CDC_NAME, core_dir + host_location, CDC_NAME, core_dir_command ) .as_str(), )?; @@ -135,8 +133,6 @@ async fn main() -> Result<(), anyhow::Error> { &suid, )?; - let core_location = core_dir.clone(); - create_env_file(host_location)?; // Run polling agent on startup to clean up files. @@ -155,7 +151,7 @@ async fn main() -> Result<(), anyhow::Error> { std::thread::sleep(Duration::from_millis(1000)); } } else { - run_polling_agent(core_location.as_str()).await; + run_polling_agent().await; } if !interval.is_empty() && !schedule.is_empty() { @@ -180,7 +176,6 @@ async fn main() -> Result<(), anyhow::Error> { } } - let notify_location = core_location.clone(); if !schedule.is_empty() { info!("Schedule Initialising with: {}", schedule); let sched = match JobScheduler::new().await { @@ -190,12 +185,11 @@ async fn main() -> Result<(), anyhow::Error> { panic!("Schedule Creation Failed with {}", e) } }; - let s_job = match Job::new(schedule.as_str(), move |_uuid, _l| { - let handle = Handle::current(); - let core_str = core_location.clone(); - handle.spawn(async move { - run_polling_agent(&core_str).await; - }); + + let s_job = match Job::new_async(schedule.as_str(), move |_uuid, _l| { + Box::pin(async move { + run_polling_agent().await; + }) }) { Ok(v) => v, Err(e) => { @@ -231,14 +225,14 @@ async fn main() -> Result<(), anyhow::Error> { } }; info!("INotify Initialised..."); - match inotify.add_watch(¬ify_location, WatchMask::CLOSE) { + match inotify.add_watch(&core_dir_command, WatchMask::CLOSE) { Ok(_) => {} Err(e) => { error!("Add watch failed: {}", e); panic!("Add watch failed: {}", e) } }; - info!("INotify watching : {}", notify_location); + info!("INotify watching : {}", core_dir_command); let mut buffer = [0; 4096]; loop { let events = match inotify.read_events_blocking(&mut buffer) { @@ -264,7 +258,7 @@ async fn main() -> Result<(), anyhow::Error> { Some(s) => { let file = format!( "{}/{}", - notify_location, + core_dir_command, s.to_str().unwrap_or_default() ); let p = Path::new(&file); @@ -389,7 +383,8 @@ fn get_bucket() -> Result { Ok(Bucket::new(&s3.bucket, s3.region, s3.credentials)?.with_path_style()) } -async fn run_polling_agent(core_location: &str) { +async fn run_polling_agent() { + let core_location = env::var("CORE_DIR").unwrap_or_else(|_| DEFAULT_CORE_DIR.to_string()); info!("Executing Agent with location : {}", core_location); let bucket = match get_bucket() { From 9f1187533a53b0a1b9e8143b108560a01e269bd7 Mon Sep 17 00:00:00 2001 From: Anton Whalley Date: Thu, 22 Dec 2022 13:35:48 +0000 Subject: [PATCH 2/5] fix: update tag in test Signed-off-by: Anton Whalley --- charts/core-dump-handler/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/core-dump-handler/values.yaml b/charts/core-dump-handler/values.yaml index 5d490e9..5b640a3 100644 --- a/charts/core-dump-handler/values.yaml +++ b/charts/core-dump-handler/values.yaml @@ -3,7 +3,7 @@ replicaCount: 1 image: registry: quay.io repository: icdh/core-dump-handler - tag: v8.8.0 + tag: scheduler-fix pullPolicy: Always pullSecrets: [] request_mem: "64Mi" @@ -42,7 +42,7 @@ daemonset: vendor: default # interval: 60000 schedule: "1/1 * * * * *" - # useINotify: false + # useINotify: true deployCrioConfig: false includeCrioExe: false # S3 access From 80794db9218cebfb49a01855e1ea6c7ddf7a9b0c Mon Sep 17 00:00:00 2001 From: Anton Whalley Date: Thu, 22 Dec 2022 17:05:15 +0000 Subject: [PATCH 3/5] fix: stop falling through on schedule Signed-off-by: Anton Whalley --- core-dump-agent/src/main.rs | 3 +++ integration/run-ibm.sh | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index b43aff7..d505a4a 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -212,6 +212,9 @@ async fn main() -> Result<(), anyhow::Error> { panic!("Schedule Start failed, {:#?}", e); } }; + loop { + std::thread::sleep(Duration::from_millis(1000)); + } } if use_inotify == "true" { diff --git a/integration/run-ibm.sh b/integration/run-ibm.sh index 07aeb01..c29168f 100755 --- a/integration/run-ibm.sh +++ b/integration/run-ibm.sh @@ -1,8 +1,9 @@ #! /bin/bash cd ../ +set -a export $(grep -v '^#' .env | xargs) - +set +a cd ./charts/core-dump-handler helm install core-dump-handler . --create-namespace --namespace observe \ From d79266e7b595bb4d0b112acf34d5a5409573f734 Mon Sep 17 00:00:00 2001 From: Anton Whalley Date: Thu, 22 Dec 2022 17:10:17 +0000 Subject: [PATCH 4/5] fix: tighter loop Signed-off-by: Anton Whalley --- core-dump-agent/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index d505a4a..522a11e 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -213,7 +213,7 @@ async fn main() -> Result<(), anyhow::Error> { } }; loop { - std::thread::sleep(Duration::from_millis(1000)); + std::thread::sleep(Duration::from_millis(100)); } } From 869c90e5f35547f2ac58a07044e5b397a006dc3a Mon Sep 17 00:00:00 2001 From: Anton Whalley Date: Thu, 22 Dec 2022 17:34:48 +0000 Subject: [PATCH 5/5] fix: log schedule Signed-off-by: Anton Whalley --- core-dump-agent/src/main.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/core-dump-agent/src/main.rs b/core-dump-agent/src/main.rs index 522a11e..a2d7bfa 100644 --- a/core-dump-agent/src/main.rs +++ b/core-dump-agent/src/main.rs @@ -186,9 +186,16 @@ async fn main() -> Result<(), anyhow::Error> { } }; - let s_job = match Job::new_async(schedule.as_str(), move |_uuid, _l| { + let s_job = match Job::new_async(schedule.as_str(), move |uuid, mut l| { Box::pin(async move { - run_polling_agent().await; + let next_tick = l.next_tick_for_job(uuid).await; + match next_tick { + Ok(Some(ts)) => { + info!("Next scheduled run {:?}", ts); + run_polling_agent().await; + } + _ => warn!("Could not get next tick for job"), + } }) }) { Ok(v) => v,