diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index 9173c28..0699682 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -3,6 +3,7 @@ == 0.6.0 - unreleased :262: https://github.com/stackabletech/agent/pull/262[#262] +:263: https://github.com/stackabletech/agent/pull/263[#263] :267: https://github.com/stackabletech/agent/pull/267[#267] :270: https://github.com/stackabletech/agent/pull/270[#270] :273: https://github.com/stackabletech/agent/pull/273[#273] @@ -23,6 +24,16 @@ but not any longer with versions prior to v1.19 ({267}). * Error message improved which is logged if a systemd unit file cannot be created ({276}). +* Handling of service restarts moved from the Stackable agent to + systemd ({263}). + +=== Removed +* Check removed if a service starts up correctly within 10 seconds. + systemd manages restarts now and the Stackable agent cannot detect if + a service is in a restart loop ({263}). + +=== Fixed +* Systemd services in session mode are restarted after a reboot ({263}). == 0.5.0 - 2021-07-26 diff --git a/Cargo.lock b/Cargo.lock index f61b39f..3524532 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2834,6 +2834,7 @@ dependencies = [ "handlebars", "hostname", "indoc", + "json-patch", "k8s-openapi", "krator", "kube", diff --git a/Cargo.toml b/Cargo.toml index fb5788c..776dc59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ krator = { git = "https://github.com/stackabletech/krustlet.git", branch = "stac kube = { version= "0.48", default-features = false, features = ["derive", "native-tls"] } kubelet = { git = "https://github.com/stackabletech/krustlet.git", branch = "stackable_patches_v0.7.0", default-features = true, features= ["derive", "cli"] } # version = "0.7" Inflector = "0.11" +json-patch = "0.2" lazy_static = "1.4" log = "0.4" multimap = "0.8" diff --git a/docs/modules/ROOT/pages/services.adoc b/docs/modules/ROOT/pages/services.adoc index d743877..858a1ab 100644 --- a/docs/modules/ROOT/pages/services.adoc +++ b/docs/modules/ROOT/pages/services.adoc @@ -15,7 +15,3 @@ A pod which provides a service should never terminate on its own, so the command: - restartPolicy: Always - -After a container command is executed the agent waits for 10 seconds -before the container status is set to running. When all containers are -running, also the pod phase is switched from `Pending` to `Running`. diff --git a/src/provider/kubernetes/status.rs b/src/provider/kubernetes/status.rs index fc384de..3b356f2 100644 --- a/src/provider/kubernetes/status.rs +++ b/src/provider/kubernetes/status.rs @@ -1,7 +1,11 @@ //! Functions for patching the pod status +use anyhow::anyhow; use k8s_openapi::api::core::v1::Pod as KubePod; -use kube::{Api, Client}; +use kube::{ + api::{Patch, PatchParams}, + Api, Client, +}; use kubelet::{ container::{ContainerKey, Status}, pod::Pod, @@ -30,3 +34,39 @@ pub async fn patch_container_status( ); } } + +/// Patches the restart count of a container. 
+pub async fn patch_restart_count( + client: &Client, + pod: &Pod, + container_key: &ContainerKey, + restart_count: u32, +) -> anyhow::Result<()> { + let api: Api = Api::namespaced(client.clone(), pod.namespace()); + + let index = pod + .container_status_index(container_key) + .ok_or_else(|| anyhow!("Container not found"))?; + + let container_type = if container_key.is_init() { + "initContainer" + } else { + "container" + }; + + let patch = json_patch::Patch(vec![json_patch::PatchOperation::Replace( + json_patch::ReplaceOperation { + path: format!("/status/{}Statuses/{}/restartCount", container_type, index), + value: restart_count.into(), + }, + )]); + + api.patch_status( + pod.name(), + &PatchParams::default(), + &Patch::<()>::Json(patch), + ) + .await?; + + Ok(()) +} diff --git a/src/provider/states/pod/running.rs b/src/provider/states/pod/running.rs index 6cd88aa..2785ba5 100644 --- a/src/provider/states/pod/running.rs +++ b/src/provider/states/pod/running.rs @@ -11,17 +11,15 @@ use kubelet::{ use log::{debug, info, trace, warn}; use tokio::time::Duration; -use super::{installing::Installing, starting::Starting, terminated::Terminated}; +use super::terminated::Terminated; use crate::provider::{ - kubernetes::{ - accessor::{restart_policy, RestartPolicy}, - status::patch_container_status, - }, + kubernetes::status::{patch_container_status, patch_restart_count}, + systemdmanager::service::ServiceState, PodHandle, PodState, ProviderState, }; #[derive(Debug, TransitionTo)] -#[transition_to(Installing, Starting, Terminated)] +#[transition_to(Terminated)] pub struct Running { pub transition_time: Time, } @@ -66,7 +64,7 @@ impl State for Running { // Interruption of this loop is triggered externally by the Krustlet code when // - the pod which this state machine refers to gets deleted // - Krustlet shuts down - while !running_containers.is_empty() && !container_failed { + while !running_containers.is_empty() { tokio::time::sleep(Duration::from_secs(10)).await; trace!( "Checking if service {} is still running.", @@ -79,23 +77,23 @@ impl State for Running { for (container_key, container_handle) in running_containers.iter() { let systemd_service = &container_handle.systemd_service; - match systemd_service.is_running().await { - Ok(true) => {} - Ok(false) => match systemd_service.failed().await { - Ok(true) => failed_containers - .push((container_key.to_owned(), container_handle.to_owned())), - Ok(false) => succeeded_containers - .push((container_key.to_owned(), container_handle.to_owned())), - Err(dbus_error) => warn!( - "Error querying Failed property for Unit [{}] of service [{}]: [{}]", + match systemd_service.service_state().await { + Ok(ServiceState::Created) => { + warn!( + "The unit [{}] of service [{}] was not started. \ + This should not happen. 
Ignoring this state for now.", systemd_service.file(), - pod_state.service_name, - dbus_error - ), - }, + pod_state.service_name + ); + } + Ok(ServiceState::Started) => {} + Ok(ServiceState::Succeeded) => succeeded_containers + .push((container_key.to_owned(), container_handle.to_owned())), + Ok(ServiceState::Failed) => failed_containers + .push((container_key.to_owned(), container_handle.to_owned())), Err(dbus_error) => { warn!( - "Error querying ActiveState for Unit [{}] of service [{}]: [{}].", + "Error querying state for unit [{}] of service [{}]: [{}].", systemd_service.file(), pod_state.service_name, dbus_error @@ -132,29 +130,38 @@ impl State for Running { ) .await; running_containers.remove(container_key); + container_failed = true; } - for container_handle in running_containers.values() { + for (container_key, container_handle) in running_containers.iter() { trace!( "Unit [{}] of service [{}] still running ...", container_handle.service_unit, pod_state.service_name ); - } - - container_failed = !failed_containers.is_empty(); - } - if container_failed { - if restart_policy(&pod) == RestartPolicy::Never { - Transition::next(self, Terminated { successful: false }) - } else { - debug!("Restart policy is set to restart, starting..."); - Transition::next(self, Starting {}) + match container_handle.systemd_service.restart_count().await { + Ok(restart_count) => { + if let Err(error) = + patch_restart_count(&client, &pod, container_key, restart_count).await + { + warn!("Could not patch restart count: {}", error); + } + } + Err(error) => warn!( + "Could not retrieve restart count from unit [{}]: {}", + container_handle.service_unit, error + ), + } } - } else { - Transition::next(self, Terminated { successful: true }) } + + Transition::next( + self, + Terminated { + successful: !container_failed, + }, + ) } async fn status(&self, pod_state: &mut PodState, _pod: &Pod) -> anyhow::Result { diff --git a/src/provider/states/pod/starting.rs b/src/provider/states/pod/starting.rs index 6ef61a2..e3b4a89 100644 --- a/src/provider/states/pod/starting.rs +++ b/src/provider/states/pod/starting.rs @@ -1,14 +1,10 @@ use super::running::Running; use crate::provider::{ - kubernetes::{ - accessor::{restart_policy, RestartPolicy}, - status::patch_container_status, - }, - systemdmanager::service::SystemdService, - PodHandle, PodState, ProviderState, + kubernetes::status::patch_container_status, systemdmanager::service::ServiceState, PodHandle, + PodState, ProviderState, }; -use anyhow::{anyhow, Result}; +use anyhow::Result; use kube::{ api::{Patch, PatchParams}, Api, Client, }; @@ -17,8 +13,6 @@ use kubelet::pod::{Pod, PodKey}; use kubelet::{container::Status, pod::state::prelude::*}; use log::{debug, error, info}; use serde_json::json; -use std::time::Instant; -use tokio::time::{self, Duration}; #[derive(Default, Debug, TransitionTo)] #[transition_to(Running)] @@ -50,9 +44,7 @@ impl State for Starting { /// Starts the service units for the containers of the given pod. /// -/// The units are started and enabled if they are not already running. -/// The startup is considered successful if the unit is still running -/// after 10 seconds. +/// The units are started and enabled if they were not already started.
async fn start_service_units( shared: SharedState, pod_state: &PodState, @@ -72,32 +64,19 @@ async fn start_service_units( for (container_key, container_handle) in pod_handle.unwrap_or_default() { let systemd_service = &container_handle.systemd_service; + let service_unit = &container_handle.service_unit; - if systemd_service.is_running().await? { - debug!( - "Unit [{}] for service [{}] is already running. Skip startup.", - systemd_service.file(), - &pod_state.service_name - ); - } else { - let service_unit = &container_handle.service_unit; - + if systemd_service.service_state().await? == ServiceState::Created { info!("Starting systemd unit [{}]", service_unit); systemd_manager.start(service_unit).await?; info!("Enabling systemd unit [{}]", service_unit); systemd_manager.enable(service_unit).await?; - - if restart_policy(pod) == RestartPolicy::Always { - // TODO: does this need to be configurable, or ar we happy with a hard coded value - // for now. I've briefly looked at the podspec and couldn't identify a good field - // to use for this - also, currently this starts containers (= systemd units) in - // order and waits 10 seconds for every unit, so a service with five containers - // would take 50 seconds until it reported running - which is totally fine in case - // the units actually depend on each other, but a case could be made for waiting - // once at the end - await_startup(systemd_service, Duration::from_secs(10)).await?; - } + } else { + debug!( + "Unit [{}] for service [{}] was already started. Skipping startup.", + service_unit, &pod_state.service_name + ); } add_annotation( @@ -114,35 +93,6 @@ async fn start_service_units( Ok(()) } -/// Checks if the given service unit is still running after the given duration. -async fn await_startup(systemd_service: &SystemdService, duration: Duration) -> Result<()> { - let start_time = Instant::now(); - while start_time.elapsed() < duration { - time::sleep(Duration::from_secs(1)).await; - - debug!( - "Checking if unit [{}] is still up and running.", - systemd_service.file() - ); - - if systemd_service.is_running().await? { - debug!( - "Service [{}] still running after [{}] seconds", - systemd_service.file(), - start_time.elapsed().as_secs() - ); - } else { - return Err(anyhow!( - "Unit [{}] stopped unexpectedly during startup after [{}] seconds.", - systemd_service.file(), - start_time.elapsed().as_secs() - )); - } - } - - Ok(()) -} - /// Adds an annotation to the given pod. /// /// If there is already an annotation with the given key then the value diff --git a/src/provider/systemdmanager/service.rs b/src/provider/systemdmanager/service.rs index 57a1b78..2804be8 100644 --- a/src/provider/systemdmanager/service.rs +++ b/src/provider/systemdmanager/service.rs @@ -1,8 +1,22 @@ //! Exposes methods from the systemd unit and service interfaces. -use super::systemd1_api::{ActiveState, AsyncManagerProxy, AsyncServiceProxy, AsyncUnitProxy}; -use crate::provider::systemdmanager::systemd1_api::ServiceResult; +use super::systemd1_api::{ + ActiveState, AsyncManagerProxy, AsyncServiceProxy, AsyncUnitProxy, SUB_STATE_SERVICE_EXITED, +}; use anyhow::anyhow; +/// Represents the state of a service unit object. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ServiceState { + /// The service was not started yet. + Created, + /// The service was started and is currently running or restarting. + Started, + /// The service terminated successfully and will not be restarted. + Succeeded, + /// The service terminated unsuccessfully and will not be restarted. 
+ Failed, +} + /// Stores proxies of a systemd unit and service #[derive(Clone, Debug)] pub struct SystemdService { @@ -49,34 +63,60 @@ impl SystemdService { self.file.clone() } - /// Checks if the ActiveState is set to active. - pub async fn is_running(&self) -> anyhow::Result<bool> { - self.unit_proxy - .active_state() - .await - .map(|state| state == ActiveState::Active) - .map_err(|error| { - anyhow!( - "ActiveState of systemd unit [{}] cannot be retrieved: {}", - self.file, - error - ) - }) + /// Returns a coarse-grained state of the service unit object. + /// + /// It is assumed that RemainAfterExit is set to "yes" in the given + /// unit if the service can terminate. Otherwise it would not be + /// possible to distinguish between "inactive and never run" and + /// "inactive and terminated successfully". + pub async fn service_state(&self) -> anyhow::Result<ServiceState> { + let active_state = self.unit_proxy.active_state().await?; + + let service_state = match active_state { + ActiveState::Inactive => { + // ActiveState "inactive" means in general that the + // previous run was successful or no previous run has + // taken place yet. If RemainAfterExit is set to "yes" + // then a successfully terminated service stays in + // ActiveState "active" and only a service which was not + // started before is in ActiveState "inactive". It is + // assumed here that RemainAfterExit is enabled. + ServiceState::Created + } + ActiveState::Active => { + let sub_state = self.unit_proxy.sub_state().await?; + if sub_state == SUB_STATE_SERVICE_EXITED { + // The service terminated successfully (otherwise + // ActiveState would be set to "failed") and will + // not be restarted (otherwise ActiveState would be + // set to "activating") and RemainAfterExit is set + // to "yes" (otherwise ActiveState would be set to + // "inactive"). It is assumed here that + // RemainAfterExit is enabled. + ServiceState::Succeeded + } else { + ServiceState::Started + } + } + ActiveState::Failed => { + // The service terminated unsuccessfully and will not be + // restarted (otherwise ActiveState would be set to + // "activating"). + ServiceState::Failed + } + ActiveState::Reloading => ServiceState::Started, + ActiveState::Activating => ServiceState::Started, + ActiveState::Deactivating => ServiceState::Started, + }; + + Ok(service_state) } - /// Checks if the result is not set to success. - pub async fn failed(&self) -> anyhow::Result<bool> { + pub async fn restart_count(&self) -> anyhow::Result<u32> { self.service_proxy - .result() + .nrestarts() .await - .map(|state| state != ServiceResult::Success) - .map_err(|error| { - anyhow!( - "Result of systemd unit [{}] cannot be retrieved: {}", - self.file, - error - ) - }) + .map_err(|e| anyhow!("Error receiving NRestarts of unit [{}]. {}", self.file, e)) } /// Retrieves the current invocation ID. diff --git a/src/provider/systemdmanager/systemd1_api.rs b/src/provider/systemdmanager/systemd1_api.rs index 8a0cf23..c085fdb 100644 --- a/src/provider/systemdmanager/systemd1_api.rs +++ b/src/provider/systemdmanager/systemd1_api.rs @@ -1,4 +1,7 @@ //! Binding to the D-Bus interface of systemd +//! +//! Further documentation can be found in the +//! [manual](https://www.freedesktop.org/software/systemd/man/org.freedesktop.systemd1).
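Taken together, the restart_count accessor above and the patch_restart_count helper in src/provider/kubernetes/status.rs keep the restartCount in the pod status in sync with the NRestarts counter that systemd maintains for the unit. As a rough, self-contained sketch, the following program builds the same patch for a hypothetical regular (non-init) container at status index 0 with an assumed restart count of 3; it only needs the json-patch and serde_json crates, and the index, count, and container type are illustrative values, not taken from the patch itself:

// Hypothetical example values: a regular container at index 0 whose unit
// reports NRestarts=3; init containers would use "initContainer" instead.
fn main() {
    let container_type = "container";
    let index = 0;
    let restart_count: u32 = 3;

    // Same construction as in patch_restart_count above.
    let patch = json_patch::Patch(vec![json_patch::PatchOperation::Replace(
        json_patch::ReplaceOperation {
            path: format!("/status/{}Statuses/{}/restartCount", container_type, index),
            value: restart_count.into(),
        },
    )]);

    // Serializes to an RFC 6902 document along the lines of
    // [{"op":"replace","path":"/status/containerStatuses/0/restartCount","value":3}]
    // which kube submits with content type application/json-patch+json when it is
    // wrapped in Patch::Json, as done in the patch_status call above.
    println!("{}", serde_json::to_string(&patch).unwrap());
}

On a node, the same counter can also be inspected manually with "systemctl show --property=NRestarts <unit>" (with --user for units running in session mode).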
use fmt::Display; use inflector::cases::kebabcase; use serde::{de::Visitor, Deserialize, Serialize}; @@ -368,6 +371,11 @@ pub enum ActiveState { impl_tryfrom_ownedvalue_for_enum!(ActiveState); +/// Sub state of a service unit object which is set if the service +/// terminated successfully but is still active due to the +/// RemainAfterExit setting. +pub const SUB_STATE_SERVICE_EXITED: &str = "exited"; + /// Unique ID for a runtime cycle of a unit #[derive(Clone, Debug, Eq, PartialEq)] pub struct InvocationId(Vec); @@ -404,6 +412,23 @@ trait Unit { #[dbus_proxy(property)] fn active_state(&self) -> zbus::Result; + /// SubState encodes states of the same state machine that + /// ActiveState covers, but knows more fine-grained states that are + /// unit-type-specific. Where ActiveState only covers six high-level + /// states, SubState covers possibly many more low-level + /// unit-type-specific states that are mapped to the six high-level + /// states. Note that multiple low-level states might map to the + /// same high-level state, but not vice versa. Not all high-level + /// states have low-level counterparts on all unit types. At this + /// point the low-level states are not documented here, and are more + /// likely to be extended later on than the common high-level + /// states. + /// + /// Possible sub states can be found in the source code of systemd: + /// https://github.com/systemd/systemd/blob/v249/src/basic/unit-def.h + #[dbus_proxy(property)] + fn sub_state(&self) -> zbus::Result; + /// Unique ID for a runtime cycle of a unit #[dbus_proxy(property, name = "InvocationID")] fn invocation_id(&self) -> zbus::Result; @@ -482,6 +507,10 @@ trait Service { /// state (see ['ActiveState::Failed`]). #[dbus_proxy(property)] fn result(&self) -> zbus::Result; + + /// Number of restarts + #[dbus_proxy(property, name = "NRestarts")] + fn nrestarts(&self) -> zbus::Result; } /// A systemd job object diff --git a/src/provider/systemdmanager/systemdunit.rs b/src/provider/systemdmanager/systemdunit.rs index c57d4e1..d3d1d95 100644 --- a/src/provider/systemdmanager/systemdunit.rs +++ b/src/provider/systemdmanager/systemdunit.rs @@ -6,6 +6,7 @@ use kubelet::pod::Pod; use crate::provider::error::StackableError; use crate::provider::error::StackableError::PodValidationError; +use crate::provider::kubernetes::accessor::{restart_policy, RestartPolicy}; use crate::provider::states::pod::creating_config::CreatingConfig; use crate::provider::states::pod::PodState; use crate::provider::systemdmanager::manager::UnitTypes; @@ -39,6 +40,97 @@ lazy_static! { Regex::new("^[a-zA-Z_][a-zA-Z0-9_-]{0,30}$").unwrap(); } +/// Boolean arguments used in unit files +#[derive(Clone, Debug, Display, Eq, PartialEq)] +#[strum(serialize_all = "kebab-case")] +pub enum Boolean { + Yes, + No, +} + +impl From for Boolean { + fn from(value: bool) -> Self { + if value { + Boolean::Yes + } else { + Boolean::No + } + } +} + +/// Configures whether the service shall be restarted when the service +/// process exits, is killed, or a timeout is reached. +/// +/// The service process may be the main service process, but it may also +/// be one of the processes specified with `ExecStartPre=`, +/// `ExecStartPost=`, `ExecStop=`, `ExecStopPost=`, or `ExecReload=`. +/// When the death of the process is a result of systemd operation (e.g. +/// service stop or restart), the service will not be restarted. +/// Timeouts include missing the watchdog "keep-alive ping" deadline and +/// a service start, reload, and stop operation timeouts. 
+/// +/// As exceptions to the setting, the service will not be restarted if +/// the exit code or signal is specified in `RestartPreventExitStatus=` +/// or the service is stopped with `systemctl stop` or an equivalent +/// operation. Also, the services will always be restarted if the exit +/// code or signal is specified in `RestartForceExitStatus=`. +/// +/// Note that service restart is subject to unit start rate limiting +/// configured with `StartLimitIntervalSec=` and `StartLimitBurst=`. A +/// restarted service enters the failed state only after the start +/// limits are reached. +/// +/// Setting this to "RestartOption::OnFailure" is the recommended choice +/// for long-running services, in order to increase reliability by +/// attempting automatic recovery from errors. For services that shall +/// be able to terminate on their own choice (and avoid immediate +/// restarting), "RestartOption::OnAbnormal" is an alternative choice. +#[derive(Clone, Debug, Display, Eq, PartialEq)] +#[strum(serialize_all = "kebab-case")] +pub enum RestartOption { + /// The service will be restarted regardless of whether it exited + /// cleanly or not, got terminated abnormally by a signal, or hit a + /// timeout. + Always, + /// The service will not be restarted. + No, + /// The service will be restarted when the process is terminated by + /// a signal (including on core dump, excluding the signals + /// `SIGHUP`, `SIGINT`, `SIGTERM`, or `SIGPIPE`), when an operation + /// times out, or when the watchdog timeout is triggered. + OnAbnormal, + /// The service will be restarted only if the service process exits + /// due to an uncaught signal not specified as a clean exit status. + OnAbort, + /// The service will be restarted when the process exits with a + /// non-zero exit code, is terminated by a signal (including on core + /// dump, but excluding the signals `SIGHUP`, `SIGINT`, `SIGTERM`, + /// or `SIGPIPE`), when an operation (such as service reload) times + /// out, and when the configured watchdog timeout is triggered. + OnFailure, + /// The service will be restarted only when the service process + /// exits cleanly. In this context, a clean exit means any of the + /// following: + /// - exit code of 0; + /// - for types other than Type=oneshot, one of the signals + /// `SIGHUP`, `SIGINT`, `SIGTERM`, or `SIGPIPE`; + /// - exit statuses and signals specified in SuccessExitStatus=. + OnSuccess, + /// The service will be restarted only if the watchdog timeout for + /// the service expires. 
+ OnWatchdog, +} + +impl From for RestartOption { + fn from(restart_policy: RestartPolicy) -> Self { + match restart_policy { + RestartPolicy::Always => RestartOption::Always, + RestartPolicy::OnFailure => RestartOption::OnFailure, + RestartPolicy::Never => RestartOption::OnAbnormal, + } + } +} + /// A struct that represents an individual systemd unit #[derive(Clone, Debug)] pub struct SystemDUnit { @@ -135,7 +227,15 @@ impl SystemDUnit { } // This one is mandatory, as otherwise enabling the unit fails - unit.set_property(Section::Install, "WantedBy", "multi-user.target"); + unit.set_property( + Section::Install, + "WantedBy", + if user_mode { + "default.target" + } else { + "multi-user.target" + }, + ); Ok(unit) } @@ -205,13 +305,28 @@ impl SystemDUnit { unit.set_property(Section::Service, "TimeoutStopSec", &termination_timeout); - if let Some(stop_timeout) = pod_spec.termination_grace_period_seconds { - unit.set_property( - Section::Service, - "TimeoutStopSec", - stop_timeout.to_string().as_str(), - ); - } + let restart_option = RestartOption::from(restart_policy(pod)); + unit.set_restart_option(&restart_option); + + // Relieve the machine a little bit on restart loops but choose + // a moderate value so that tests are not slowed down too much. + unit.set_restart_sec_option(2); + + // Adhere to the given restart policy and do not limit the + // number of restarts. + unit.set_start_limit_interval_sec_option(0); + + // If the service can terminate successfully then + // RemainAfterExit must be set to "yes" so that the state of the + // service unit object can be reliably determined after + // termination, see manager::SystemdManager::service_state. + // + // If Restart is set to "always" then the service cannot + // terminate and there is no need to determine the state after + // termination. Furthermore RemainAfterExit must not be set + // because otherwise the Restart option would be ignored when + // the service returns a successful return code. + unit.set_remain_after_exit_option((restart_option != RestartOption::Always).into()); if let Some(user_name) = SystemDUnit::get_user_name_from_pod_security_context(pod)? { if !user_mode { @@ -224,6 +339,34 @@ impl SystemDUnit { Ok(unit) } + /// Configures whether the service shall be restarted when the + /// service process exits, is killed, or a timeout is reached. + fn set_restart_option(&mut self, setting: &RestartOption) { + self.set_property(Section::Service, "Restart", &setting.to_string()); + } + + /// Configures the time to sleep in seconds before restarting a + /// service (as configured with [set_restart_option]). Defaults to + /// 100ms. + fn set_restart_sec_option(&mut self, seconds: u32) { + self.set_property(Section::Service, "RestartSec", &seconds.to_string()); + } + + /// Configures unit start rate limiting. Units which are started too + /// often within the given time span are not permitted to start any + /// more. The allowed number of restarts can be set with + /// "StartLimitBurst". May be set to 0 to disable any kind of rate + /// limiting. + fn set_start_limit_interval_sec_option(&mut self, seconds: u32) { + self.set_property(Section::Unit, "StartLimitIntervalSec", &seconds.to_string()); + } + + /// Causes systemd to consider the unit to be active if the start + /// action exited successfully. 
+ fn set_remain_after_exit_option(&mut self, setting: Boolean) { + self.set_property(Section::Service, "RemainAfterExit", &setting.to_string()); + } + fn get_user_name_from_pod_security_context(pod: &Pod) -> Result, StackableError> { let validate = |user_name| { if USER_NAME_PATTERN.is_match(user_name) { @@ -501,7 +644,13 @@ mod test { runAsUserName: pod-user", "stackable.service", indoc! {" + [Unit] + StartLimitIntervalSec=0 + [Service] + RemainAfterExit=no + Restart=always + RestartSec=2 TimeoutStopSec=30 User=pod-user"} )] @@ -535,11 +684,15 @@ mod test { indoc! {r#" [Unit] Description=default-stackable-test-container + StartLimitIntervalSec=0 [Service] Environment="LOG_DIR=/var/log/default-stackable" Environment="LOG_LEVEL=INFO" ExecStart=start.sh arg /etc/default-stackable + RemainAfterExit=no + Restart=always + RestartSec=2 StandardError=journal StandardOutput=journal TimeoutStopSec=30 @@ -570,15 +723,19 @@ mod test { indoc! {r#" [Unit] Description=default-stackable-test-container + StartLimitIntervalSec=0 [Service] ExecStart=start.sh + RemainAfterExit=no + Restart=always + RestartSec=2 StandardError=journal StandardOutput=journal TimeoutStopSec=30 [Install] - WantedBy=multi-user.target"#} + WantedBy=default.target"#} )] #[case::set_termination_timeout( BusType::System, @@ -592,9 +749,37 @@ mod test { containers: []", "stackable.service", indoc! {" + [Unit] + StartLimitIntervalSec=0 + [Service] + RemainAfterExit=no + Restart=always + RestartSec=2 TimeoutStopSec=10"} )] + #[case::set_restart_policy( + BusType::System, + " + apiVersion: v1 + kind: Pod + metadata: + name: stackable + spec: + containers: [] + restartPolicy: OnFailure", + "stackable.service", + indoc! {" + [Unit] + StartLimitIntervalSec=0 + + [Service] + RemainAfterExit=yes + Restart=on-failure + RestartSec=2 + TimeoutStopSec=30" + } + )] fn create_unit_from_pod( #[case] bus_type: BusType,