Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 25 additions & 25 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 13 additions & 7 deletions simvue/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -990,17 +990,23 @@ def get_alerts(
RuntimeError
if there was a failure retrieving data from the server
"""

if not run_id:
if critical_only:
raise RuntimeError(
"critical_only is ambiguous when returning alerts with no run ID specified."
)
return [alert.name if names_only else alert for _, alert in Alert.get()] # type: ignore

return [
alert.get("name")
if names_only
else Alert(identifier=alert.get("id"), **alert)
_alerts = [
Alert(identifier=alert.get("id"), **alert)
for alert in Run(identifier=run_id).get_alert_details()
if not critical_only or alert["status"].get("current") == "critical"
] # type: ignore
]

return [
alert.name if names_only else alert
for alert in _alerts
if not critical_only or alert.get_status(run_id) == "critical"
]

@prettify_pydantic
@pydantic.validate_call
Expand Down
6 changes: 4 additions & 2 deletions simvue/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,9 +348,11 @@ def _update_alerts(self) -> None:
if self._runner._dispatcher:
self._runner._dispatcher.purge()

self._runner.log_alert(self._alert_ids[proc_id], "critical")
self._runner.log_alert(
identifier=self._alert_ids[proc_id], state="critical"
)
else:
self._runner.log_alert(self._alert_ids[proc_id], "ok")
self._runner.log_alert(identifier=self._alert_ids[proc_id], state="ok")

_current_time: float = 0
while (
Expand Down
8 changes: 6 additions & 2 deletions simvue/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,18 @@ def get_process_memory(processes: list[psutil.Process]) -> int:
return rss


def get_process_cpu(processes: list[psutil.Process]) -> int:
def get_process_cpu(
processes: list[psutil.Process], interval: float | None = None
) -> int:
"""
Get the CPU usage

On the first call, use a small interval so that initial CPU metrics can be collected.
"""
cpu_percent: int = 0
for process in processes:
with contextlib.suppress(Exception):
cpu_percent += process.cpu_percent()
cpu_percent += process.cpu_percent(interval=interval)

return cpu_percent

Expand Down
68 changes: 51 additions & 17 deletions simvue/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,14 +308,20 @@ def processes(self) -> list[psutil.Process]:
def _get_sysinfo(self) -> dict[str, typing.Any]:
"""Retrieve system administration

Parameters
----------
interval : float | None
The interval to use for collection of CPU metrics, by default None (non-blocking)

Returns
-------
dict[str, typing.Any]
retrieved system specifications
"""
cpu = get_process_cpu(self.processes)
memory = get_process_memory(self.processes)
gpu = get_gpu_metrics(self.processes)
processes = self.processes
cpu = get_process_cpu(processes, interval=0.1)
memory = get_process_memory(processes)
gpu = get_gpu_metrics(processes)
data: dict[str, typing.Any] = {}

if memory is not None and cpu is not None:
Expand Down Expand Up @@ -351,6 +357,9 @@ def _heartbeat(
last_heartbeat = time.time()
last_res_metric_call = time.time()

if self._resources_metrics_interval:
self._add_metrics_to_dispatch(self._get_sysinfo(), join_on_fail=False)

while not heartbeat_trigger.is_set():
time.sleep(0.1)

Expand Down Expand Up @@ -699,6 +708,7 @@ def init(
self._sv_obj.alerts = []
self._sv_obj.created = time.time()
self._sv_obj.notifications = notification
self._sv_obj._staging["folder_id"] = self._folder.id

if self._status == "running":
self._sv_obj.system = get_system()
Expand Down Expand Up @@ -931,7 +941,7 @@ def reconnect(self, run_id: str) -> bool:
self._status = "running"

self._id = run_id
self._sv_obj = RunObject(identifier=self._id)
self._sv_obj = RunObject(identifier=self._id, _read_only=False)
self._start(reconnect=True)

return True
Expand All @@ -947,6 +957,7 @@ def set_pid(self, pid: int) -> None:
PID of the process to be monitored
"""
self._pid = pid
self._parent_process = psutil.Process(self._pid)

@skip_if_failed("_aborted", "_suppress_errors", False)
@pydantic.validate_call
Expand Down Expand Up @@ -1602,15 +1613,13 @@ def set_folder_details(
return False

try:
self._folder.read_only(False)
if metadata:
self._folder.metadata = metadata
if tags:
self._folder.tags = tags
if description:
self._folder.description = description
self._folder.commit()
self._folder.read_only(True)
except (RuntimeError, ValueError, pydantic.ValidationError) as e:
self._error(f"Failed to update folder '{self._folder.name}' details: {e}")
return False
Expand Down Expand Up @@ -1918,16 +1927,21 @@ def create_user_alert(
@check_run_initialised
@pydantic.validate_call
def log_alert(
self, identifier: str, state: typing.Literal["ok", "critical"]
self,
identifier: str | None = None,
name: str | None = None,
state: typing.Literal["ok", "critical"] = "critical",
) -> bool:
"""Set the state of an alert
"""Set the state of an alert - either specify the alert by ID or name.

Parameters
----------
identifier : str
identifier of alert to update
identifier : str | None
ID of alert to update, by default None
name : str | None
Name of the alert to update, by default None
state : Literal['ok', 'critical']
state to set alert to
state to set alert to, by default 'critical'

Returns
-------
Expand All @@ -1938,13 +1952,33 @@ def log_alert(
self._error('state must be either "ok" or "critical"')
return False

if (identifier and name) or (not identifier and not name):
self._error("Please specify alert to update either by ID or by name.")
return False

if name:
try:
if alerts := Alert.get(offline=self._user_config.run.mode == "offline"):
identifier = next(
(id for id, alert in alerts if alert.name == name), None
)
else:
self._error("No existing alerts")
return False
except RuntimeError as e:
self._error(f"{e.args[0]}")
return False

if not identifier:
self._error(f"Alert with name '{name}' could not be found.")

_alert = UserAlert(identifier=identifier)
# if not isinstance(_alert, UserAlert):
# self._error(
# f"Cannot update state for alert '{identifier}' "
# f"of type '{_alert.__class__.__name__.lower()}'"
# )
# return False
if not isinstance(_alert, UserAlert):
self._error(
f"Cannot update state for alert '{identifier}' "
f"of type '{_alert.__class__.__name__.lower()}'"
)
return False
_alert.read_only(False)
_alert.set_status(run_id=self._id, status=state)
_alert.commit()
Expand Down
Loading