diff --git a/src/cloudai/_core/json_gen_strategy.py b/src/cloudai/_core/json_gen_strategy.py index 3fecb6432..81e1741b9 100644 --- a/src/cloudai/_core/json_gen_strategy.py +++ b/src/cloudai/_core/json_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -55,8 +55,13 @@ def sanitize_k8s_job_name(self, job_name: str) -> str: sanitized_name = job_name.lower() sanitized_name = re.sub(r"[^a-z0-9-]", "-", sanitized_name) sanitized_name = re.sub(r"^[^a-z0-9]+", "", sanitized_name) + sanitized_name = sanitized_name[:253] sanitized_name = re.sub(r"[^a-z0-9]+$", "", sanitized_name) - return sanitized_name[:253] + + if not sanitized_name: + raise ValueError(f"'{job_name}' cannot be sanitized to a valid Kubernetes job name.") + + return sanitized_name def store_test_run(self) -> None: from cloudai.models.scenario import TestRunDetails diff --git a/src/cloudai/systems/kubernetes/kubernetes_system.py b/src/cloudai/systems/kubernetes/kubernetes_system.py index 09bf94140..2a0c40e63 100644 --- a/src/cloudai/systems/kubernetes/kubernetes_system.py +++ b/src/cloudai/systems/kubernetes/kubernetes_system.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -431,13 +431,25 @@ def _delete_mpi_job(self, job_name: str) -> None: def _delete_batch_job(self, job_name: str) -> None: logging.debug(f"Deleting batch job '{job_name}'") - api_response = self.batch_v1.delete_namespaced_job( - name=job_name, - namespace=self.default_namespace, - body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5), - ) - api_response = cast("k8s.client.V1Job", api_response) + try: + api_response = self.batch_v1.delete_namespaced_job( + name=job_name, + namespace=self.default_namespace, + body=lazy.k8s.client.V1DeleteOptions(propagation_policy="Foreground", grace_period_seconds=5), + ) + except lazy.k8s.client.ApiException as e: + if e.status == 404: + logging.debug(f"Batch job '{job_name}' not found. It may have already been deleted.") + return + + logging.error( + f"An error occurred while attempting to delete batch job '{job_name}'. " + f"Error code: {e.status}. Message: {e.reason}. " + "Please verify the job name and Kubernetes API server." + ) + raise + api_response = cast("k8s.client.V1Status", api_response) logging.debug(f"Batch job '{job_name}' deleted with status: {api_response.status}") def _delete_dynamo_graph_deployment(self, job_name: str) -> None: @@ -662,7 +674,7 @@ def store_logs_for_job(self, job_name: str, output_dir: Path) -> None: """ pod_names = self.get_pod_names_for_job(job_name) if not pod_names: - logging.warning(f"No pods found for job '{job_name}'") + logging.debug(f"No pods found for job '{job_name}'") return output_dir.mkdir(parents=True, exist_ok=True) diff --git a/src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py b/src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py index fa737838f..1761ce1f9 100644 --- a/src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py +++ b/src/cloudai/workloads/sleep/kubernetes_json_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +33,10 @@ def gen_json(self) -> Dict[Any, Any]: job_spec = { "apiVersion": "batch/v1", "kind": "Job", - "metadata": {"name": self.test_run.name, "namespace": kubernetes_system.default_namespace}, + "metadata": { + "name": self.sanitize_k8s_job_name(self.test_run.name), + "namespace": kubernetes_system.default_namespace, + }, "spec": { "ttlSecondsAfterFinished": 0, "template": { diff --git a/tests/json_gen_strategy/test_common_kubernetes.py b/tests/json_gen_strategy/test_common_kubernetes.py new file mode 100644 index 000000000..703c602d9 --- /dev/null +++ b/tests/json_gen_strategy/test_common_kubernetes.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest + +from cloudai.core import JsonGenStrategy, TestRun +from cloudai.systems.kubernetes import KubernetesSystem + + +class MyJsonGenStrategy(JsonGenStrategy): + def gen_json(self) -> dict: + return {} + + +@pytest.mark.parametrize( + "tname,expected", + [ + ("simple-name", "simple-name"), + ("name_with_underscores", "name-with-underscores"), + ("name.with.dots", "name-with-dots"), + ("name@with#special$chars", "name-with-special-chars"), + ("NameWithUpperCase", "namewithuppercase"), + ("a" * 260, "a" * 253), + ("---leading-and-trailing---", "leading-and-trailing"), + ("a" * 250 + "-" * 3 + "b" * 10, "a" * 250), # ensure no trailing hyphens on truncation + ], +) +def test_job_name_sanitization(k8s_system: KubernetesSystem, base_tr: TestRun, tname: str, expected: str) -> None: + base_tr.name = tname + json_gen = MyJsonGenStrategy(k8s_system, base_tr) + assert json_gen.sanitize_k8s_job_name(base_tr.name) == expected + + +def test_job_name_sanitization_raises(k8s_system: KubernetesSystem, base_tr: TestRun) -> None: + base_tr.name = "!@#$%^&*()" + json_gen = MyJsonGenStrategy(k8s_system, base_tr) + with pytest.raises(ValueError): + json_gen.sanitize_k8s_job_name(base_tr.name) diff --git a/tests/json_gen_strategy/test_sleep_kubernetes_json_gen_strategy.py b/tests/json_gen_strategy/test_sleep_kubernetes_json_gen_strategy.py new file mode 100644 index 000000000..59d9e04a7 --- /dev/null +++ b/tests/json_gen_strategy/test_sleep_kubernetes_json_gen_strategy.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from cloudai.core import TestRun +from cloudai.systems.kubernetes import KubernetesSystem +from cloudai.workloads.sleep import SleepCmdArgs, SleepKubernetesJsonGenStrategy, SleepTestDefinition + + +def test_job_name_sanitization(k8s_system: KubernetesSystem) -> None: + tdef = SleepTestDefinition(name="name", description="desc", test_template_name="tt", cmd_args=SleepCmdArgs()) + tr = TestRun(name="t!e@st#-n$am%e^", test=tdef, nodes=["node1"], num_nodes=1) + json_gen = SleepKubernetesJsonGenStrategy(k8s_system, tr) + + assert json_gen.gen_json()["metadata"]["name"] == json_gen.sanitize_k8s_job_name(tr.name)