From ea908dae5e330d371d91b07797428fc42a422e1e Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Sun, 14 Jul 2024 19:26:53 +0800 Subject: [PATCH 1/3] try to fix experiment already exist issue Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/handlers/mlflow_handler.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index df209c1c8b..ed2c028503 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -29,6 +29,7 @@ mlflow.entities, _ = optional_import( "mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler." ) +MlflowException, _ = optional_import("mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler.") pandas, _ = optional_import("pandas", descriptor="Please install pandas for recording the dataset.") tqdm, _ = optional_import("tqdm", "4.47.0", min_version, "tqdm") @@ -236,15 +237,26 @@ def start(self, engine: Engine) -> None: def _set_experiment(self): experiment = self.experiment if not experiment: - experiment = self.client.get_experiment_by_name(self.experiment_name) - if not experiment: - experiment_id = self.client.create_experiment(self.experiment_name) - experiment = self.client.get_experiment(experiment_id) + for _ in range(3): + try: + experiment = self.client.get_experiment_by_name(self.experiment_name) + if not experiment: + experiment_id = self.client.create_experiment(self.experiment_name) + experiment = self.client.get_experiment(experiment_id) + break + except MlflowException as e: + if "RESOURCE_ALREADY_EXISTS" in str(e): + time.sleep(1) + continue + else: + raise e if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE: raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment") self.experiment = experiment + + @staticmethod def _get_pandas_dataset_info(pandas_dataset): dataset_name = pandas_dataset.name From 9503f47035e15d868c938d501d5f1ef70c42101d Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:09:36 +0800 Subject: [PATCH 2/3] add warning Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/handlers/mlflow_handler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index ed2c028503..778e342fb5 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -21,6 +21,7 @@ import torch from torch.utils.data import Dataset +from monai.apps.utils import get_logger from monai.config import IgniteInfo from monai.utils import CommonKeys, ensure_tuple, min_version, optional_import @@ -29,7 +30,9 @@ mlflow.entities, _ = optional_import( "mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler." ) -MlflowException, _ = optional_import("mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler.") +MlflowException, _ = optional_import( + "mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler." +) pandas, _ = optional_import("pandas", descriptor="Please install pandas for recording the dataset.") tqdm, _ = optional_import("tqdm", "4.47.0", min_version, "tqdm") @@ -42,6 +45,8 @@ DEFAULT_TAG = "Loss" +logger = get_logger(module_name=__name__) + class MLFlowHandler: """ @@ -246,6 +251,7 @@ def _set_experiment(self): break except MlflowException as e: if "RESOURCE_ALREADY_EXISTS" in str(e): + logger.warning("Experiment already exists; delaying before retrying.") time.sleep(1) continue else: @@ -255,8 +261,6 @@ def _set_experiment(self): raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment") self.experiment = experiment - - @staticmethod def _get_pandas_dataset_info(pandas_dataset): dataset_name = pandas_dataset.name From 4fb7f0bcb31ec23c2f1effbe449ee44da9a3e063 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 17 Jul 2024 13:26:36 +0800 Subject: [PATCH 3/3] address comments Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/handlers/mlflow_handler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 778e342fb5..6d19579d9e 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -242,7 +242,7 @@ def start(self, engine: Engine) -> None: def _set_experiment(self): experiment = self.experiment if not experiment: - for _ in range(3): + for _retry_time in range(3): try: experiment = self.client.get_experiment_by_name(self.experiment_name) if not experiment: @@ -253,7 +253,8 @@ def _set_experiment(self): if "RESOURCE_ALREADY_EXISTS" in str(e): logger.warning("Experiment already exists; delaying before retrying.") time.sleep(1) - continue + if _retry_time == 2: + raise e else: raise e