diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index df209c1c8b..6d19579d9e 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -21,6 +21,7 @@ import torch from torch.utils.data import Dataset +from monai.apps.utils import get_logger from monai.config import IgniteInfo from monai.utils import CommonKeys, ensure_tuple, min_version, optional_import @@ -29,6 +30,9 @@ mlflow.entities, _ = optional_import( "mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler." ) +MlflowException, _ = optional_import( + "mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler." +) pandas, _ = optional_import("pandas", descriptor="Please install pandas for recording the dataset.") tqdm, _ = optional_import("tqdm", "4.47.0", min_version, "tqdm") @@ -41,6 +45,8 @@ DEFAULT_TAG = "Loss" +logger = get_logger(module_name=__name__) + class MLFlowHandler: """ @@ -236,10 +242,21 @@ def start(self, engine: Engine) -> None: def _set_experiment(self): experiment = self.experiment if not experiment: - experiment = self.client.get_experiment_by_name(self.experiment_name) - if not experiment: - experiment_id = self.client.create_experiment(self.experiment_name) - experiment = self.client.get_experiment(experiment_id) + for _retry_time in range(3): + try: + experiment = self.client.get_experiment_by_name(self.experiment_name) + if not experiment: + experiment_id = self.client.create_experiment(self.experiment_name) + experiment = self.client.get_experiment(experiment_id) + break + except MlflowException as e: + if "RESOURCE_ALREADY_EXISTS" in str(e): + logger.warning("Experiment already exists; delaying before retrying.") + time.sleep(1) + if _retry_time == 2: + raise e + else: + raise e if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE: raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment")