From ee53aa468532066cefbecc2001b0f5efe784bb64 Mon Sep 17 00:00:00 2001 From: binliu Date: Thu, 27 Apr 2023 10:12:16 +0000 Subject: [PATCH 01/14] add the finish statement check for the mlflow run Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 1 + tests/test_handler_mlflow.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index e49a2e967e..541892cf1e 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -191,6 +191,7 @@ def start(self, engine: Engine) -> None: run_name = f"run_{time.strftime('%Y%m%d_%H%M%S')}" if self.run_name is None else self.run_name runs = self.client.search_runs(self.experiment.experiment_id) runs = [r for r in runs if r.info.run_name == run_name or not self.run_name] + runs = [r for r in runs if r.info.status != "FINISHED"] if runs: self.cur_run = self.client.get_run(runs[-1].info.run_id) # pick latest active run else: diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index d9474a9a72..2675d8942a 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -68,6 +68,36 @@ def tearDown(self): if tmpdir and os.path.exists(tmpdir): shutil.rmtree(tmpdir) + def test_multi_run(self): + with tempfile.TemporaryDirectory() as tempdir: + # set up engine + def _train_func(engine, batch): + return [batch + 1.0] + run_id_list = [] + for _ in range(2): + engine = Engine(_train_func) + @engine.on(Events.EPOCH_COMPLETED) + def _update_metric(engine): + current_metric = engine.state.metrics.get("acc", 0.1) + engine.state.metrics["acc"] = current_metric + 0.1 + engine.state.test = current_metric + # set up testing handler + test_path = os.path.join(tempdir, "mlflow_test") + handler = MLFlowHandler( + iteration_log=False, + epoch_log=True, + tracking_uri=path_to_uri(test_path), + state_attributes=["test"], + close_on_complete=True, + ) + handler.attach(engine) + engine.run(range(3), max_epochs=2) + cur_run = handler.client.search_runs(handler.experiment.experiment_id)[0] + run_id_list.append(cur_run.info.run_id) + handler.close() + # check logging output + self.assertTrue(run_id_list[0] != run_id_list[1]) + def test_metrics_track(self): experiment_param = {"backbone": "efficientnet_b0"} with tempfile.TemporaryDirectory() as tempdir: From 89ceb39474db045bf147ca8f50b73f242b258c21 Mon Sep 17 00:00:00 2001 From: binliu Date: Thu, 27 Apr 2023 10:13:36 +0000 Subject: [PATCH 02/14] fix the format issue Signed-off-by: binliu --- tests/test_handler_mlflow.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index 2675d8942a..d54b628a8e 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -73,14 +73,17 @@ def test_multi_run(self): # set up engine def _train_func(engine, batch): return [batch + 1.0] + run_id_list = [] for _ in range(2): engine = Engine(_train_func) + @engine.on(Events.EPOCH_COMPLETED) def _update_metric(engine): current_metric = engine.state.metrics.get("acc", 0.1) engine.state.metrics["acc"] = current_metric + 0.1 engine.state.test = current_metric + # set up testing handler test_path = os.path.join(tempdir, "mlflow_test") handler = MLFlowHandler( From 78473664bd5ec3309585e6a2edc4d23190e9b9ac Mon Sep 17 00:00:00 2001 From: binliu Date: Thu, 27 Apr 2023 14:10:08 +0000 Subject: [PATCH 03/14] update the doc-string and finish status Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 18 ++++++++++-------- tests/test_handler_mlflow.py | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 541892cf1e..61b08574b8 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -76,7 +76,7 @@ class MLFlowHandler: The default behavior is to track loss from output[0] as output is a decollated list and we replicated loss value for every item of the decollated list. `engine.state` and `output_transform` inherit from the ignite concept: - https://pytorch.org/ignite/concepts.html#state, explanation and usage example are in the tutorial: + https://pytorch-ignite.ai/concepts/03-state/, explanation and usage example are in the tutorial: https://github.com/Project-MONAI/tutorials/blob/master/modules/batch_output_transform.ipynb. global_epoch_transform: a callable that is used to customize global epoch number. For example, in evaluation, the evaluator engine might want to track synced epoch number @@ -84,13 +84,13 @@ class MLFlowHandler: state_attributes: expected attributes from `engine.state`, if provided, will extract them when epoch completed. tag_name: when iteration output is a scalar, `tag_name` is used to track, defaults to `'Loss'`. - experiment_name: name for an experiment, defaults to `default_experiment`. - run_name: name for run in an experiment. - experiment_param: a dict recording parameters which will not change through whole experiment, + experiment_name: a name for an experiment, defaults to `monai_experiment`. + run_name: a name for a run in an experiment. + experiment_param: a dict recording parameters which will not change through the whole workflow, like torch version, cuda version and so on. - artifacts: paths to images that need to be recorded after a whole run. - optimizer_param_names: parameters' name in optimizer that need to be record during running, - defaults to "lr". + artifacts: paths to images that need to be recorded after running the workflow. + optimizer_param_names: parameter names in the optimizer that need to be recorded during running, + default to "lr". close_on_complete: whether to close the mlflow run in `complete` phase in workflow, default to False. For more details of MLFlow usage, please refer to: https://mlflow.org/docs/latest/index.html. @@ -99,6 +99,7 @@ class MLFlowHandler: # parameters that are logged at the start of training default_tracking_params = ["max_epochs", "epoch_length"] + finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) def __init__( self, @@ -191,7 +192,8 @@ def start(self, engine: Engine) -> None: run_name = f"run_{time.strftime('%Y%m%d_%H%M%S')}" if self.run_name is None else self.run_name runs = self.client.search_runs(self.experiment.experiment_id) runs = [r for r in runs if r.info.run_name == run_name or not self.run_name] - runs = [r for r in runs if r.info.status != "FINISHED"] + # runs marked as finish should not record info any more + runs = [r for r in runs if r.info.status != self.finish_status] if runs: self.cur_run = self.client.get_run(runs[-1].info.run_id) # pick latest active run else: diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index d54b628a8e..fb786b806b 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -99,7 +99,7 @@ def _update_metric(engine): run_id_list.append(cur_run.info.run_id) handler.close() # check logging output - self.assertTrue(run_id_list[0] != run_id_list[1]) + self.assertNotEqual(run_id_list[0], run_id_list[1]) def test_metrics_track(self): experiment_param = {"backbone": "efficientnet_b0"} From 75d06392d1c3bf843e7ae1821082bf4e6b5a9a18 Mon Sep 17 00:00:00 2001 From: binliu Date: Thu, 27 Apr 2023 14:18:54 +0000 Subject: [PATCH 04/14] update the name of run finish status Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 61b08574b8..f08c0ce3cf 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -99,7 +99,7 @@ class MLFlowHandler: # parameters that are logged at the start of training default_tracking_params = ["max_epochs", "epoch_length"] - finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) + run_finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) def __init__( self, @@ -193,7 +193,7 @@ def start(self, engine: Engine) -> None: runs = self.client.search_runs(self.experiment.experiment_id) runs = [r for r in runs if r.info.run_name == run_name or not self.run_name] # runs marked as finish should not record info any more - runs = [r for r in runs if r.info.status != self.finish_status] + runs = [r for r in runs if r.info.status != self.run_finish_status] if runs: self.cur_run = self.client.get_run(runs[-1].info.run_id) # pick latest active run else: From fd6dd940a6593be6c027ab88ec4a8f6b001d8bcb Mon Sep 17 00:00:00 2001 From: binliu Date: Thu, 27 Apr 2023 14:27:22 +0000 Subject: [PATCH 05/14] update the close function with run finish status Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index f08c0ce3cf..593f792330 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -99,6 +99,7 @@ class MLFlowHandler: # parameters that are logged at the start of training default_tracking_params = ["max_epochs", "epoch_length"] + # the finish status of a mlflow run run_finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) def __init__( @@ -267,8 +268,7 @@ def close(self) -> None: """ if self.cur_run: - status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) - self.client.set_terminated(self.cur_run.info.run_id, status) + self.client.set_terminated(self.cur_run.info.run_id, self.run_finish_status) self.cur_run = None def epoch_completed(self, engine: Engine) -> None: From e9d6c2aed9f1e66c81de138b796a7b32b9dccf02 Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 02:38:23 +0000 Subject: [PATCH 06/14] update the comment in the unit test file of mlflow handler Signed-off-by: binliu --- tests/test_handler_mlflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index fb786b806b..22c8c87651 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -70,7 +70,7 @@ def tearDown(self): def test_multi_run(self): with tempfile.TemporaryDirectory() as tempdir: - # set up engine + # set up the train function for engine def _train_func(engine, batch): return [batch + 1.0] @@ -98,7 +98,7 @@ def _update_metric(engine): cur_run = handler.client.search_runs(handler.experiment.experiment_id)[0] run_id_list.append(cur_run.info.run_id) handler.close() - # check logging output + # check the two runs are different self.assertNotEqual(run_id_list[0], run_id_list[1]) def test_metrics_track(self): From b94c89c6af8b8a0f6ec79b734c14df7414853518 Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 03:19:18 +0000 Subject: [PATCH 07/14] add the no mlflow warning Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 593f792330..773074f112 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -23,7 +23,9 @@ from monai.utils import ensure_tuple, min_version, optional_import Events, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Events") -mlflow, _ = optional_import("mlflow") +mlflow, _ = optional_import( + "mlflow", descriptor="MLFlow is not installed. Please install it before using MLFlowHandler." +) mlflow.entities, _ = optional_import("mlflow.entities") if TYPE_CHECKING: @@ -99,8 +101,6 @@ class MLFlowHandler: # parameters that are logged at the start of training default_tracking_params = ["max_epochs", "epoch_length"] - # the finish status of a mlflow run - run_finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) def __init__( self, @@ -134,6 +134,7 @@ def __init__( self.artifacts = ensure_tuple(artifacts) self.optimizer_param_names = ensure_tuple(optimizer_param_names) self.client = mlflow.MlflowClient(tracking_uri=tracking_uri if tracking_uri else None) + self.run_finish_status = mlflow.entities.RunStatus.to_string(mlflow.entities.RunStatus.FINISHED) self.close_on_complete = close_on_complete self.experiment = None self.cur_run = None From 7a41bce0d7bc5e2d15e1ff2366baa3d8040d2091 Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 03:36:07 +0000 Subject: [PATCH 08/14] update the descriptor of optional import mlflow and mlflow.entities Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 773074f112..772819ca1d 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -23,10 +23,10 @@ from monai.utils import ensure_tuple, min_version, optional_import Events, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Events") -mlflow, _ = optional_import( - "mlflow", descriptor="MLFlow is not installed. Please install it before using MLFlowHandler." +mlflow, _ = optional_import("mlflow", descriptor="Please install mlflow before using MLFlowHandler.") +mlflow.entities, _ = optional_import( + "mlflow.entities", descriptor="Please install mlflow.entities before using MLFLowHandler." ) -mlflow.entities, _ = optional_import("mlflow.entities") if TYPE_CHECKING: from ignite.engine import Engine From 01dc94ae2c90ab70e3d4cc8c9cd11fa9631df3f0 Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 03:39:08 +0000 Subject: [PATCH 09/14] update the descriptor of mlflow and mlflow.entities optional import Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 772819ca1d..d2f6fc4d72 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -25,7 +25,7 @@ Events, _ = optional_import("ignite.engine", IgniteInfo.OPT_IMPORT_VERSION, min_version, "Events") mlflow, _ = optional_import("mlflow", descriptor="Please install mlflow before using MLFlowHandler.") mlflow.entities, _ = optional_import( - "mlflow.entities", descriptor="Please install mlflow.entities before using MLFLowHandler." + "mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler." ) if TYPE_CHECKING: From 1aecf49805e1de91832d329a56cd485a0412f7bd Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 04:18:35 +0000 Subject: [PATCH 10/14] update the unit test of mlflow handler Signed-off-by: binliu --- tests/test_handler_mlflow.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index 22c8c87651..f55ea56087 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -74,8 +74,8 @@ def test_multi_run(self): def _train_func(engine, batch): return [batch + 1.0] - run_id_list = [] - for _ in range(2): + create_engine_times = 3 + for _ in range(create_engine_times): engine = Engine(_train_func) @engine.on(Events.EPOCH_COMPLETED) @@ -95,11 +95,10 @@ def _update_metric(engine): ) handler.attach(engine) engine.run(range(3), max_epochs=2) - cur_run = handler.client.search_runs(handler.experiment.experiment_id)[0] - run_id_list.append(cur_run.info.run_id) + run_cnt = len(handler.client.search_runs(handler.experiment.experiment_id)) handler.close() - # check the two runs are different - self.assertNotEqual(run_id_list[0], run_id_list[1]) + # the run count should equal to the times of creating engine + self.assertEqual(create_engine_times, run_cnt) def test_metrics_track(self): experiment_param = {"backbone": "efficientnet_b0"} From 92347b14f237c02e06947462d7108636e0559c29 Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 04:23:24 +0000 Subject: [PATCH 11/14] update the comment in mlflow handler unit test Signed-off-by: binliu --- tests/test_handler_mlflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_handler_mlflow.py b/tests/test_handler_mlflow.py index f55ea56087..f09f9b93d5 100644 --- a/tests/test_handler_mlflow.py +++ b/tests/test_handler_mlflow.py @@ -74,6 +74,7 @@ def test_multi_run(self): def _train_func(engine, batch): return [batch + 1.0] + # create and run an engine several times to get several runs create_engine_times = 3 for _ in range(create_engine_times): engine = Engine(_train_func) From 911a54de4720bb7c92718345412aa7394fc4a8ad Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 07:30:05 +0000 Subject: [PATCH 12/14] update the doc-string of the mlflow handler Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index d2f6fc4d72..308e6e5c27 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -86,8 +86,10 @@ class MLFlowHandler: state_attributes: expected attributes from `engine.state`, if provided, will extract them when epoch completed. tag_name: when iteration output is a scalar, `tag_name` is used to track, defaults to `'Loss'`. - experiment_name: a name for an experiment, defaults to `monai_experiment`. - run_name: a name for a run in an experiment. + experiment_name: the experiment name of MLflow, defaults to `monai_experiment`. An experiment can be + used to record several runs. + run_name: the run name in an experiment. A run can be used to record information about a workflow, + like the loss, metrics and so on. experiment_param: a dict recording parameters which will not change through the whole workflow, like torch version, cuda version and so on. artifacts: paths to images that need to be recorded after running the workflow. From ec6569460a40ef6ced8872b3f535e9c9e5e7629b Mon Sep 17 00:00:00 2001 From: binliu Date: Fri, 28 Apr 2023 07:51:49 +0000 Subject: [PATCH 13/14] fix some typos Signed-off-by: binliu --- monai/handlers/mlflow_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monai/handlers/mlflow_handler.py b/monai/handlers/mlflow_handler.py index 308e6e5c27..20add9b11f 100644 --- a/monai/handlers/mlflow_handler.py +++ b/monai/handlers/mlflow_handler.py @@ -86,15 +86,15 @@ class MLFlowHandler: state_attributes: expected attributes from `engine.state`, if provided, will extract them when epoch completed. tag_name: when iteration output is a scalar, `tag_name` is used to track, defaults to `'Loss'`. - experiment_name: the experiment name of MLflow, defaults to `monai_experiment`. An experiment can be + experiment_name: the experiment name of MLflow, default to `'monai_experiment'`. An experiment can be used to record several runs. run_name: the run name in an experiment. A run can be used to record information about a workflow, like the loss, metrics and so on. experiment_param: a dict recording parameters which will not change through the whole workflow, like torch version, cuda version and so on. artifacts: paths to images that need to be recorded after running the workflow. - optimizer_param_names: parameter names in the optimizer that need to be recorded during running, - default to "lr". + optimizer_param_names: parameter names in the optimizer that need to be recorded during running the + workflow, default to `'lr'`. close_on_complete: whether to close the mlflow run in `complete` phase in workflow, default to False. For more details of MLFlow usage, please refer to: https://mlflow.org/docs/latest/index.html. From b0a9adb1206a0ee5044706459eb15caa2beed490 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Sat, 29 Apr 2023 09:41:24 +0100 Subject: [PATCH 14/14] fixes #6449 Signed-off-by: Wenqi Li --- tests/test_auto3dseg_bundlegen.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/test_auto3dseg_bundlegen.py b/tests/test_auto3dseg_bundlegen.py index ba5d81e3cc..99e4ff44eb 100644 --- a/tests/test_auto3dseg_bundlegen.py +++ b/tests/test_auto3dseg_bundlegen.py @@ -25,7 +25,13 @@ from monai.bundle.config_parser import ConfigParser from monai.data import create_test_image_3d from monai.utils import set_determinism -from tests.utils import get_testing_algo_template_path, skip_if_downloading_fails, skip_if_no_cuda, skip_if_quick +from tests.utils import ( + SkipIfBeforePyTorchVersion, + get_testing_algo_template_path, + skip_if_downloading_fails, + skip_if_no_cuda, + skip_if_quick, +) num_images_perfold = max(torch.cuda.device_count(), 4) num_images_per_batch = 2 @@ -97,6 +103,7 @@ def run_auto3dseg_before_bundlegen(test_path, work_dir): @skip_if_no_cuda +@SkipIfBeforePyTorchVersion((1, 11, 1)) @skip_if_quick class TestBundleGen(unittest.TestCase): def setUp(self) -> None: