diff --git a/.gitignore b/.gitignore
index c30f242fd2..b15c95db2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,3 +104,5 @@ venv.bak/
 .mypy_cache/
 examples/scd_lvsegs.npz
 .idea/
+
+*~
diff --git a/examples/multi_gpu_test.ipynb b/examples/multi_gpu_test.ipynb
new file mode 100644
index 0000000000..8ceb30af67
--- /dev/null
+++ b/examples/multi_gpu_test.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MONAI version: 0.0.1\n",
+      "Python version: 3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]\n",
+      "Numpy version: 1.16.4\n",
+      "Pytorch version: 1.4.0\n",
+      "Ignite version: 0.3.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from torch.utils.data import DataLoader\n",
+    "import torchvision.transforms as transforms\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from ignite.engine import create_supervised_trainer\n",
+    "from ignite.engine.engine import Events\n",
+    "from ignite.handlers import ModelCheckpoint\n",
+    "\n",
+    "# assumes the framework is found here, change as necessary\n",
+    "sys.path.append(\"..\")\n",
+    "\n",
+    "from monai import application, data, networks, utils\n",
+    "\n",
+    "\n",
+    "application.config.print_config()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/localek10/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py:26: UserWarning: \n",
+      " There is an imbalance between your GPUs. You may want to exclude GPU 1 which\n",
+      " has less than 75% of the memory or cores of GPU 0. You can do so by setting\n",
+      " the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES\n",
+      " environment variable.\n",
+      " warnings.warn(imbalance_warn.format(device_ids[min_pos], device_ids[max_pos]))\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "State:\n",
+       "\titeration: 4\n",
+       "\tepoch: 2\n",
+       "\tepoch_length: 2\n",
+       "\tmax_epochs: 2\n",
+       "\toutput: 20912.578125\n",
+       "\tbatch: \n",
+       "\tmetrics: \n",
+       "\tdataloader: \n",
+       "\tseed: 12"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "lr = 1e-3\n",
+    "\n",
+    "net = networks.nets.UNet(\n",
+    "    dimensions=2,\n",
+    "    in_channels=1,\n",
+    "    num_classes=1,\n",
+    "    channels=(16, 32, 64, 128, 256),\n",
+    "    strides=(2, 2, 2, 2),\n",
+    "    num_res_units=2,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def fake_loss(y_pred,y):\n",
+    "    return (y_pred[0]+y).sum()\n",
+    "\n",
+    "\n",
+    "def fake_data_stream():\n",
+    "    while True:\n",
+    "        yield torch.rand((10,1,64,64)),torch.rand((10,1,64,64))\n",
+    " \n",
+    " \n",
+    "# 1 GPU\n",
+    "opt = torch.optim.Adam(net.parameters(), lr)\n",
+    "trainer=application.engine.create_multigpu_supervised_trainer(net,opt,fake_loss,[torch.device('cuda:0')])\n",
+    "trainer.run(fake_data_stream(),2,2)\n",
+    "\n",
+    "# all GPUs\n",
+    "opt = torch.optim.Adam(net.parameters(), lr)\n",
+    "trainer=application.engine.create_multigpu_supervised_trainer(net,opt,fake_loss,None)\n",
+    "trainer.run(fake_data_stream(),2,2)\n",
+    "\n",
+    "# CPU\n",
+    "opt = torch.optim.Adam(net.parameters(), lr)\n",
+    "trainer=application.engine.create_multigpu_supervised_trainer(net,opt,fake_loss,[])\n",
+    "trainer.run(fake_data_stream(),2,2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/unet_segmentation_3d.ipynb b/examples/unet_segmentation_3d.ipynb
index 81ee6ce8b2..6af33fe27b 100644
--- a/examples/unet_segmentation_3d.ipynb
+++ b/examples/unet_segmentation_3d.ipynb
@@ -24,7 +24,6 @@
     "import sys\n",
     "import tempfile\n",
     "from glob import glob\n",
-    "from functools import partial\n",
     "\n",
     "import torch\n",
     "import torch.nn as nn\n",
diff --git a/monai/application/engine/multi_gpu_supervised_trainer.py b/monai/application/engine/multi_gpu_supervised_trainer.py
new file mode 100644
index 0000000000..51e7d4ca7c
--- /dev/null
+++ b/monai/application/engine/multi_gpu_supervised_trainer.py
@@ -0,0 +1,120 @@
+# Copyright 2020 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+
+from ignite.engine import create_supervised_trainer, create_supervised_evaluator, _prepare_batch
+
+import monai
+
+
+def get_devices_spec(devices=None):
+    """
+    Get a valid specification for one or more devices. If `devices` is None get devices for all CUDA devices available.
+    If `devices` is a zero-length structure a single CPU compute device is returned. In any other cases `devices` is
+    returned unchanged.
+
+    Args:
+        devices (list, optional): list of devices to request, None for all GPU devices, [] for CPU.
+
+    Returns:
+        list of torch.device: list of devices.
+
+    Raises:
+        ValueError: if `devices` is None and no CUDA devices are available.
+    """
+    if devices is None:
+        devices = [torch.device('cuda:%i' % d) for d in range(torch.cuda.device_count())]
+
+        if len(devices) == 0:
+            raise ValueError("No GPU devices available")
+
+    elif len(devices) == 0:
+        devices = [torch.device("cpu")]
+
+    return devices
+
+
+def _default_transform(x, y, y_pred, loss):
+    return loss.item()
+
+
+def _default_eval_transform(x, y, y_pred):
+    return y_pred, y
+
+
+@monai.utils.export("monai.application.engine")
+def create_multigpu_supervised_trainer(net, optimizer, loss_fn, devices=None, non_blocking=False,
+                                       prepare_batch=_prepare_batch, output_transform=_default_transform):
+    """
+    ***Derived from `create_supervised_trainer` in Ignite.
+
+    Factory function for creating a trainer for supervised models.
+    Args:
+        net (`torch.nn.Module`): the network to train.
+        optimizer (`torch.optim.Optimizer`): the optimizer to use.
+        loss_fn (torch.nn loss function): the loss function to use.
+        devices (list, optional): device(s) type specification (default: None).
+            Applies to both model and batches. None is all devices used, empty list is CPU only.
+        non_blocking (bool, optional): if True and this copy is between CPU and GPU, the copy may occur asynchronously
+            with respect to the host. For other cases, this argument has no effect.
+        prepare_batch (callable, optional): function that receives `batch`, `device`, `non_blocking` and outputs
+            tuple of tensors `(batch_x, batch_y)`.
+        output_transform (callable, optional): function that receives 'x', 'y', 'y_pred', 'loss' and returns value
+            to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`.
+    Note: `engine.state.output` for this engine is defined by `output_transform` parameter and is the loss
+        of the processed batch by default.
+    Returns:
+        Engine: a trainer engine with supervised update function.
+    """
+
+    devices = get_devices_spec(devices)
+
+    if len(devices) > 1:
+        # replicate only across the requested devices rather than every visible GPU
+        net = torch.nn.parallel.DataParallel(net, device_ids=devices)
+
+    return create_supervised_trainer(net, optimizer, loss_fn, devices[0], non_blocking, prepare_batch, output_transform)
+
+
+@monai.utils.export("monai.application.engine")
+def create_multigpu_supervised_evaluator(net, metrics=None, devices=None, non_blocking=False,
+                                         prepare_batch=_prepare_batch, output_transform=_default_eval_transform):
+    """
+    ***Derived from `create_supervised_evaluator` in Ignite.
+
+    Factory function for creating an evaluator for supervised models.
+    Args:
+        net (`torch.nn.Module`): the model to evaluate.
+        metrics (dict of str - :class:`~ignite.metrics.Metric`): a map of metric names to Metrics.
+        devices (list, optional): device(s) type specification (default: None).
+            Applies to both model and batches. None is all devices used, empty list is CPU only.
+        non_blocking (bool, optional): if True and this copy is between CPU and GPU, the copy may occur asynchronously
+            with respect to the host. For other cases, this argument has no effect.
+        prepare_batch (callable, optional): function that receives `batch`, `device`, `non_blocking` and outputs
+            tuple of tensors `(batch_x, batch_y)`.
+        output_transform (callable, optional): function that receives 'x', 'y', 'y_pred' and returns value
+            to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits
+            output expected by metrics. If you change it you should use `output_transform` in metrics.
+    Note: `engine.state.output` for this engine is defined by `output_transform` parameter and is
+        a tuple of `(batch_pred, batch_y)` by default.
+    Returns:
+        Engine: an evaluator engine with supervised inference function.
+    """
+
+    devices = get_devices_spec(devices)
+
+    if len(devices) > 1:
+        # replicate only across the requested devices rather than every visible GPU
+        net = torch.nn.parallel.DataParallel(net, device_ids=devices)
+
+    return create_supervised_evaluator(net, metrics, devices[0], non_blocking, prepare_batch, output_transform)
diff --git a/tests/test_parallel_execution.py b/tests/test_parallel_execution.py
new file mode 100644
index 0000000000..0ef0dccd2a
--- /dev/null
+++ b/tests/test_parallel_execution.py
@@ -0,0 +1,63 @@
+# Copyright 2020 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import warnings
+
+import torch
+
+from monai.application.engine import create_multigpu_supervised_trainer
+
+
+def fake_loss(y_pred, y):
+    return (y_pred[0] + y).sum()
+
+
+def fake_data_stream():
+    while True:
+        yield torch.rand((10, 1, 64, 64)), torch.rand((10, 1, 64, 64))
+
+def expect_failure_if_no_gpu(test):
+    if not torch.cuda.is_available():
+        return unittest.expectedFailure(test)
+    else:
+        return test
+
+
+class TestParallelExecution(unittest.TestCase):
+    """
+    Tests single GPU, multi GPU, and CPU execution with the Ignite supervised trainer.
+    """
+
+    @expect_failure_if_no_gpu
+    def test_single_gpu(self):
+        net = torch.nn.Conv2d(1, 1, 3, padding=1)
+        opt = torch.optim.Adam(net.parameters(), 1e-3)
+        trainer = create_multigpu_supervised_trainer(net, opt, fake_loss, [torch.device("cuda:0")])
+        trainer.run(fake_data_stream(), 2, 2)
+
+    @expect_failure_if_no_gpu
+    def test_multi_gpu(self):
+        net = torch.nn.Conv2d(1, 1, 3, padding=1)
+        opt = torch.optim.Adam(net.parameters(), 1e-3)
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # ignore warnings about imbalanced GPU memory
+
+            trainer = create_multigpu_supervised_trainer(net, opt, fake_loss, None)
+
+        trainer.run(fake_data_stream(), 2, 2)
+
+    def test_cpu(self):
+        net = torch.nn.Conv2d(1, 1, 3, padding=1)
+        opt = torch.optim.Adam(net.parameters(), 1e-3)
+        trainer = create_multigpu_supervised_trainer(net, opt, fake_loss, [])
+        trainer.run(fake_data_stream(), 2, 2)