From 4ed074b760dc0511eb8cda441705fd1e2fb2ff85 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Wed, 19 Aug 2020 20:49:04 -0700 Subject: [PATCH 01/12] initial commit, dummy training loop, pure pytorch but not DDP --- benchmarks/oss.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 benchmarks/oss.py diff --git a/benchmarks/oss.py b/benchmarks/oss.py new file mode 100755 index 000000000..6b4a648d1 --- /dev/null +++ b/benchmarks/oss.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. + + +from torch.utils.data import DataLoader +from torchvision.datasets import FakeData +from torchvision.models import resnet50 +import torch.nn as nn +import torch +from typing import List, Any +from torchvision.transforms import ToTensor + + +def train(num_epochs: int = 10, batch_size: int = 64, device: torch.device = torch.device("cuda")): + # Standard RN50 + model = resnet50(pretrained=False, progress=True) + print("Benchmarking model: ", model) + + # Data setup, dummy data + def collate(inputs: List[Any]): + return { + "inputs": torch.stack([i[0] for i in inputs]).to(device), + "label": torch.stack([i[1] for i in inputs]).to(device), + } + + dataloader = DataLoader(dataset=FakeData(transform=ToTensor(), size=200), batch_size=batch_size, collate_fn=collate) + loss_fn = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4) + + # Dummy training loop + model.to(device) + model.train() + for epoch in range(num_epochs): + print(f"Epoch {epoch}") + for batch in dataloader: + + def closure(): + model.zero_grad() + outputs = model(batch["inputs"]) + loss = loss_fn(outputs, batch["label"]) + loss.backward() + print(f"dummy loss {loss.item()}") + return loss + + optimizer.step(closure) + + +if __name__ == "__main__": + # TODO: move all this to pytorch DDP + train() From a16728948e34dcbe82512fe47186d119383d5d64 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Wed, 19 Aug 2020 20:57:18 -0700 Subject: [PATCH 02/12] probably slightly broken, but rough DDP benchmark run --- .circleci/config.yml | 9 +++++++++ benchmarks/oss.py | 28 ++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 00f151d9e..2925c2b5c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -96,6 +96,12 @@ run_transformer_benchmark: &run_transformer_benchmark command: | python benchmarks/transformer.py +run_oss_benchmark: &run_oss_benchmark + - run: + name: Run OSS Benchmark + command: | + python benchmarks/oss.py + # ------------------------------------------------------------------------------------- # Jobs to run # ------------------------------------------------------------------------------------- @@ -244,6 +250,9 @@ jobs: - <<: *run_transformer_benchmark + - <<: *run_oss_benchmark + + workflows: version: 2 diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 6b4a648d1..ac408ae4e 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -8,9 +8,27 @@ import torch from typing import List, Any from torchvision.transforms import ToTensor +from fairscale.optim.oss import OSS +import torch.distributed as dist +import torch.multiprocessing as mp +import os -def train(num_epochs: int = 10, batch_size: int = 64, device: torch.device = torch.device("cuda")): +BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore + + +def dist_init(rank, world_size): + os.environ["MASTER_ADDR"] 
= "localhost" + os.environ["MASTER_PORT"] = "29501" + dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) + + +def train( + rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 64, device: torch.device = torch.device("cuda") +): + # DDP + dist_init(rank, world_size) + # Standard RN50 model = resnet50(pretrained=False, progress=True) print("Benchmarking model: ", model) @@ -24,7 +42,9 @@ def collate(inputs: List[Any]): dataloader = DataLoader(dataset=FakeData(transform=ToTensor(), size=200), batch_size=batch_size, collate_fn=collate) loss_fn = nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4) + + # Shard the optimizer + optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4) # Dummy training loop model.to(device) @@ -45,5 +65,5 @@ def closure(): if __name__ == "__main__": - # TODO: move all this to pytorch DDP - train() + world_size = 2 + mp.spawn(train, args=(world_size,), nprocs=world_size, join=True) From 20b981de33b9a2e374a7927ac45dd5e4903210ca Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Wed, 19 Aug 2020 21:16:27 -0700 Subject: [PATCH 03/12] adding the torchvision requirement for testing --- benchmarks/oss.py | 16 +++++++++------- requirements-test.txt | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index ac408ae4e..79529077d 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -1,18 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +import os +from typing import Any, List + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn from torch.utils.data import DataLoader from torchvision.datasets import FakeData from torchvision.models import resnet50 -import torch.nn as nn -import torch -from typing import List, Any from torchvision.transforms import ToTensor -from fairscale.optim.oss import OSS -import torch.distributed as dist -import torch.multiprocessing as mp -import os +from fairscale.optim.oss import OSS BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore @@ -65,5 +66,6 @@ def closure(): if __name__ == "__main__": + # TODO: really use DDP, not multiprocessing world_size = 2 mp.spawn(train, args=(world_size,), nprocs=world_size, join=True) diff --git a/requirements-test.txt b/requirements-test.txt index 66e3f1ba2..e4e8f68cf 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,5 +6,6 @@ pytest == 5.4.1 pytest-cov == 2.10.0 torchtext == 0.6.0 torch >= 1.5.1 +torchvision >= 1.5.1 # NOTE(msb) not a dependency but needed by torch numpy == 1.17.4 From 8a2377c325acbe8380b1f18707dab32c2f36a7dd Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Wed, 19 Aug 2020 21:20:05 -0700 Subject: [PATCH 04/12] brainfart --- requirements-test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index e4e8f68cf..832046de3 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,6 +6,6 @@ pytest == 5.4.1 pytest-cov == 2.10.0 torchtext == 0.6.0 torch >= 1.5.1 -torchvision >= 1.5.1 +torchvision >= 0.7.0 # NOTE(msb) not a dependency but needed by torch numpy == 1.17.4 From 41dcf6985a2361b646a002cfa20b867cf0abfde3 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Wed, 19 Aug 2020 21:27:41 -0700 Subject: [PATCH 05/12] reduce the loss, do something slightly distributed --- benchmarks/oss.py | 8 +++++--- 
requirements-test.txt | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 79529077d..c7428f968 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -25,7 +25,7 @@ def dist_init(rank, world_size): def train( - rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 64, device: torch.device = torch.device("cuda") + rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32, device: torch.device = torch.device("cuda") ): # DDP dist_init(rank, world_size) @@ -51,15 +51,17 @@ def collate(inputs: List[Any]): model.to(device) model.train() for epoch in range(num_epochs): - print(f"Epoch {epoch}") + print(f"[{dist.get_rank()}] : Epoch {epoch}") for batch in dataloader: def closure(): model.zero_grad() outputs = model(batch["inputs"]) loss = loss_fn(outputs, batch["label"]) + dist.all_reduce(loss, op=dist.ReduceOp.SUM) + loss /= world_size loss.backward() - print(f"dummy loss {loss.item()}") + print(f"[{dist.get_rank()}] : loss {loss.item()}") return loss optimizer.step(closure) diff --git a/requirements-test.txt b/requirements-test.txt index 832046de3..b3cacabe7 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,6 +6,6 @@ pytest == 5.4.1 pytest-cov == 2.10.0 torchtext == 0.6.0 torch >= 1.5.1 -torchvision >= 0.7.0 +torchvision >= 0.6.0 # NOTE(msb) not a dependency but needed by torch numpy == 1.17.4 From b212deeeedf40a95c214e9e9df1e6ba6145af4e2 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 20 Aug 2020 09:50:32 -0700 Subject: [PATCH 06/12] Some cleanup, distributing the training on two GPUs --- benchmarks/oss.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index c7428f968..72aed74e7 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -2,6 +2,7 @@ import os +import time from typing import Any, List import torch @@ -9,12 +10,12 @@ import torch.multiprocessing as mp import torch.nn as nn from torch.utils.data import DataLoader + +from fairscale.optim.oss import OSS from torchvision.datasets import FakeData from torchvision.models import resnet50 from torchvision.transforms import ToTensor -from fairscale.optim.oss import OSS - BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore @@ -24,34 +25,40 @@ def dist_init(rank, world_size): dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) -def train( - rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32, device: torch.device = torch.device("cuda") -): +def train(rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32): # DDP dist_init(rank, world_size) # Standard RN50 - model = resnet50(pretrained=False, progress=True) + model = resnet50(pretrained=False, progress=True).to(rank) print("Benchmarking model: ", model) # Data setup, dummy data def collate(inputs: List[Any]): return { - "inputs": torch.stack([i[0] for i in inputs]).to(device), - "label": torch.stack([i[1] for i in inputs]).to(device), + "inputs": torch.stack([i[0] for i in inputs]).to(rank), + "label": torch.stack([i[1] for i in inputs]).to(rank), } - dataloader = DataLoader(dataset=FakeData(transform=ToTensor(), size=200), batch_size=batch_size, collate_fn=collate) + def _print(msg): + if dist.get_rank() == 0: + print(msg) + + num_images = 200 + dataloader = DataLoader( + dataset=FakeData(transform=ToTensor(), size=num_images), batch_size=batch_size, collate_fn=collate + ) 
loss_fn = nn.CrossEntropyLoss() # Shard the optimizer optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4) # Dummy training loop - model.to(device) model.train() for epoch in range(num_epochs): - print(f"[{dist.get_rank()}] : Epoch {epoch}") + _print(f"\n[{dist.get_rank()}] : Epoch {epoch}") + epoch_start = time.monotonic() + for batch in dataloader: def closure(): @@ -61,11 +68,15 @@ def closure(): dist.all_reduce(loss, op=dist.ReduceOp.SUM) loss /= world_size loss.backward() - print(f"[{dist.get_rank()}] : loss {loss.item()}") + _print(f"[{dist.get_rank()}] : loss {loss.item():.2f}") return loss optimizer.step(closure) + epoch_end = time.monotonic() + img_per_sec = num_images / (epoch_end - epoch_start) + _print(f"[{dist.get_rank()}] : processed {img_per_sec:.2f} img per sec") + if __name__ == "__main__": # TODO: really use DDP, not multiprocessing From b5cacbdc1d248ce08cb39081017befdeb1adafef Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 20 Aug 2020 11:14:22 -0700 Subject: [PATCH 07/12] some cleanup + adding a vanilla run, still not good to go --- benchmarks/oss.py | 44 ++++++++++++++++++++++++++++++------------ fairscale/optim/oss.py | 2 +- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 72aed74e7..7b9328500 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -10,12 +10,12 @@ import torch.multiprocessing as mp import torch.nn as nn from torch.utils.data import DataLoader - -from fairscale.optim.oss import OSS from torchvision.datasets import FakeData from torchvision.models import resnet50 from torchvision.transforms import ToTensor +from fairscale.optim.oss import OSS + BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore @@ -25,13 +25,14 @@ def dist_init(rank, world_size): dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size) -def train(rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32): +def train( + rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32, data_size: int = 200, use_oss: bool = True +): # DDP dist_init(rank, world_size) # Standard RN50 model = resnet50(pretrained=False, progress=True).to(rank) - print("Benchmarking model: ", model) # Data setup, dummy data def collate(inputs: List[Any]): @@ -44,16 +45,21 @@ def _print(msg): if dist.get_rank() == 0: print(msg) - num_images = 200 dataloader = DataLoader( - dataset=FakeData(transform=ToTensor(), size=num_images), batch_size=batch_size, collate_fn=collate + dataset=FakeData(transform=ToTensor(), size=data_size), batch_size=batch_size, collate_fn=collate ) loss_fn = nn.CrossEntropyLoss() # Shard the optimizer - optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4) + optimizer = ( + OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4) + if use_oss + else torch.optim.SGD(model.parameters(), lr=1e-4) + ) # Dummy training loop + torch.cuda.synchronize(rank) + training_start = time.monotonic() model.train() for epoch in range(num_epochs): _print(f"\n[{dist.get_rank()}] : Epoch {epoch}") @@ -68,17 +74,31 @@ def closure(): dist.all_reduce(loss, op=dist.ReduceOp.SUM) loss /= world_size loss.backward() - _print(f"[{dist.get_rank()}] : loss {loss.item():.2f}") return loss optimizer.step(closure) epoch_end = time.monotonic() - img_per_sec = num_images / (epoch_end - epoch_start) + img_per_sec = data_size / (epoch_end - epoch_start) _print(f"[{dist.get_rank()}] : processed {img_per_sec:.2f} img per sec") 
+ torch.cuda.synchronize(rank) + training_stop = time.monotonic() + img_per_sec = data_size / (training_stop - training_start) * num_epochs + max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20 + + _print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall") + print(f"[{dist.get_rank()}] Peak memory: {max_memory:.1f}MiB") + if __name__ == "__main__": - # TODO: really use DDP, not multiprocessing - world_size = 2 - mp.spawn(train, args=(world_size,), nprocs=world_size, join=True) + WORLD_SIZE = 2 + EPOCHS = 10 + BATCH_SIZE = 64 + DATA_SIZE = 512 + + print("Benchmark OSS") + mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, True), nprocs=WORLD_SIZE, join=True) + + print("Benchmark vanilla SGD") + mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, False), nprocs=WORLD_SIZE, join=True) diff --git a/fairscale/optim/oss.py b/fairscale/optim/oss.py index d8214ba57..de18f6d3b 100644 --- a/fairscale/optim/oss.py +++ b/fairscale/optim/oss.py @@ -49,7 +49,7 @@ class OSS(Optimizer): in_super_constructor: bool def __init__(self, params: _params_t, optim: Type[Optimizer] = SGD, group: Any = dist.group.WORLD, **defaults: Any): - # Hold all the nmodel params in the root .param_groups + # Hold all the model params in the root .param_groups self.in_super_constructor = True super().__init__(params, defaults) self.in_super_constructor = False From 928791e97725356e4f38ae0e1c2d1f0a59a3a425 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 20 Aug 2020 11:38:04 -0700 Subject: [PATCH 08/12] less silly defaults, gtg for a start I think --- benchmarks/oss.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 7b9328500..09d1a992b 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -11,7 +11,7 @@ import torch.nn as nn from torch.utils.data import DataLoader from torchvision.datasets import FakeData -from torchvision.models import resnet50 +from torchvision.models import resnet101 from torchvision.transforms import ToTensor from fairscale.optim.oss import OSS @@ -31,8 +31,8 @@ def train( # DDP dist_init(rank, world_size) - # Standard RN50 - model = resnet50(pretrained=False, progress=True).to(rank) + # Standard RN101 + model = resnet101(pretrained=False, progress=True).to(rank) # Data setup, dummy data def collate(inputs: List[Any]): @@ -52,9 +52,9 @@ def _print(msg): # Shard the optimizer optimizer = ( - OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4) + OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4, momentum=0.9) if use_oss - else torch.optim.SGD(model.parameters(), lr=1e-4) + else torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9) ) # Dummy training loop @@ -62,7 +62,6 @@ def _print(msg): training_start = time.monotonic() model.train() for epoch in range(num_epochs): - _print(f"\n[{dist.get_rank()}] : Epoch {epoch}") epoch_start = time.monotonic() for batch in dataloader: @@ -80,7 +79,7 @@ def closure(): epoch_end = time.monotonic() img_per_sec = data_size / (epoch_end - epoch_start) - _print(f"[{dist.get_rank()}] : processed {img_per_sec:.2f} img per sec") + _print(f"[{dist.get_rank()}] : Epoch {epoch} - processed {img_per_sec:.2f} img per sec") torch.cuda.synchronize(rank) training_stop = time.monotonic() @@ -97,8 +96,8 @@ def closure(): BATCH_SIZE = 64 DATA_SIZE = 512 - print("Benchmark OSS") - mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, True), nprocs=WORLD_SIZE, join=True) - print("Benchmark 
vanilla SGD") mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, False), nprocs=WORLD_SIZE, join=True) + + print("Benchmark OSS") + mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, True), nprocs=WORLD_SIZE, join=True) From e6a4756c1c2927d35af2148dbfb8d0e1f3bff797 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 20 Aug 2020 14:16:35 -0700 Subject: [PATCH 09/12] smaller batch to fit the smaller gpus used in the circleci rigs --- benchmarks/oss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 09d1a992b..5178819bb 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -87,17 +87,17 @@ def closure(): max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20 _print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall") - print(f"[{dist.get_rank()}] Peak memory: {max_memory:.1f}MiB") + print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB") if __name__ == "__main__": WORLD_SIZE = 2 EPOCHS = 10 - BATCH_SIZE = 64 + BATCH_SIZE = 32 DATA_SIZE = 512 - print("Benchmark vanilla SGD") + print("\nBenchmark vanilla SGD") mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, False), nprocs=WORLD_SIZE, join=True) - print("Benchmark OSS") + print("\nBenchmark OSS") mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, True), nprocs=WORLD_SIZE, join=True) From 0e643061f1e9fa9a32257269e73c48085195843f Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Fri, 21 Aug 2020 15:03:44 -0700 Subject: [PATCH 10/12] Adding some options for the benchmark, and regression testing --- benchmarks/oss.py | 68 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index 5178819bb..ab15b3a25 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -13,8 +13,9 @@ from torchvision.datasets import FakeData from torchvision.models import resnet101 from torchvision.transforms import ToTensor - from fairscale.optim.oss import OSS +import math +import argparse BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore @@ -26,7 +27,14 @@ def dist_init(rank, world_size): def train( - rank: int, world_size: int, num_epochs: int = 10, batch_size: int = 32, data_size: int = 200, use_oss: bool = True + rank: int, + world_size: int, + num_epochs: int = 10, + batch_size: int = 32, + data_size: int = 200, + use_oss: bool = True, + check_regression: bool = True, + reference_speed: float = -1.0, ): # DDP dist_init(rank, world_size) @@ -61,6 +69,9 @@ def _print(msg): torch.cuda.synchronize(rank) training_start = time.monotonic() model.train() + + measurements = [] + for epoch in range(num_epochs): epoch_start = time.monotonic() @@ -78,26 +89,61 @@ def closure(): optimizer.step(closure) epoch_end = time.monotonic() - img_per_sec = data_size / (epoch_end - epoch_start) - _print(f"[{dist.get_rank()}] : Epoch {epoch} - processed {img_per_sec:.2f} img per sec") + measurements.append(data_size / (epoch_end - epoch_start)) + _print(f"Epoch {epoch} - processed {measurements[-1]:.2f} img per sec") torch.cuda.synchronize(rank) training_stop = time.monotonic() img_per_sec = data_size / (training_stop - training_start) * num_epochs max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20 - _print(f"[{dist.get_rank()}] : Training done. {img_per_sec:.2f} img per sec overall") + print(f"[{dist.get_rank()}] : Training done. 
{img_per_sec:.2f} img per sec overall") print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB") + if use_oss and check_regression and dist.get_rank() == 0: + # Compute the mean and average img per second + mean = sum(measurements) / len(measurements) + diff = map(lambda x: pow(x - mean, 2.0), measurements) + std = math.sqrt(sum(diff) / (len(measurements) - 1)) + print(f"[Regression Test] Mean: {mean:.2f} +/- {std:.2f}") + assert (mean - 3.0 * std) < reference_speed, "Regression detected" + print("[Regression Test] VALID") + if __name__ == "__main__": - WORLD_SIZE = 2 - EPOCHS = 10 - BATCH_SIZE = 32 - DATA_SIZE = 512 + + parser = argparse.ArgumentParser( + description="Benchmark the optimizer state sharding, on a typical computer vision workload" + ) + parser.add_argument("--world_size", action="store", default=2, type=int) + parser.add_argument("--epochs", action="store", default=10, type=int) + parser.add_argument("--batch_size", action="store", default=32, type=int) + parser.add_argument("--data_size", action="store", default=512, type=int) + parser.add_argument("--check_regression", action="store", default=True, type=bool) + parser.add_argument("--reference_speed", action="store", default=39.82, type=float) + + args = parser.parse_args() print("\nBenchmark vanilla SGD") - mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, False), nprocs=WORLD_SIZE, join=True) + mp.spawn( + train, + args=(args.world_size, args.epochs, args.batch_size, args.data_size, False, False), + nprocs=args.world_size, + join=True, + ) print("\nBenchmark OSS") - mp.spawn(train, args=(WORLD_SIZE, EPOCHS, BATCH_SIZE, DATA_SIZE, True), nprocs=WORLD_SIZE, join=True) + mp.spawn( + train, + args=( + args.world_size, + args.epochs, + args.batch_size, + args.data_size, + True, + args.check_regression, + args.reference_speed, + ), + nprocs=args.world_size, + join=True, + ) From bfbca2e79da61b5a67cffa1cb5b99c7de7988813 Mon Sep 17 00:00:00 2001 From: Jun Ru Anderson <33384298+andersonic@users.noreply.github.com> Date: Fri, 21 Aug 2020 10:24:36 -0700 Subject: [PATCH 11/12] [test] set torch seed for Adam tests (#49) Set the torch seed for tests. xfail mixed precision and memory-efficient mixed-precision state_dict tests due to their states being cast to FP16 and back to FP32 during load_state_dict. Co-authored-by: Jun Ru Anderson --- benchmarks/transformer.py | 6 +++++- fairscale/optim/adam.py | 4 ++++ tests/optim/test_adam.py | 34 ++++++++++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/benchmarks/transformer.py b/benchmarks/transformer.py index 962871668..7c2cbfce8 100644 --- a/benchmarks/transformer.py +++ b/benchmarks/transformer.py @@ -135,7 +135,11 @@ def make_model(device, ntokens): criterion = nn.CrossEntropyLoss() lr = 0.01 # learning rate - optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION) + + try: + optimizer = Adam(p.parameters(), lr=lr, precision=Precision.MIXED_PRECISION) + except NameError: + optimizer = Adam(p.parameters(), lr=lr) return p, criterion, optimizer diff --git a/fairscale/optim/adam.py b/fairscale/optim/adam.py index 905eec5c4..531ea7dce 100644 --- a/fairscale/optim/adam.py +++ b/fairscale/optim/adam.py @@ -147,6 +147,10 @@ def mixed_precision(self) -> bool: def load_state_dict(self, state_dict: Dict[str, Any]) -> None: super().load_state_dict(state_dict) + + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision. 
Eventually + # we want to fix this, as some precision may be lost for group in self.param_groups: for p in group["params"]: self.state[p]["exp_avg"] = self.state[p]["exp_avg"].type(self.optim_type) diff --git a/tests/optim/test_adam.py b/tests/optim/test_adam.py index 54ff8a207..cd047bc35 100644 --- a/tests/optim/test_adam.py +++ b/tests/optim/test_adam.py @@ -20,6 +20,12 @@ skip_if_no_adam = pytest.mark.skipif(not imported_adam, reason="Fairscale Adam not available") +@pytest.fixture(autouse=True) +def set_torch_seed(): + torch.manual_seed(1) + yield + + def make_full_precision_params(): weight = torch.randn(2, 1).cuda().requires_grad_() bias = torch.randn(2).cuda().requires_grad_() @@ -75,12 +81,26 @@ def fn_base(optimizer, weight, bias, input): # Load state dict state_dict = deepcopy(optimizer.state_dict()) optimizer_c.load_state_dict(state_dict) + + for group, group_c in zip(optimizer.param_groups, optimizer_c.param_groups): + for p, p_c in zip(group["params"], group_c["params"]): + assert torch.equal(optimizer.state[p]["exp_avg"], optimizer_c.state[p_c]["exp_avg"]) + assert torch.equal(optimizer.state[p]["exp_avg_sq"], optimizer_c.state[p_c]["exp_avg_sq"]) + + if optimizer.fp32_param_groups: + # When using mixed precision, fp32_param_groups are made from FP16 params rather than + # copied via state_dict, introducing differences between the original optimizer and + # the copy. Because this test requires that they be the exact same, we copy the + # fp32 params from the original optimizer to the copy + optimizer_c.fp32_param_groups = deepcopy(optimizer.fp32_param_groups) + # Run both optimizations in parallel for _i in range(5): optimizer.step(fn) optimizer_c.step(fn_c) - (weight - weight_c).to("cpu").detach().apply_(assert_almost_zero) - (bias - bias_c).to("cpu").detach().apply_(assert_almost_zero) + + assert torch.equal(weight, weight_c) + assert torch.equal(bias, bias_c) def assert_almost_zero(x): @@ -230,7 +250,12 @@ def test_state_dict_full_precision(): @skip_if_no_cuda @skip_if_no_adam +@pytest.mark.xfail def test_state_dict_mixed_precision(): + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision, resulting + # in a potential loss of precision. Thus, as training proceeds, we don't + # necessarily expect the parameters to remain the exact same. weight, bias, input = make_half_precision_params() optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MIXED_PRECISION) @@ -239,7 +264,12 @@ def test_state_dict_mixed_precision(): @skip_if_no_cuda @skip_if_no_adam +@pytest.mark.xfail def test_state_dict_memory_efficient(): + # TODO: Optimizer state gets cast to FP16 and back to FP32 for + # mixed-precision and memory-efficient mixed-precision, resulting + # in a potential loss of precision. Thus, as training proceeds, we don't + # necessarily expect the parameters to remain the exact same. weight, bias, input = make_half_precision_params() optimizer = Adam([weight, bias], lr=1e-3, precision=Precision.MEMORY_EFFICIENT_MIXED_PRECISION) From a20fa519453638b6bc9a3cfbad81748102abe69d Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Fri, 21 Aug 2020 15:09:08 -0700 Subject: [PATCH 12/12] linting, I really need to automate this isort insanity --- benchmarks/oss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/oss.py b/benchmarks/oss.py index ab15b3a25..bcc434b2c 100755 --- a/benchmarks/oss.py +++ b/benchmarks/oss.py @@ -1,6 +1,8 @@ # Copyright (c) Facebook, Inc. and its affiliates. 
All rights reserved. +import argparse +import math import os import time from typing import Any, List @@ -13,9 +15,8 @@ from torchvision.datasets import FakeData from torchvision.models import resnet101 from torchvision.transforms import ToTensor + from fairscale.optim.oss import OSS -import math -import argparse BACKEND = dist.Backend.NCCL if torch.cuda.is_available() else dist.Backend.GLOO # type: ignore
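Taken together, the series converges on one usage pattern: fairscale's OSS wraps a regular torch optimizer so that each rank only materializes the optimizer state for its own shard of the parameters, and the benchmark drives it through a closure-based step() inside workers launched with torch.multiprocessing. The sketch below condenses that pattern into a minimal CPU-only example, assuming the same OSS constructor and closure idiom as the patches above; the gloo backend, toy linear model, random data, port and loop sizes are placeholders rather than part of the benchmark itself.

# Minimal sketch of the sharded-optimizer training loop used by benchmarks/oss.py.
# Assumptions: CPU-only gloo backend; a toy nn.Linear model and random tensors stand in
# for the ResNet-101 + FakeData pipeline; MASTER_PORT and the sizes are placeholders.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn

from fairscale.optim.oss import OSS


def worker(rank: int, world_size: int) -> None:
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group(backend=dist.Backend.GLOO, rank=rank, world_size=world_size)

    model = nn.Linear(16, 4)
    loss_fn = nn.CrossEntropyLoss()
    # OSS shards the wrapped optimizer: each rank keeps SGD state for its own partition only.
    optimizer = OSS(params=model.parameters(), optim=torch.optim.SGD, lr=1e-4, momentum=0.9)

    inputs = torch.randn(8, 16)
    labels = torch.randint(0, 4, (8,))

    model.train()
    for _ in range(3):

        def closure():
            model.zero_grad()
            loss = loss_fn(model(inputs), labels)
            # As in the benchmark, the loss is all-reduced for reporting; gradients stay local.
            dist.all_reduce(loss, op=dist.ReduceOp.SUM)
            loss /= world_size
            loss.backward()
            if rank == 0:
                print(f"loss {loss.item():.3f}")
            return loss

        optimizer.step(closure)

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2, join=True)

Against the actual benchmark, the equivalent loop runs a ResNet-101 over FakeData and is launched as, for example, python benchmarks/oss.py --world_size 2 --epochs 10 --batch_size 32 --data_size 512, once with vanilla SGD and once with the sharded optimizer, reporting images per second and peak memory for each run.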