From 23b033886230bf54139cc9fbe6788aa34b894074 Mon Sep 17 00:00:00 2001 From: Yizhou Wang Date: Wed, 25 Sep 2024 06:13:12 +0000 Subject: [PATCH 1/2] init --- .github/workflows/xpu-compile.yml | 58 ++++++++++++++++ tests/torch_compile/ds_config.json | 41 +++++++++++ tests/torch_compile/test_compile.py | 103 ++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 .github/workflows/xpu-compile.yml create mode 100644 tests/torch_compile/ds_config.json create mode 100644 tests/torch_compile/test_compile.py diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml new file mode 100644 index 000000000000..e960340614bf --- /dev/null +++ b/.github/workflows/xpu-compile.yml @@ -0,0 +1,58 @@ +name: xpu-compile + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-compile.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + compile-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v4 + - name: Install prerequisite + run: | + apt-get update + apt-get install clinfo libaio-dev python3-pip -y + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/ + pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl + pip install py-cpuinfo numpy + pip install .[dev,autotuning] + + - name: Check container state + run: | + ldd --version + ds_report + python3 -c "import torch; print('torch:', torch.__version__, torch)" + python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + pip list + + - name: Compile Status + shell: bash + run: | + ulimit -n 1048575 + cd tests/torch_compile + export ZE_AFFINITY_MASK=0,1 + deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt + cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config.json new file mode 100644 index 000000000000..5de8c730bcac --- /dev/null +++ b/tests/torch_compile/ds_config.json @@ -0,0 +1,41 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 3, + "reduce_scatter": true, + "overlap_comm": false, + "contiguous_gradients": false + } +} \ No newline at end of file diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py new file mode 100644 index 000000000000..ddc4f6e51acd --- /dev/null +++ b/tests/torch_compile/test_compile.py @@ -0,0 +1,103 @@ +import os +import torch.nn as nn +import argparse +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed import comm + +import torch +import intel_extension_for_pytorch # noq +from torch.utils.data import Dataset, DataLoader + +torch._dynamo.config.cache_size_limit = 100 + +import collections + +def get_dynamo_stats(): + # TODO: consider deepcopy'ing the entire counters struct and + # adding a helper to do subtraction on it + return collections.Counter( + { + "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], + "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], + "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), + # NB: The plus removes zero counts + "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), + "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"][ + "captures" + ], + "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"][ + "compiles" + ], + "cudagraph_skips": torch._dynamo.utils.counters["inductor"][ + "cudagraph_skips" + ], + } + ) + +class RandomDataset(Dataset): + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size).to(torch.bfloat16) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + + +data_size = 1024 +data_length = 100 +rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), + batch_size=1, + shuffle=False) + + +class MyModule(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.fc0 = torch.nn.Linear(1024, 256, bias=False) + self.fc1 = torch.nn.Linear(256, 256, bias=False) + self.dropout = torch.nn.Dropout(0.5) + def forward(self, data, residual): + output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5 + return output + + +model = MyModule() +params = model.parameters() + + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1, + help='local rank passed from distributed launcher') +parser.add_argument('--deepspeed_config', type=str, default='ds_config.json', + help='path to DeepSpeed configuration file') +cmd_args = parser.parse_args() + + +# initialize the DeepSpeed engine +model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params) +model_engine.compile() + +residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_device_name()) + +start_stats = get_dynamo_stats() + +for step, batch in enumerate(rand_loader): + if step % 10 == 0 and comm.get_rank() == 0: + print (f'step={step}') + # forward() method + loss = model_engine(batch.to(get_accelerator().current_device_name()), residual).sum() + # runs backpropagation + model_engine.backward(loss) + # weight update + model_engine.step() + +dynamo_stats = get_dynamo_stats() +dynamo_stats.subtract(start_stats) + +if comm.get_rank() == 0: + print(dynamo_stats) From f52222c1dc7a3e9c6920b17d30cfcb3d94dfc69f Mon Sep 17 00:00:00 2001 From: Yizhou Wang Date: Thu, 26 Sep 2024 03:13:57 +0000 Subject: [PATCH 2/2] format fix & adding env variable --- .github/workflows/xpu-compile.yml | 3 +- tests/torch_compile/ds_config.json | 2 +- tests/torch_compile/test_compile.py | 58 ++++++++++++++--------------- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml index e960340614bf..c2392091012f 100644 --- a/.github/workflows/xpu-compile.yml +++ b/.github/workflows/xpu-compile.yml @@ -51,8 +51,9 @@ jobs: - name: Compile Status shell: bash run: | + export FI_HMEM=system ulimit -n 1048575 cd tests/torch_compile export ZE_AFFINITY_MASK=0,1 deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt - cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY \ No newline at end of file + cat log.txt | grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config.json index 5de8c730bcac..361bc115eaee 100644 --- a/tests/torch_compile/ds_config.json +++ b/tests/torch_compile/ds_config.json @@ -38,4 +38,4 @@ "overlap_comm": false, "contiguous_gradients": false } -} \ No newline at end of file +} diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py index ddc4f6e51acd..529ca56ae0a8 100644 --- a/tests/torch_compile/test_compile.py +++ b/tests/torch_compile/test_compile.py @@ -1,41 +1,39 @@ -import os -import torch.nn as nn +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed import comm import torch -import intel_extension_for_pytorch # noq +import intel_extension_for_pytorch # noqa: F401 # type: ignore from torch.utils.data import Dataset, DataLoader torch._dynamo.config.cache_size_limit = 100 import collections + def get_dynamo_stats(): # TODO: consider deepcopy'ing the entire counters struct and # adding a helper to do subtraction on it - return collections.Counter( - { - "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], - "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], - "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), - # NB: The plus removes zero counts - "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), - "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"][ - "captures" - ], - "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"][ - "compiles" - ], - "cudagraph_skips": torch._dynamo.utils.counters["inductor"][ - "cudagraph_skips" - ], - } - ) + return collections.Counter({ + "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"], + "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"], + "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()), + # NB: The plus removes zero counts + "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]), + "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"]["captures"], + "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"]["compiles"], + "cudagraph_skips": torch._dynamo.utils.counters["inductor"]["cudagraph_skips"], + }) + class RandomDataset(Dataset): + def __init__(self, size, length): self.len = length self.data = torch.randn(length, size).to(torch.bfloat16) @@ -47,20 +45,19 @@ def __len__(self): return self.len - data_size = 1024 data_length = 100 -rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), - batch_size=1, - shuffle=False) +rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), batch_size=1, shuffle=False) class MyModule(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.fc0 = torch.nn.Linear(1024, 256, bias=False) self.fc1 = torch.nn.Linear(256, 256, bias=False) self.dropout = torch.nn.Dropout(0.5) + def forward(self, data, residual): output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5 return output @@ -69,15 +66,14 @@ def forward(self, data, residual): model = MyModule() params = model.parameters() - parser = argparse.ArgumentParser() -parser.add_argument('--local_rank', type=int, default=-1, - help='local rank passed from distributed launcher') -parser.add_argument('--deepspeed_config', type=str, default='ds_config.json', +parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') +parser.add_argument('--deepspeed_config', + type=str, + default='ds_config.json', help='path to DeepSpeed configuration file') cmd_args = parser.parse_args() - # initialize the DeepSpeed engine model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params) model_engine.compile() @@ -88,7 +84,7 @@ def forward(self, data, residual): for step, batch in enumerate(rand_loader): if step % 10 == 0 and comm.get_rank() == 0: - print (f'step={step}') + print(f'step={step}') # forward() method loss = model_engine(batch.to(get_accelerator().current_device_name()), residual).sum() # runs backpropagation