From 23b033886230bf54139cc9fbe6788aa34b894074 Mon Sep 17 00:00:00 2001
From: Yizhou Wang <yizhou.wang@intel.com>
Date: Wed, 25 Sep 2024 06:13:12 +0000
Subject: [PATCH 1/2] init

---
 .github/workflows/xpu-compile.yml   |  58 ++++++++++++++++
 tests/torch_compile/ds_config.json  |  41 +++++++++++
 tests/torch_compile/test_compile.py | 103 ++++++++++++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 .github/workflows/xpu-compile.yml
 create mode 100644 tests/torch_compile/ds_config.json
 create mode 100644 tests/torch_compile/test_compile.py

diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml
new file mode 100644
index 000000000000..e960340614bf
--- /dev/null
+++ b/.github/workflows/xpu-compile.yml
@@ -0,0 +1,58 @@
+name: xpu-compile
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - ".github/workflows/xpu-compile.yml"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  compile-tests:
+    runs-on: [self-hosted, intel, xpu]
+    container:
+      image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
+      ports:
+        - 80
+      options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Install prerequisite
+      run: |
+        apt-get update
+        apt-get install clinfo libaio-dev python3-pip -y
+        pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torch/
+        pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/intel-extension-for-pytorch/
+        pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/oneccl-bind-pt/
+        pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/us/torchvision/
+        pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl
+        pip install py-cpuinfo numpy
+        pip install .[dev,autotuning]
+
+    - name: Check container state
+      run: |
+        ldd --version
+        ds_report
+        python3 -c "import torch; print('torch:', torch.__version__, torch)"
+        python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
+        python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
+        pip list
+
+    - name: Compile Status
+      shell: bash
+      run: |
+        ulimit -n 1048575
+        cd tests/torch_compile
+        export ZE_AFFINITY_MASK=0,1
+        deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt
+        cat log.txt |  grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config.json
new file mode 100644
index 000000000000..5de8c730bcac
--- /dev/null
+++ b/tests/torch_compile/ds_config.json
@@ -0,0 +1,41 @@
+{
+  "train_batch_size": 8,
+  "steps_per_print": 2000,
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.001,
+      "betas": [
+        0.8,
+        0.999
+      ],
+      "eps": 1e-8,
+      "weight_decay": 3e-7
+    }
+  },
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": 0,
+      "warmup_max_lr": 0.001,
+      "warmup_num_steps": 1000
+    }
+  },
+  "gradient_clipping": 1.0,
+  "prescale_gradients": false,
+  "bf16": {
+      "enabled": true,
+      "loss_scale": 0,
+      "loss_scale_window": 500,
+      "hysteresis": 2,
+      "min_loss_scale": 1,
+      "initial_scale_power": 15
+  },
+  "wall_clock_breakdown": false,
+  "zero_optimization": {
+      "stage": 3,
+      "reduce_scatter": true,
+      "overlap_comm": false,
+      "contiguous_gradients": false
+  }
+}
\ No newline at end of file
diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py
new file mode 100644
index 000000000000..ddc4f6e51acd
--- /dev/null
+++ b/tests/torch_compile/test_compile.py
@@ -0,0 +1,103 @@
+import os
+import torch.nn as nn
+import argparse
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+from deepspeed import comm
+
+import torch
+import intel_extension_for_pytorch # noq
+from torch.utils.data import Dataset, DataLoader
+
+torch._dynamo.config.cache_size_limit = 100
+
+import collections
+
+def get_dynamo_stats():
+    # TODO: consider deepcopy'ing the entire counters struct and
+    # adding a helper to do subtraction on it
+    return collections.Counter(
+        {
+            "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
+            "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
+            "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()),
+            # NB: The plus removes zero counts
+            "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]),
+            "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"][
+                "captures"
+            ],
+            "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"][
+                "compiles"
+            ],
+            "cudagraph_skips": torch._dynamo.utils.counters["inductor"][
+                "cudagraph_skips"
+            ],
+        }
+    )
+
+class RandomDataset(Dataset):
+    def __init__(self, size, length):
+        self.len = length
+        self.data = torch.randn(length, size).to(torch.bfloat16)
+
+    def __getitem__(self, index):
+        return self.data[index]
+
+    def __len__(self):
+        return self.len
+
+
+
+data_size = 1024
+data_length = 100
+rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length),
+                         batch_size=1,
+                         shuffle=False)
+
+
+class MyModule(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.fc0 = torch.nn.Linear(1024, 256, bias=False)
+        self.fc1 = torch.nn.Linear(256, 256, bias=False)
+        self.dropout = torch.nn.Dropout(0.5)
+    def forward(self, data, residual):
+        output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5
+        return output
+
+
+model = MyModule()
+params = model.parameters()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--local_rank', type=int, default=-1,
+                    help='local rank passed from distributed launcher')
+parser.add_argument('--deepspeed_config', type=str, default='ds_config.json',
+                    help='path to DeepSpeed configuration file')
+cmd_args = parser.parse_args()
+
+
+# initialize the DeepSpeed engine
+model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params)
+model_engine.compile()
+
+residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_device_name())
+
+start_stats = get_dynamo_stats()
+
+for step, batch in enumerate(rand_loader):
+    if step % 10 == 0 and comm.get_rank() == 0:
+        print (f'step={step}')
+    # forward() method
+    loss = model_engine(batch.to(get_accelerator().current_device_name()), residual).sum()
+    # runs backpropagation
+    model_engine.backward(loss)
+    # weight update
+    model_engine.step()
+
+dynamo_stats = get_dynamo_stats()
+dynamo_stats.subtract(start_stats)
+
+if comm.get_rank() == 0:
+    print(dynamo_stats)

From f52222c1dc7a3e9c6920b17d30cfcb3d94dfc69f Mon Sep 17 00:00:00 2001
From: Yizhou Wang <yizhou.wang@intel.com>
Date: Thu, 26 Sep 2024 03:13:57 +0000
Subject: [PATCH 2/2] format fix & adding env variable

---
 .github/workflows/xpu-compile.yml   |  3 +-
 tests/torch_compile/ds_config.json  |  2 +-
 tests/torch_compile/test_compile.py | 58 ++++++++++++++---------------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml
index e960340614bf..c2392091012f 100644
--- a/.github/workflows/xpu-compile.yml
+++ b/.github/workflows/xpu-compile.yml
@@ -51,8 +51,9 @@ jobs:
     - name: Compile Status
       shell: bash
       run: |
+        export FI_HMEM=system
         ulimit -n 1048575
         cd tests/torch_compile
         export ZE_AFFINITY_MASK=0,1
         deepspeed test_compile.py --deepspeed_config ds_config.json 2>&1 | tee log.txt
-        cat log.txt |  grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
+        cat log.txt |  grep "'graph_breaks'" | sed 's/,/ /g' | awk '{print $2}' >> $GITHUB_STEP_SUMMARY
diff --git a/tests/torch_compile/ds_config.json b/tests/torch_compile/ds_config.json
index 5de8c730bcac..361bc115eaee 100644
--- a/tests/torch_compile/ds_config.json
+++ b/tests/torch_compile/ds_config.json
@@ -38,4 +38,4 @@
       "overlap_comm": false,
       "contiguous_gradients": false
   }
-}
\ No newline at end of file
+}
diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py
index ddc4f6e51acd..529ca56ae0a8 100644
--- a/tests/torch_compile/test_compile.py
+++ b/tests/torch_compile/test_compile.py
@@ -1,41 +1,39 @@
-import os
-import torch.nn as nn
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
 import argparse
 import deepspeed
 from deepspeed.accelerator import get_accelerator
 from deepspeed import comm
 
 import torch
-import intel_extension_for_pytorch # noq
+import intel_extension_for_pytorch  # noqa: F401 # type: ignore
 from torch.utils.data import Dataset, DataLoader
 
 torch._dynamo.config.cache_size_limit = 100
 
 import collections
 
+
 def get_dynamo_stats():
     # TODO: consider deepcopy'ing the entire counters struct and
     # adding a helper to do subtraction on it
-    return collections.Counter(
-        {
-            "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
-            "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
-            "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()),
-            # NB: The plus removes zero counts
-            "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]),
-            "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"][
-                "captures"
-            ],
-            "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"][
-                "compiles"
-            ],
-            "cudagraph_skips": torch._dynamo.utils.counters["inductor"][
-                "cudagraph_skips"
-            ],
-        }
-    )
+    return collections.Counter({
+        "calls_captured": torch._dynamo.utils.counters["stats"]["calls_captured"],
+        "unique_graphs": torch._dynamo.utils.counters["stats"]["unique_graphs"],
+        "graph_breaks": sum(torch._dynamo.utils.counters["graph_break"].values()),
+        # NB: The plus removes zero counts
+        "unique_graph_breaks": len(+torch._dynamo.utils.counters["graph_break"]),
+        "autograd_captures": torch._dynamo.utils.counters["compiled_autograd"]["captures"],
+        "autograd_compiles": torch._dynamo.utils.counters["compiled_autograd"]["compiles"],
+        "cudagraph_skips": torch._dynamo.utils.counters["inductor"]["cudagraph_skips"],
+    })
+
 
 class RandomDataset(Dataset):
+
     def __init__(self, size, length):
         self.len = length
         self.data = torch.randn(length, size).to(torch.bfloat16)
@@ -47,20 +45,19 @@ def __len__(self):
         return self.len
 
 
-
 data_size = 1024
 data_length = 100
-rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length),
-                         batch_size=1,
-                         shuffle=False)
+rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), batch_size=1, shuffle=False)
 
 
 class MyModule(torch.nn.Module):
+
     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.fc0 = torch.nn.Linear(1024, 256, bias=False)
         self.fc1 = torch.nn.Linear(256, 256, bias=False)
         self.dropout = torch.nn.Dropout(0.5)
+
     def forward(self, data, residual):
         output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5
         return output
@@ -69,15 +66,14 @@ def forward(self, data, residual):
 model = MyModule()
 params = model.parameters()
 
-
 parser = argparse.ArgumentParser()
-parser.add_argument('--local_rank', type=int, default=-1,
-                    help='local rank passed from distributed launcher')
-parser.add_argument('--deepspeed_config', type=str, default='ds_config.json',
+parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher')
+parser.add_argument('--deepspeed_config',
+                    type=str,
+                    default='ds_config.json',
                     help='path to DeepSpeed configuration file')
 cmd_args = parser.parse_args()
 
-
 # initialize the DeepSpeed engine
 model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params)
 model_engine.compile()
@@ -88,7 +84,7 @@ def forward(self, data, residual):
 
 for step, batch in enumerate(rand_loader):
     if step % 10 == 0 and comm.get_rank() == 0:
-        print (f'step={step}')
+        print(f'step={step}')
     # forward() method
     loss = model_engine(batch.to(get_accelerator().current_device_name()), residual).sum()
     # runs backpropagation