From 4a0df0fb2ea228417c4609758a3e84cfa28fd7b7 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Fri, 2 Apr 2021 10:39:51 -0700
Subject: [PATCH 1/2] flatten/unflatten benchmarks

---
 tests/benchmarks/flatten_bench.py   | 108 ++++++++++++++++++++++++++
 tests/benchmarks/unflatten_bench.py | 115 ++++++++++++++++++++++++++++
 2 files changed, 223 insertions(+)
 create mode 100755 tests/benchmarks/flatten_bench.py
 create mode 100755 tests/benchmarks/unflatten_bench.py

diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
new file mode 100755
index 000000000000..6b6ea12d9414
--- /dev/null
+++ b/tests/benchmarks/flatten_bench.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
+#
+# usage:
+# ./flatten_bench.py -t
+# ./flatten_bench.py -c
+# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
+
+import argparse
+
+import gc
+
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+unflatten = util_ops.unflatten
+
+torch.manual_seed(0)
+# emulate a small typical model weights
+x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+t = x * 30
+
+# warm up and check that the same output is produced
+flat_py = _flatten_dense_tensors(t)
+flat_cpp = flatten(t)
+flat_apex = flatten_apex(t)
+# sanity check: the cpp and apex results must match the python reference exactly
+assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor"
+assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor"
+
+TIMES = 1000
+
+# the programs being tested
+def py():
+    for i in range(TIMES):
+        flat = _flatten_dense_tensors(t)
+
+def cpp():
+    for i in range(TIMES):
+        flat = flatten(t)
+
+def apex():
+    for i in range(TIMES):
+        flat = flatten_apex(t)
+
+#### cProfile ####
+
+import cProfile
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+
+#### timeit ####
+
+import timeit
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py  ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+
+#### line_profiler ####
+# this one requires a special way to be called
+# pip install line_profiler
+# kernprof -l flatten_bench.py -l; python -m line_profiler  flatten_bench.py.lprof
+
+def line_profileme():
+    print("--------------- line_profier -----------------")
+    print("py")
+    profile(py)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect(); torch.cuda.empty_cache()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
new file mode 100755
index 000000000000..9851bf3a2506
--- /dev/null
+++ b/tests/benchmarks/unflatten_bench.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
+#
+# usage:
+# ./unflatten_bench.py -t
+# ./unflatten_bench.py -c
+# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+import argparse
+import gc
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+from apex_C import unflatten as unflatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+unflatten = util_ops.unflatten
+
+torch.manual_seed(0)
+# emulate a small typical model weights
+x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+unflat_t = x * 30
+
+# warm up and check that the same output is produced
+flat_py = _flatten_dense_tensors(unflat_t)
+flat_cpp = flatten(unflat_t)
+flat_apex = flatten_apex(unflat_t)
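+# NOTE(review): the unflatten checks further down all call _unflatten_dense_tensors; the cpp/apex unflatten ops are not verified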
+assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor"
+assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor"
+
+flat_t = flat_py
+unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_py[i]).all()
+unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
+unflat_apex = _unflatten_dense_tensors(flat_apex, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+
+# the programs being tested
+def py():
+    for i in range(1000):
+        unflat = _unflatten_dense_tensors(flat_t, unflat_t)
+
+def cpp():
+    for i in range(1000):
+        unflat = unflatten(flat_t, unflat_t)
+
+def apex():
+    for i in range(1000):
+        unflat = unflatten_apex(flat_t, unflat_t)
+
+
+#### cProfile ####
+
+import cProfile
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+
+#### timeit ####
+
+import timeit
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py  ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+
+#### line_profiler ####
+# this one requires a special way to be called
+# pip install line_profiler
+# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+def line_profileme():
+    print("--------------- line_profier -----------------")
+    print("py")
+    profile(py)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect(); torch.cuda.empty_cache()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()

From a54d6d085a29655258707e0c9d6861a06a3329c5 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas@stason.org>
Date: Fri, 2 Apr 2021 10:44:43 -0700
Subject: [PATCH 2/2] style: reformat flatten/unflatten benchmarks

---
 tests/benchmarks/flatten_bench.py   | 46 ++++++++++++++++++------
 tests/benchmarks/unflatten_bench.py | 54 ++++++++++++++++++++++-------
 2 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
index 6b6ea12d9414..b3ed3c601492 100755
--- a/tests/benchmarks/flatten_bench.py
+++ b/tests/benchmarks/flatten_bench.py
@@ -22,7 +22,14 @@
 
 torch.manual_seed(0)
 # emulate a small typical model weights
-x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+x = [
+    torch.rand((512,
+                512)).cuda(),
+    torch.rand((512,
+                1024)).cuda(),
+    torch.rand((512,
+                30000)).cuda()
+]
 t = x * 30
 
 # warm up and check that the same output is produced
@@ -35,64 +42,83 @@
 
 TIMES = 1000
 
+
 # the programs being tested
 def py():
     for i in range(TIMES):
         flat = _flatten_dense_tensors(t)
 
+
 def cpp():
     for i in range(TIMES):
         flat = flatten(t)
 
+
 def apex():
     for i in range(TIMES):
         flat = flatten_apex(t)
 
+
 #### cProfile ####
 
 import cProfile
 
+
 def cprofileme():
     print("--------------- cProfile -----------------")
     print("py")
     cProfile.run("py()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     cProfile.run("cpp()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     cProfile.run("apex()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### timeit ####
 
 import timeit
 
+
 def timeme():
     print("--------------- timeit -----------------")
     print(f'py  ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### line_profiler ####
 # this one requires a special way to be called
 # pip install line_profiler
 # kernprof -l flatten_bench.py -l; python -m line_profiler  flatten_bench.py.lprof
 
+
 def line_profileme():
     print("--------------- line_profier -----------------")
     print("py")
     profile(py)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     profile(cpp)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     profile(apex)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
index 9851bf3a2506..85baf751ad9c 100755
--- a/tests/benchmarks/unflatten_bench.py
+++ b/tests/benchmarks/unflatten_bench.py
@@ -22,7 +22,14 @@
 
 torch.manual_seed(0)
 # emulate a small typical model weights
-x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+x = [
+    torch.rand((512,
+                512)).cuda(),
+    torch.rand((512,
+                1024)).cuda(),
+    torch.rand((512,
+                30000)).cuda()
+]
 unflat_t = x * 30
 
 # warm up and check that the same output is produced
@@ -35,21 +42,27 @@
 
 flat_t = flat_py
 unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_py[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_py[i]).all()
 unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
 unflat_apex = _unflatten_dense_tensors(flat_apex, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+
 
 # the programs being tested
 def py():
     for i in range(1000):
         unflat = _unflatten_dense_tensors(flat_t, unflat_t)
 
+
 def cpp():
     for i in range(1000):
         unflat = unflatten(flat_t, unflat_t)
 
+
 def apex():
     for i in range(1000):
         unflat = unflatten_apex(flat_t, unflat_t)
@@ -59,47 +72,62 @@ def apex():
 
 import cProfile
 
+
 def cprofileme():
     print("--------------- cProfile -----------------")
     print("py")
     cProfile.run("py()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     cProfile.run("cpp()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     cProfile.run("apex()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### timeit ####
 
 import timeit
 
+
 def timeme():
     print("--------------- timeit -----------------")
     print(f'py  ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### line_profiler ####
 # this one requires a special way to be called
 # pip install line_profiler
 # kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
 
+
 def line_profileme():
     print("--------------- line_profier -----------------")
     print("py")
     profile(py)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     profile(cpp)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     profile(apex)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()