From 4a0df0fb2ea228417c4609758a3e84cfa28fd7b7 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 2 Apr 2021 10:39:51 -0700
Subject: [PATCH 1/2] flatten/unflatten benchmarks

---
Review notes (free-form text in this position is ignored by `git am`):
 * Purpose: adds two scripts that compare torch._utils, DeepSpeed
   UtilsBuilder and apex_C implementations of flatten/unflatten under
   timeit (-t), cProfile (-c) and line_profiler (-l).
 * Both scripts import deepspeed and apex_C and call .cuda() at import
   time, so they need a CUDA device with both packages installed.
 * line_profileme() uses a bare `profile` name that the scripts never
   define; per the usage comments it is expected to be run via
   `kernprof -l`. Presumably `-l` without kernprof raises NameError.
 * NOTE(review): in unflatten_bench.py the warm-up checks build unflat_cpp
   and unflat_apex with _unflatten_dense_tensors, so the outputs of
   unflatten() and unflatten_apex() are never compared - confirm intent.
 * NOTE(review): the timed loops do not call torch.cuda.synchronize();
   timings may mostly reflect launch overhead - confirm.
 * NOTE(review): the banner string "line_profier" looks like a typo for
   "line_profiler"; left untouched here.

 tests/benchmarks/flatten_bench.py   | 108 ++++++++++++++++++++++++++
 tests/benchmarks/unflatten_bench.py | 115 ++++++++++++++++++++++++++++
 2 files changed, 223 insertions(+)
 create mode 100755 tests/benchmarks/flatten_bench.py
 create mode 100755 tests/benchmarks/unflatten_bench.py

diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
new file mode 100755
index 000000000000..6b6ea12d9414
--- /dev/null
+++ b/tests/benchmarks/flatten_bench.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
+#
+# usage:
+# ./flatten_bench.py -t
+# ./flatten_bench.py -c
+# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
+
+import argparse
+
+import gc
+
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+unflatten = util_ops.unflatten
+
+torch.manual_seed(0)
+# emulate a small typical model weights
+x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+t = x * 30
+
+# warm up and check that the same output is produced
+flat_py = _flatten_dense_tensors(t)
+flat_cpp = flatten(t)
+flat_apex = flatten_apex(t)
+#numel = flat_cpp.numel()
+assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor"
+assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor"
+
+TIMES = 1000
+
+# the programs being tested
+def py():
+    for i in range(TIMES):
+        flat = _flatten_dense_tensors(t)
+
+def cpp():
+    for i in range(TIMES):
+        flat = flatten(t)
+
+def apex():
+    for i in range(TIMES):
+        flat = flatten_apex(t)
+
+#### cProfile ####
+
+import cProfile
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+
+#### timeit ####
+
+import timeit
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+
+#### line_profiler ####
+# this one requires a special way to be called
+# pip install line_profiler
+# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
+
+def line_profileme():
+    print("--------------- line_profier -----------------")
+    print("py")
+    profile(py)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect(); torch.cuda.empty_cache()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
new file mode 100755
index 000000000000..9851bf3a2506
--- /dev/null
+++ b/tests/benchmarks/unflatten_bench.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
+#
+# usage:
+# ./unflatten_bench.py -t
+# ./unflatten_bench.py -c
+# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+import argparse
+import gc
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+from apex_C import unflatten as unflatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+unflatten = util_ops.unflatten
+
+torch.manual_seed(0)
+# emulate a small typical model weights
+x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+unflat_t = x * 30
+
+# warm up and check that the same output is produced
+flat_py = _flatten_dense_tensors(unflat_t)
+flat_cpp = flatten(unflat_t)
+flat_apex = flatten_apex(unflat_t)
+#numel = flat_cpp.numel()
+assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor"
+assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor"
+
+flat_t = flat_py
+unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_py[i]).all()
+unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
+unflat_apex = _unflatten_dense_tensors(flat_apex, unflat_t)
+for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+
+# the programs being tested
+def py():
+    for i in range(1000):
+        unflat = _unflatten_dense_tensors(flat_t, unflat_t)
+
+def cpp():
+    for i in range(1000):
+        unflat = unflatten(flat_t, unflat_t)
+
+def apex():
+    for i in range(1000):
+        unflat = unflatten_apex(flat_t, unflat_t)
+
+
+#### cProfile ####
+
+import cProfile
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect(); torch.cuda.empty_cache()
+
+#### timeit ####
+
+import timeit
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect(); torch.cuda.empty_cache()
+
+#### line_profiler ####
+# this one requires a special way to be called
+# pip install line_profiler
+# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+def line_profileme():
+    print("--------------- line_profier -----------------")
+    print("py")
+    profile(py)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect(); torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect(); torch.cuda.empty_cache()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()

From a54d6d085a29655258707e0c9d6861a06a3329c5 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 2 Apr 2021 10:44:43 -0700
Subject: [PATCH 2/2] style

---
 tests/benchmarks/flatten_bench.py   | 46 ++++++++++++++++++------
 tests/benchmarks/unflatten_bench.py | 54 ++++++++++++++++++++++-------
 2 files changed, 77 insertions(+), 23 deletions(-)

diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
index 6b6ea12d9414..b3ed3c601492 100755
--- a/tests/benchmarks/flatten_bench.py
+++ b/tests/benchmarks/flatten_bench.py
@@ -22,7 +22,14 @@
 
 torch.manual_seed(0)
 # emulate a small typical model weights
-x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+x = [
+    torch.rand((512,
+                512)).cuda(),
+    torch.rand((512,
+                1024)).cuda(),
+    torch.rand((512,
+                30000)).cuda()
+]
 t = x * 30
 
 # warm up and check that the same output is produced
@@ -35,64 +42,83 @@
 
 TIMES = 1000
 
+
 # the programs being tested
 def py():
     for i in range(TIMES):
         flat = _flatten_dense_tensors(t)
 
+
 def cpp():
     for i in range(TIMES):
         flat = flatten(t)
 
+
 def apex():
     for i in range(TIMES):
         flat = flatten_apex(t)
 
+
 #### cProfile ####
 
 import cProfile
 
+
 def cprofileme():
     print("--------------- cProfile -----------------")
     print("py")
     cProfile.run("py()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     cProfile.run("cpp()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     cProfile.run("apex()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### timeit ####
 
 import timeit
 
+
 def timeme():
     print("--------------- timeit -----------------")
     print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### line_profiler ####
 # this one requires a special way to be called
 # pip install line_profiler
 # kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
 
+
 def line_profileme():
     print("--------------- line_profier -----------------")
     print("py")
     profile(py)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     profile(cpp)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     profile(apex)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
index 9851bf3a2506..85baf751ad9c 100755
--- a/tests/benchmarks/unflatten_bench.py
+++ b/tests/benchmarks/unflatten_bench.py
@@ -22,7 +22,14 @@
 
 torch.manual_seed(0)
 # emulate a small typical model weights
-x = [torch.rand((512,512)).cuda(), torch.rand((512,1024)).cuda(), torch.rand((512,30000)).cuda()]
+x = [
+    torch.rand((512,
+                512)).cuda(),
+    torch.rand((512,
+                1024)).cuda(),
+    torch.rand((512,
+                30000)).cuda()
+]
 unflat_t = x * 30
 
 # warm up and check that the same output is produced
@@ -35,21 +42,27 @@
 
 flat_t = flat_py
 unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_py[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_py[i]).all()
 unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
 unflat_apex = _unflatten_dense_tensors(flat_apex, unflat_t)
-for i in range(len(unflat_t)): assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+
 
 # the programs being tested
 def py():
     for i in range(1000):
         unflat = _unflatten_dense_tensors(flat_t, unflat_t)
 
+
 def cpp():
     for i in range(1000):
         unflat = unflatten(flat_t, unflat_t)
 
+
 def apex():
     for i in range(1000):
         unflat = unflatten_apex(flat_t, unflat_t)
@@ -59,47 +72,62 @@ def apex():
 
 import cProfile
 
+
 def cprofileme():
     print("--------------- cProfile -----------------")
     print("py")
     cProfile.run("py()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     cProfile.run("cpp()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     cProfile.run("apex()", sort=-1)
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### timeit ####
 
 import timeit
 
+
 def timeme():
     print("--------------- timeit -----------------")
     print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 #### line_profiler ####
 # this one requires a special way to be called
 # pip install line_profiler
 # kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
 
+
 def line_profileme():
     print("--------------- line_profier -----------------")
     print("py")
     profile(py)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("cpp")
     profile(cpp)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
     print("apex")
     profile(apex)()
-    gc.collect(); torch.cuda.empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()