From 19d49bd681be83ad44080b4cf773a586c39447a9 Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Wed, 28 Jul 2021 23:37:48 +0000 Subject: [PATCH 1/5] up --- benchmarks/distributed/test_distributed.py | 24 ++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/benchmarks/distributed/test_distributed.py b/benchmarks/distributed/test_distributed.py index 4fd539175804..3e428946257d 100644 --- a/benchmarks/distributed/test_distributed.py +++ b/benchmarks/distributed/test_distributed.py @@ -9,10 +9,10 @@ from tqdm import tqdm, trange TEST_NUM_NODES = 65 -MAX_ACTORS_IN_CLUSTER = 10000 MAX_RUNNING_TASKS_IN_CLUSTER = 10000 MAX_PLACEMENT_GROUPS = 1000 -MAX_NUM_NODES = 250 +MAX_ACTORS_IN_CLUSTER = 50000 +MAX_NUM_NODES = 1000 def num_alive_nodes(): @@ -153,17 +153,6 @@ def f3(): str(available_resources) + " != " + str(cluster_resources)) print("Done launching nodes") -actor_start = perf_counter() -test_max_actors() -actor_end = perf_counter() - -sleep(1) -assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) -assert available_resources == cluster_resources, ( - str(available_resources) + " != " + str(cluster_resources)) -print("Done testing actors") - task_start = perf_counter() test_max_running_tasks() task_end = perf_counter() @@ -193,6 +182,15 @@ def f3(): sleep(1) assert num_alive_nodes( ) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) + +actor_start = perf_counter() +test_max_actors() +actor_end = perf_counter() + +sleep(1) +assert num_alive_nodes( +) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +print("Done testing actors") print("Done.") actor_time = actor_end - actor_start From 154ec79b09af0fef1a892f0173d6cd688a5c809e Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Thu, 29 Jul 2021 01:23:14 +0000 Subject: [PATCH 2/5] up --- benchmarks/benchmark_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_tests.yaml b/benchmarks/benchmark_tests.yaml index 016f54c32f4c..014cbc53dd8f 100644 --- a/benchmarks/benchmark_tests.yaml +++ b/benchmarks/benchmark_tests.yaml @@ -36,6 +36,6 @@ compute_template: distributed.yaml run: - timeout: 3600 # 1hr + timeout: 7200 # 2hr prepare: sleep 0 script: python distributed/test_distributed.py From 5b09714cc776dc53a71d33bce07b3a4a04b62cc4 Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Thu, 29 Jul 2021 04:08:42 +0000 Subject: [PATCH 3/5] up --- benchmarks/distributed/config.yaml | 9 +++++++++ benchmarks/distributed/test_distributed.py | 10 +++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/distributed/config.yaml b/benchmarks/distributed/config.yaml index 8ccff5c4ee28..88cbdbed91b6 100644 --- a/benchmarks/distributed/config.yaml +++ b/benchmarks/distributed/config.yaml @@ -38,6 +38,15 @@ available_node_types: resources: node: 1 max_workers: 999999 + tiny_worker_node: + node_config: + InstanceType: t3.large + ImageId: ami-0a2363a9cff180a64 + resources: + node: 1 + tiny: 1 + CPU: 5 + max_workers: 999999 head_node_type: head_node diff --git a/benchmarks/distributed/test_distributed.py b/benchmarks/distributed/test_distributed.py index 3e428946257d..7ea5f45327a0 100644 --- a/benchmarks/distributed/test_distributed.py +++ b/benchmarks/distributed/test_distributed.py @@ -144,7 +144,7 @@ def f3(): scale_to(TEST_NUM_NODES) assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +) == TEST_NUM_NODES, f"Wrong number of nodes in cluster {len(ray.nodes())}" cluster_resources = ray.cluster_resources() @@ -159,7 +159,7 @@ def f3(): sleep(1) assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +) == TEST_NUM_NODES, f"Wrong number of nodes in cluster {len(ray.nodes())}" assert available_resources == cluster_resources, ( str(available_resources) + " != " + str(cluster_resources)) print("Done testing tasks") @@ -170,7 +170,7 @@ def f3(): sleep(1) assert num_alive_nodes( -) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +) == TEST_NUM_NODES, f"Wrong number of nodes in cluster {len(ray.nodes())}" assert available_resources == cluster_resources, ( str(available_resources) + " != " + str(cluster_resources)) print("Done testing placement groups") @@ -181,7 +181,7 @@ def f3(): sleep(1) assert num_alive_nodes( -) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +) == MAX_NUM_NODES, f"Wrong number of nodes in cluster {len(ray.nodes())}" actor_start = perf_counter() test_max_actors() @@ -189,7 +189,7 @@ def f3(): sleep(1) assert num_alive_nodes( -) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +) == MAX_NUM_NODES, f"Wrong number of nodes in cluster {len(ray.nodes())}" print("Done testing actors") print("Done.") From 01cdcdc388b4143c95752eacfdc840102a0d7d4c Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Thu, 29 Jul 2021 04:09:05 +0000 Subject: [PATCH 4/5] up --- benchmarks/distributed/test_actors.py | 74 +++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 benchmarks/distributed/test_actors.py diff --git a/benchmarks/distributed/test_actors.py b/benchmarks/distributed/test_actors.py new file mode 100644 index 000000000000..b66f98c0bfd9 --- /dev/null +++ b/benchmarks/distributed/test_actors.py @@ -0,0 +1,74 @@ +import ray +import ray.autoscaler.sdk +from ray.test_utils import Semaphore +from ray.util.placement_group import placement_group, remove_placement_group + +import json +import os +from time import sleep, perf_counter +from tqdm import tqdm, trange + +TEST_NUM_NODES = 1000 +MAX_ACTORS_IN_CLUSTER = 20000 + +def num_alive_nodes(): + n = 0 + for node in ray.nodes(): + if node["Alive"]: + n += 1 + return n + + +def scale_to(target): + while num_alive_nodes() != target: + ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) + print(f"Current # nodes: {num_alive_nodes()}, target: {target}") + print("Waiting ...") + sleep(5) + + +def test_max_actors(): + # TODO (Alex): Dynamically set this based on number of cores + cpus_per_actor = 0.24 + + @ray.remote(num_cpus=cpus_per_actor) + class Actor: + def foo(self): + pass + + actors = [ + Actor.options(num_cpus=0.24).remote() + for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors") + ] + + for actor in tqdm(actors, desc="Ensuring actors have started"): + assert ray.get(actor.foo.remote()) is None + + +ray.init(address="auto") + +scale_to(TEST_NUM_NODES) +assert num_alive_nodes( +) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) + +cluster_resources = ray.cluster_resources() + +available_resources = ray.available_resources() +assert available_resources == cluster_resources, ( + str(available_resources) + " != " + str(cluster_resources)) + +actor_start = perf_counter() +test_max_actors() +actor_end = perf_counter() +actor_time = actor_end - actor_start + +print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)") + +if "TEST_OUTPUT_JSON" in os.environ: + out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") + results = { + "actor_time": actor_time, + "num_actors": MAX_ACTORS_IN_CLUSTER, + "success": "1" + } + json.dump(results, out_file) From 56e613ceb2bae40a4ee89b5c7bb3b85e920ac09e Mon Sep 17 00:00:00 2001 From: Yi Cheng Date: Thu, 29 Jul 2021 04:09:33 +0000 Subject: [PATCH 5/5] up --- benchmarks/distributed/test_actors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/distributed/test_actors.py b/benchmarks/distributed/test_actors.py index b66f98c0bfd9..e1ae3e32112d 100644 --- a/benchmarks/distributed/test_actors.py +++ b/benchmarks/distributed/test_actors.py @@ -1,7 +1,5 @@ import ray import ray.autoscaler.sdk -from ray.test_utils import Semaphore -from ray.util.placement_group import placement_group, remove_placement_group import json import os @@ -11,6 +9,7 @@ TEST_NUM_NODES = 1000 MAX_ACTORS_IN_CLUSTER = 20000 + def num_alive_nodes(): n = 0 for node in ray.nodes():