From b9ac3878ae615eb1926e00ec0a9a5fa67c27c30e Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Wed, 20 Jan 2021 19:20:54 -0800 Subject: [PATCH 001/245] [Autoscaler] Display node status tag in autsocaler status (#13561) * . * . * . * . * . * lint Co-authored-by: Alex Wu --- python/ray/autoscaler/_private/autoscaler.py | 2 +- python/ray/autoscaler/_private/util.py | 4 ++-- python/ray/tests/test_resource_demand_scheduler.py | 12 +++++++----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/ray/autoscaler/_private/autoscaler.py b/python/ray/autoscaler/_private/autoscaler.py index 2838e24c18b4..1166597ed9d6 100644 --- a/python/ray/autoscaler/_private/autoscaler.py +++ b/python/ray/autoscaler/_private/autoscaler.py @@ -765,7 +765,7 @@ def summary(self): ] is_pending = status in pending_states if is_pending: - pending_nodes.append((ip, node_type)) + pending_nodes.append((ip, node_type, status)) else: # TODO (Alex): Failed nodes are now immediately killed, so # this list will almost always be empty. 
We should ideally diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 81a2c1fc00ff..1e677e35bc7d 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -362,8 +362,8 @@ def format_info_string(lm_summary, autoscaler_summary, time=None): for node_type, count in autoscaler_summary.pending_launches.items(): line = f" {node_type}, {count} launching" pending_lines.append(line) - for ip, node_type in autoscaler_summary.pending_nodes: - line = f" {ip}: {node_type}, setting up" + for ip, node_type, status in autoscaler_summary.pending_nodes: + line = f" {ip}: {node_type}, {status.lower()}" pending_lines.append(line) if pending_lines: pending_report = "\n".join(pending_lines) diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 4b2027af1d66..3bfe28f7cc83 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -28,7 +28,7 @@ from ray.autoscaler.tags import TAG_RAY_USER_NODE_TYPE, TAG_RAY_NODE_KIND, \ NODE_KIND_WORKER, TAG_RAY_NODE_STATUS, \ STATUS_UP_TO_DATE, STATUS_UNINITIALIZED, \ - STATUS_UPDATE_FAILED, \ + STATUS_UPDATE_FAILED, STATUS_WAITING_FOR_SSH, \ NODE_KIND_HEAD, NODE_TYPE_LEGACY_WORKER, \ NODE_TYPE_LEGACY_HEAD from ray.test_utils import same_elements @@ -1419,7 +1419,8 @@ def testSummary(self): assert summary.active_nodes["empty_node"] == 1 assert len(summary.active_nodes) == 2, summary.active_nodes - assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge")] + assert summary.pending_nodes == [("172.0.0.3", "p2.xlarge", + STATUS_WAITING_FOR_SSH)] assert summary.pending_launches == {"m4.16xlarge": 2} assert summary.failed_nodes == [("172.0.0.4", "m4.4xlarge")] @@ -2403,7 +2404,8 @@ def test_info_string(): "p3.2xlarge": 2, "m4.4xlarge": 20 }, - pending_nodes=[("1.2.3.4", "m4.4xlarge"), ("1.2.3.5", "m4.4xlarge")], + 
pending_nodes=[("1.2.3.4", "m4.4xlarge", STATUS_WAITING_FOR_SSH), + ("1.2.3.5", "m4.4xlarge", STATUS_WAITING_FOR_SSH)], pending_launches={"m4.4xlarge": 2}, failed_nodes=[("1.2.3.6", "p3.2xlarge")]) @@ -2416,8 +2418,8 @@ def test_info_string(): 20 m4.4xlarge Pending: m4.4xlarge, 2 launching - 1.2.3.4: m4.4xlarge, setting up - 1.2.3.5: m4.4xlarge, setting up + 1.2.3.4: m4.4xlarge, waiting-for-ssh + 1.2.3.5: m4.4xlarge, waiting-for-ssh Recent failures: (no failures) From daf0bef2858441e3d2da953d9a76400b6ce7a77d Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 21 Jan 2021 16:30:26 +0100 Subject: [PATCH 002/245] [RLlib] Dreamer: Fix broken import and add compilation test case. (#13553) --- rllib/BUILD | 23 ++++++++---- rllib/agents/dreamer/dreamer.py | 2 ++ rllib/agents/dreamer/dreamer_model.py | 2 +- rllib/agents/dreamer/tests/test_dreamer.py | 41 ++++++++++++++++++++++ rllib/env/wrappers/dm_control_wrapper.py | 2 +- 5 files changed, 61 insertions(+), 9 deletions(-) create mode 100644 rllib/agents/dreamer/tests/test_dreamer.py diff --git a/rllib/BUILD b/rllib/BUILD index daa623dff843..f8f1cbd3c6f8 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -436,13 +436,13 @@ py_test( srcs = ["agents/a3c/tests/test_a3c.py"] ) -## APEXTrainer (DQN) -#py_test( -# name = "test_apex_dqn", -# tags = ["agents_dir"], -# size = "large", -# srcs = ["agents/dqn/tests/test_apex_dqn.py"] -#) +# APEXTrainer (DQN) +py_test( + name = "test_apex_dqn", + tags = ["agents_dir"], + size = "medium", + srcs = ["agents/dqn/tests/test_apex_dqn.py"] +) # APEXDDPGTrainer py_test( @@ -482,6 +482,15 @@ py_test( srcs = ["agents/dqn/tests/test_simple_q.py"] ) +# TODO: enable once we have a MuJoCo-independent test case. 
+## Dreamer +#py_test( +# name = "test_dreamer", +# tags = ["agents_dir"], +# size = "small", +# srcs = ["agents/dreamer/tests/test_dreamer.py"] +#) + # ES py_test( name = "test_es", diff --git a/rllib/agents/dreamer/dreamer.py b/rllib/agents/dreamer/dreamer.py index 94774d9fec91..21646d61871d 100644 --- a/rllib/agents/dreamer/dreamer.py +++ b/rllib/agents/dreamer/dreamer.py @@ -31,6 +31,8 @@ "discount": 0.99, # Lambda "lambda": 0.95, + # Clipping is done inherently via policy tanh. + "clip_actions": False, # Training iterations per data collection from real env "dreamer_train_iters": 100, # Horizon for Enviornment (1000 for Mujoco/DMC) diff --git a/rllib/agents/dreamer/dreamer_model.py b/rllib/agents/dreamer/dreamer_model.py index 5483f664f839..f2db417e512b 100644 --- a/rllib/agents/dreamer/dreamer_model.py +++ b/rllib/agents/dreamer/dreamer_model.py @@ -1,6 +1,6 @@ import numpy as np from typing import Any, List, Tuple -from ray.rllib.models.torch.modules.reshape import Reshape +from ray.rllib.models.torch.misc import Reshape from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.framework import TensorType diff --git a/rllib/agents/dreamer/tests/test_dreamer.py b/rllib/agents/dreamer/tests/test_dreamer.py new file mode 100644 index 000000000000..2b318866ca48 --- /dev/null +++ b/rllib/agents/dreamer/tests/test_dreamer.py @@ -0,0 +1,41 @@ +import unittest + +import ray +from ray import tune +import ray.rllib.agents.dreamer as dreamer +from ray.rllib.examples.env.dm_control_suite import hopper_hop +from ray.rllib.utils.test_utils import check_compute_single_action, \ + framework_iterator + + +class TestDreamer(unittest.TestCase): + """Sanity tests for DreamerTrainer.""" + + def setUp(self): + ray.init() + + def tearDown(self): + ray.shutdown() + + def test_dreamer_compilation(self): + """Test whether an DreamerTrainer can be built with all frameworks.""" + config = 
dreamer.DEFAULT_CONFIG.copy() + tune.register_env("dm_control_hopper_hop", lambda _: hopper_hop()) + + num_iterations = 1 + + # Test against all frameworks. + for _ in framework_iterator(config, frameworks="torch"): + for env in ["dm_control_hopper_hop"]: + trainer = dreamer.DREAMERTrainer(config=config, env=env) + for i in range(num_iterations): + results = trainer.train() + print(results) + check_compute_single_action(trainer) + trainer.stop() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/dm_control_wrapper.py b/rllib/env/wrappers/dm_control_wrapper.py index 6734e2a3ab66..3286aae28adf 100644 --- a/rllib/env/wrappers/dm_control_wrapper.py +++ b/rllib/env/wrappers/dm_control_wrapper.py @@ -31,7 +31,7 @@ specs = None try: from dm_control import suite -except ImportError: +except (ImportError, OSError): suite = None import numpy as np From d11e62f9e61a2eb2c5ce9c8d437b3d0d9cae6511 Mon Sep 17 00:00:00 2001 From: Saeid Date: Thu, 21 Jan 2021 15:36:11 +0000 Subject: [PATCH 003/245] [RLlib] Fix problem in preprocessing nested MultiDiscrete (#13308) --- rllib/models/preprocessors.py | 2 +- rllib/models/tests/test_preprocessors.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/rllib/models/preprocessors.py b/rllib/models/preprocessors.py index 2b0bcb092062..44312a807432 100644 --- a/rllib/models/preprocessors.py +++ b/rllib/models/preprocessors.py @@ -174,7 +174,7 @@ def transform(self, observation: TensorType) -> np.ndarray: @override(Preprocessor) def write(self, observation: TensorType, array: np.ndarray, offset: int) -> None: - array[offset + observation] = 1 + array[offset:offset + self.size] = self.transform(observation) class NoPreprocessor(Preprocessor): diff --git a/rllib/models/tests/test_preprocessors.py b/rllib/models/tests/test_preprocessors.py index 5515b6fea6b1..4ce7b73e7e74 100644 --- a/rllib/models/tests/test_preprocessors.py +++ 
b/rllib/models/tests/test_preprocessors.py @@ -71,6 +71,17 @@ def test_one_hot_preprocessor(self): pp.transform(np.array([0, 1, 3])), [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) + def test_nested_multidiscrete_one_hot_preprocessor(self): + space = Tuple((MultiDiscrete([2, 3, 4]), )) + pp = get_preprocessor(space)(space) + self.assertTrue(pp.shape == (9, )) + check( + pp.transform((np.array([1, 2, 0]), )), + [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + check( + pp.transform((np.array([0, 1, 3]), )), + [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]) + if __name__ == "__main__": import pytest From 587f207c2fadc02c25ddf1dedbca4cbaf3163d48 Mon Sep 17 00:00:00 2001 From: Michael Luo Date: Thu, 21 Jan 2021 07:43:55 -0800 Subject: [PATCH 004/245] [RLlib] Support for D4RL + Semi-working CQL Benchmark (#13550) --- rllib/agents/cql/cql.py | 2 + rllib/evaluation/worker_set.py | 5 +- rllib/offline/__init__.py | 2 + rllib/offline/d4rl_reader.py | 52 +++++++++++++++++++ rllib/tuned_examples/cql/halfcheetah-cql.yaml | 1 + 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 rllib/offline/d4rl_reader.py diff --git a/rllib/agents/cql/cql.py b/rllib/agents/cql/cql.py index 04a63be72751..30bbe89d4553 100644 --- a/rllib/agents/cql/cql.py +++ b/rllib/agents/cql/cql.py @@ -15,6 +15,8 @@ SAC_CONFIG, { # You should override this to point to an offline dataset. 
"input": "sampler", + # Offline RL does not need IS estimators + "input_evaluation": [], # Number of iterations with Behavior Cloning Pretraining "bc_iters": 20000, # CQL Loss Temperature diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 80cf617bb029..8361e0af8777 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \ _validate_multiagent_config from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ - ShuffledInput + ShuffledInput, D4RLReader from ray.rllib.env.env_context import EnvContext from ray.rllib.policy import Policy from ray.rllib.utils import merge_dicts @@ -266,6 +266,9 @@ def session_creator(): input_creator = ( lambda ioctx: ShuffledInput(MixedInput(config["input"], ioctx), config["shuffle_buffer_size"])) + elif "d4rl" in config["input"]: + env_name = config["input"].split(".")[1] + input_creator = (lambda ioctx: D4RLReader(env_name, ioctx)) else: input_creator = ( lambda ioctx: ShuffledInput(JsonReader(config["input"], ioctx), diff --git a/rllib/offline/__init__.py b/rllib/offline/__init__.py index 69b07c657006..540151cc2d4d 100644 --- a/rllib/offline/__init__.py +++ b/rllib/offline/__init__.py @@ -5,6 +5,7 @@ from ray.rllib.offline.input_reader import InputReader from ray.rllib.offline.mixed_input import MixedInput from ray.rllib.offline.shuffled_input import ShuffledInput +from ray.rllib.offline.d4rl_reader import D4RLReader __all__ = [ "IOContext", @@ -15,4 +16,5 @@ "InputReader", "MixedInput", "ShuffledInput", + "D4RLReader", ] diff --git a/rllib/offline/d4rl_reader.py b/rllib/offline/d4rl_reader.py new file mode 100644 index 000000000000..2c02af08868c --- /dev/null +++ b/rllib/offline/d4rl_reader.py @@ -0,0 +1,52 @@ +import logging +import gym + +from ray.rllib.offline.input_reader import InputReader +from ray.rllib.offline.io_context import IOContext +from 
ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils.annotations import override, PublicAPI +from ray.rllib.utils.typing import SampleBatchType +from typing import Dict + +logger = logging.getLogger(__name__) + + +@PublicAPI +class D4RLReader(InputReader): + """Reader object that loads the dataset from the D4RL dataset.""" + + @PublicAPI + def __init__(self, inputs: str, ioctx: IOContext = None): + """Initialize a D4RLReader. + + Args: + inputs (str): String corresponding to D4RL environment name + ioctx (IOContext): Current IO context object. + """ + import d4rl + self.env = gym.make(inputs) + self.dataset = convert_to_batch(d4rl.qlearning_dataset(self.env)) + assert self.dataset.count >= 1 + self.dataset.shuffle() + self.counter = 0 + + @override(InputReader) + def next(self) -> SampleBatchType: + if self.counter >= self.dataset.count: + self.counter = 0 + self.dataset.shuffle() + + self.counter += 1 + return self.dataset.slice(start=self.counter, end=self.counter + 1) + + +def convert_to_batch(dataset: Dict) -> SampleBatchType: + # Converts D4RL dataset to SampleBatch + d = {} + d[SampleBatch.OBS] = dataset["observations"] + d[SampleBatch.ACTIONS] = dataset["actions"] + d[SampleBatch.NEXT_OBS] = dataset["next_observations"] + d[SampleBatch.REWARDS] = dataset["rewards"] + d[SampleBatch.DONES] = dataset["terminals"] + + return SampleBatch(d) diff --git a/rllib/tuned_examples/cql/halfcheetah-cql.yaml b/rllib/tuned_examples/cql/halfcheetah-cql.yaml index 5bab20751c53..9a5fa9982875 100644 --- a/rllib/tuned_examples/cql/halfcheetah-cql.yaml +++ b/rllib/tuned_examples/cql/halfcheetah-cql.yaml @@ -5,6 +5,7 @@ halfcheetah_cql: episode_reward_mean: 9000 config: # SAC Configs + input: d4rl.halfcheetah-medium-v0 framework: torch horizon: 1000 soft_horizon: false From 92f1e0902ed4700fa6bf2ac7d3e781fa1a42f831 Mon Sep 17 00:00:00 2001 From: Kai Yang Date: Thu, 21 Jan 2021 23:57:20 +0800 Subject: [PATCH 005/245] [Java] Fix return of java doc (#13601) --- 
java/api/src/main/java/io/ray/api/Ray.java | 50 +++++++++++-------- .../java/io/ray/api/call/ActorCreator.java | 5 +- .../java/io/ray/api/call/ActorTaskCaller.java | 2 +- .../io/ray/api/call/BaseActorCreator.java | 21 +++++--- .../java/io/ray/api/call/BaseTaskCaller.java | 9 ++-- .../java/io/ray/api/call/PyActorCreator.java | 2 +- .../io/ray/api/call/PyActorTaskCaller.java | 2 +- .../java/io/ray/api/call/PyTaskCaller.java | 2 +- .../main/java/io/ray/api/call/TaskCaller.java | 2 +- .../io/ray/api/function/PyActorClass.java | 3 +- .../io/ray/api/function/PyActorMethod.java | 6 ++- .../java/io/ray/api/function/PyFunction.java | 6 ++- .../src/main/java/io/ray/api/id/BaseId.java | 2 +- .../ray/api/options/ActorCreationOptions.java | 25 ++++++---- .../java/io/ray/api/options/CallOptions.java | 9 ++-- .../java/io/ray/api/runtime/RayRuntime.java | 50 ++++++++++++------- .../api/runtimecontext/RuntimeContext.java | 2 +- .../ray/runtime/actor/NativeActorHandle.java | 4 +- .../functionmanager/FunctionManager.java | 6 ++- .../java/io/ray/runtime/gcs/GcsClient.java | 5 +- .../java/io/ray/runtime/gcs/RedisClient.java | 2 +- .../java/io/ray/runtime/metric/Metric.java | 2 +- .../java/io/ray/runtime/metric/Metrics.java | 2 +- .../ray/runtime/object/ObjectSerializer.java | 6 ++- .../io/ray/runtime/object/ObjectStore.java | 24 +++++---- .../placementgroup/PlacementGroupImpl.java | 19 ++++--- .../placementgroup/PlacementGroupUtils.java | 8 +-- .../io/ray/runtime/task/TaskSubmitter.java | 16 +++--- .../io/ray/runtime/util/BinaryFileUtil.java | 3 +- .../main/java/io/ray/runtime/util/IdUtil.java | 2 +- .../io/ray/runtime/util/ResourceUtil.java | 9 ++-- .../ray/streaming/api/function/Function.java | 2 +- .../api/function/impl/FilterFunction.java | 4 +- .../streaming/api/partition/Partition.java | 4 +- .../ray/streaming/api/stream/DataStream.java | 26 ++++++---- .../api/stream/DataStreamSource.java | 3 +- .../streaming/api/stream/KeyDataStream.java | 6 ++- 
.../io/ray/streaming/jobgraph/JobGraph.java | 2 +- .../python/stream/PythonDataStream.java | 28 +++++++---- .../python/stream/PythonKeyDataStream.java | 3 +- .../runtime/config/global/CommonConfig.java | 4 +- .../config/master/SchedulerConfig.java | 4 +- .../runtime/context/ContextBackend.java | 5 +- .../graph/executiongraph/ExecutionGraph.java | 30 ++++++----- .../executiongraph/ExecutionJobVertex.java | 2 +- .../runtime/core/resource/Resources.java | 2 +- .../streaming/runtime/master/JobMaster.java | 5 +- .../master/graphmanager/GraphManager.java | 7 +-- .../resourcemanager/ResourceManager.java | 2 +- .../strategy/ResourceAssignStrategy.java | 3 +- .../strategy/impl/PipelineFirstStrategy.java | 16 +++--- .../master/scheduler/JobScheduler.java | 3 +- .../master/scheduler/JobSchedulerImpl.java | 6 ++- .../controller/WorkerLifecycleController.java | 12 +++-- .../runtime/rpc/RemoteCallWorker.java | 9 ++-- .../runtime/transfer/DataReader.java | 3 +- .../runtime/transfer/channel/ChannelId.java | 9 ++-- .../ray/streaming/runtime/util/EnvUtil.java | 2 +- .../ray/streaming/runtime/util/Platform.java | 5 +- .../ray/streaming/runtime/util/RayUtils.java | 4 +- .../runtime/util/ReflectionUtils.java | 2 +- .../streaming/runtime/util/ResourceUtil.java | 31 ++++++------ .../streaming/runtime/worker/JobWorker.java | 4 +- .../streaming/runtime/util/Mockitools.java | 4 +- .../state/keystate/KeyGroupAssignment.java | 4 +- .../state/keystate/state/MapState.java | 15 +++--- .../state/keystate/state/UnaryState.java | 2 +- 67 files changed, 350 insertions(+), 229 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/Ray.java b/java/api/src/main/java/io/ray/api/Ray.java index 02ffc59c85e8..da9047a66075 100644 --- a/java/api/src/main/java/io/ray/api/Ray.java +++ b/java/api/src/main/java/io/ray/api/Ray.java @@ -51,7 +51,7 @@ public static synchronized void shutdown() { /** * Check if {@link #init} has been called yet. * - *

Returns True if {@link #init} has already been called and false otherwise. + * @return True if {@link #init} has already been called and false otherwise. */ public static boolean isInitialized() { return runtime != null; @@ -60,8 +60,8 @@ public static boolean isInitialized() { /** * Store an object in the object store. * - * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the - * in-store object. + * @param obj The Java object to be stored. + * @return A ObjectRef instance that represents the in-store object. */ public static ObjectRef put(T obj) { return internal().put(obj); @@ -70,7 +70,8 @@ public static ObjectRef put(T obj) { /** * Get an object by `ObjectRef` from the object store. * - * @param objectRef The reference of the object to get. Returns The Java object. + * @param objectRef The reference of the object to get. + * @return The Java object. */ public static T get(ObjectRef objectRef) { return internal().get(objectRef); @@ -79,7 +80,8 @@ public static T get(ObjectRef objectRef) { /** * Get a list of objects by `ObjectRef`s from the object store. * - * @param objectList A list of object references. Returns A list of Java objects. + * @param objectList A list of object references. + * @return A list of Java objects. */ public static List get(List> objectList) { return internal().get(objectList); @@ -91,8 +93,8 @@ public static List get(List> objectList) { * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, - * one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @return Two lists, one containing locally available objects, one containing the rest. 
*/ public static WaitResult wait(List> waitList, int numReturns, int timeoutMs) { return internal().wait(waitList, numReturns, timeoutMs); @@ -103,8 +105,8 @@ public static WaitResult wait(List> waitList, int numReturns * objects are locally available. * * @param waitList A list of object references to wait for. - * @param numReturns The number of objects that should be returned. Returns Two lists, one - * containing locally available objects, one containing the rest. + * @param numReturns The number of objects that should be returned. + * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns) { return internal().wait(waitList, numReturns, Integer.MAX_VALUE); @@ -114,8 +116,8 @@ public static WaitResult wait(List> waitList, int numReturns * A convenient helper method for Ray.wait. It will wait infinitely until all objects are locally * available. * - * @param waitList A list of object references to wait for. Returns Two lists, one containing - * locally available objects, one containing the rest. + * @param waitList A list of object references to wait for. + * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList) { return internal().wait(waitList, waitList.size(), Integer.MAX_VALUE); @@ -127,8 +129,9 @@ public static WaitResult wait(List> waitList) { *

Gets a handle to a named actor with the given name. The actor must have been created with * name specified. * - * @param name The name of the named actor. Returns an ActorHandle to the actor if the actor of - * specified name exists or an Optional.empty() + * @param name The name of the named actor. + * @return an ActorHandle to the actor if the actor of specified name exists or an + * Optional.empty() */ public static Optional getActor(String name) { return internal().getActor(name, false); @@ -140,8 +143,9 @@ public static Optional getActor(String name) { *

Gets a handle to a global named actor with the given name. The actor must have been created * with global name specified. * - * @param name The global name of the named actor. Returns an ActorHandle to the actor if the - * actor of specified name exists or an Optional.empty() + * @param name The global name of the named actor. + * @return an ActorHandle to the actor if the actor of specified name exists or an + * Optional.empty() */ public static Optional getGlobalActor(String name) { return internal().getActor(name, true); @@ -151,7 +155,7 @@ public static Optional getGlobalActor(String name * If users want to use Ray API in their own threads, call this method to get the async context * and then call {@link #setAsyncContext} at the beginning of the new thread. * - *

Returns The async context. + * @return The async context. */ public static Object getAsyncContext() { return internal().getAsyncContext(); @@ -175,7 +179,8 @@ public static void setAsyncContext(Object asyncContext) { * If users want to use Ray API in their own threads, they should wrap their {@link Runnable} * objects with this method. * - * @param runnable The runnable to wrap. Returns The wrapped runnable. + * @param runnable The runnable to wrap. + * @return The wrapped runnable. */ public static Runnable wrapRunnable(Runnable runnable) { return internal().wrapRunnable(runnable); @@ -185,7 +190,8 @@ public static Runnable wrapRunnable(Runnable runnable) { * If users want to use Ray API in their own threads, they should wrap their {@link Callable} * objects with this method. * - * @param callable The callable to wrap. Returns The wrapped callable. + * @param callable The callable to wrap. + * @return The wrapped callable. */ public static Callable wrapCallable(Callable callable) { return internal().wrapCallable(callable); @@ -238,7 +244,8 @@ public static RuntimeContext getRuntimeContext() { * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. Returns A handle to the created placement group. + * @param strategy Actor placement strategy. + * @return A handle to the created placement group. */ public static PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy) { @@ -265,7 +272,8 @@ public static void exitActor() { /** * Get a placement group by placement group Id. * - * @param id placement group id. Returns The placement group. + * @param id placement group id. + * @return The placement group. */ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { return internal().getPlacementGroup(id); @@ -274,7 +282,7 @@ public static PlacementGroup getPlacementGroup(PlacementGroupId id) { /** * Get all placement groups in this cluster. 
* - *

Returns All placement groups. + * @return All placement groups. */ public static List getAllPlacementGroups() { return internal().getAllPlacementGroups(); diff --git a/java/api/src/main/java/io/ray/api/call/ActorCreator.java b/java/api/src/main/java/io/ray/api/call/ActorCreator.java index c6bb9cce8ea7..b64a4fbcd0e5 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/ActorCreator.java @@ -23,7 +23,8 @@ public ActorCreator(RayFuncR func, Object[] args) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self + * @param jvmOptions JVM options for the Java worker that this actor is running in. + * @return self * @see io.ray.api.options.ActorCreationOptions.Builder#setJvmOptions(java.lang.String) */ public ActorCreator setJvmOptions(String jvmOptions) { @@ -34,7 +35,7 @@ public ActorCreator setJvmOptions(String jvmOptions) { /** * Create a java actor remotely and return a handle to the created actor. * - *

Returns a handle to the created java actor. + * @return a handle to the created java actor. */ public ActorHandle remote() { return Ray.internal().createActor(func, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java index 4b9d25a21478..4579acbb876d 100644 --- a/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/ActorTaskCaller.java @@ -25,7 +25,7 @@ public ActorTaskCaller(ActorHandle actor, RayFuncR func, Object[] args) { * Execute an java actor method remotely and return an object reference to the result object in * the object store. * - *

Returns an object reference to an object in the object store. + * @return an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java index 5f488124b16c..7e761b4c2859 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/BaseActorCreator.java @@ -18,7 +18,8 @@ public class BaseActorCreator { * name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that is * accessible from all jobs, use {@link BaseActorCreator#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. Returns self + * @param name The name of the named actor. + * @return self * @see io.ray.api.options.ActorCreationOptions.Builder#setName(String) */ public T setName(String name) { @@ -31,7 +32,8 @@ public T setName(String name) { * Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is only * accessible from this job, use {@link BaseActorCreator#setName(java.lang.String)} instead. * - * @param name The name of the named actor. Returns self + * @param name The name of the named actor. + * @return self * @see io.ray.api.options.ActorCreationOptions.Builder#setGlobalName(String) */ public T setGlobalName(String name) { @@ -45,7 +47,8 @@ public T setGlobalName(String name) { * used. * * @param resourceName resource name - * @param resourceQuantity resource quantity Returns self + * @param resourceQuantity resource quantity + * @return self * @see ActorCreationOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String resourceName, Double resourceQuantity) { @@ -58,7 +61,8 @@ public T setResource(String resourceName, Double resourceQuantity) { * called multiple times. 
If the same resource is set multiple times, the latest quantity will be * used. * - * @param resources requirements for multiple resources. Returns self + * @param resources requirements for multiple resources. + * @return self * @see BaseActorCreator#setResources(java.util.Map) */ public T setResources(Map resources) { @@ -71,7 +75,8 @@ public T setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts Returns self + * @param maxRestarts max number of actor restarts + * @return self * @see ActorCreationOptions.Builder#setMaxRestarts(int) */ public T setMaxRestarts(int maxRestarts) { @@ -85,7 +90,8 @@ public T setMaxRestarts(int maxRestarts) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. + * @return self * @see ActorCreationOptions.Builder#setMaxConcurrency(int) */ public T setMaxConcurrency(int maxConcurrency) { @@ -97,7 +103,8 @@ public T setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. Returns self + * @param bundleIndex The index of the bundle to place this actor in. + * @return self * @see ActorCreationOptions.Builder#setPlacementGroup(PlacementGroup, int) */ public T setPlacementGroup(PlacementGroup group, int bundleIndex) { diff --git a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java index 8b683c7bdf55..88c58e05350f 100644 --- a/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/BaseTaskCaller.java @@ -14,7 +14,8 @@ public class BaseTaskCaller> { /** * Set a name for this task. * - * @param name task name Returns self + * @param name task name + * @return self * @see CallOptions.Builder#setName(java.lang.String) */ public T setName(String name) { @@ -27,7 +28,8 @@ public T setName(String name) { * times. If the same resource is set multiple times, the latest quantity will be used. 
* * @param name resource name - * @param value resource capacity Returns self + * @param value resource capacity + * @return self * @see CallOptions.Builder#setResource(java.lang.String, java.lang.Double) */ public T setResource(String name, Double value) { @@ -39,7 +41,8 @@ public T setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. Returns self + * @param resources requirements for multiple resources. + * @return self * @see CallOptions.Builder#setResources(java.util.Map) */ public T setResources(Map resources) { diff --git a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java index 5add65346c73..fb87a1eac7da 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorCreator.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorCreator.java @@ -17,7 +17,7 @@ public PyActorCreator(PyActorClass pyActorClass, Object[] args) { /** * Create a python actor remotely and return a handle to the created actor. * - *

Returns a handle to the created python actor. + * @return a handle to the created python actor. */ public PyActorHandle remote() { return Ray.internal().createActor(pyActorClass, args, buildOptions()); diff --git a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java index c9444548f407..7ee7d8a13c92 100644 --- a/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyActorTaskCaller.java @@ -25,7 +25,7 @@ public PyActorTaskCaller(PyActorHandle actor, PyActorMethod method, Object[] * Execute a python actor method remotely and return an object reference to the result object in * the object store. * - *

Returns an object reference to an object in the object store. + * @return an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java index 8d58e9b300a8..ecd7aa3c8987 100644 --- a/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/PyTaskCaller.java @@ -22,7 +22,7 @@ public PyTaskCaller(PyFunction func, Object[] args) { * Execute a python function remotely and return an object reference to the result object in the * object store. * - *

Returns an object reference to an object in the object store. + * @return an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/call/TaskCaller.java b/java/api/src/main/java/io/ray/api/call/TaskCaller.java index 82f72d63e6cd..80dacec2dfdc 100644 --- a/java/api/src/main/java/io/ray/api/call/TaskCaller.java +++ b/java/api/src/main/java/io/ray/api/call/TaskCaller.java @@ -22,7 +22,7 @@ public TaskCaller(RayFuncR func, Object[] args) { * Execute a java function remotely and return an object reference to the result object in the * object store. * - *

Returns an object reference to an object in the object store. + * @return an object reference to an object in the object store. */ @SuppressWarnings("unchecked") public ObjectRef remote() { diff --git a/java/api/src/main/java/io/ray/api/function/PyActorClass.java b/java/api/src/main/java/io/ray/api/function/PyActorClass.java index c753e1f27b72..d76385919b9b 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorClass.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorClass.java @@ -38,7 +38,8 @@ private PyActorClass(String moduleName, String className) { * Create a python actor class. * * @param moduleName The full module name of this actor class - * @param className The name of this actor class Returns a python actor class + * @param className The name of this actor class + * @return a python actor class */ public static PyActorClass of(String moduleName, String className) { return new PyActorClass(moduleName, className); diff --git a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java index f91b0c9f9c10..6f24b5d11a3c 100644 --- a/java/api/src/main/java/io/ray/api/function/PyActorMethod.java +++ b/java/api/src/main/java/io/ray/api/function/PyActorMethod.java @@ -43,7 +43,8 @@ private PyActorMethod(String methodName, Class returnType) { /** * Create a python actor method. * - * @param methodName The name of this actor method Returns a python actor method. + * @param methodName The name of this actor method + * @return a python actor method. */ public static PyActorMethod of(String methodName) { return of(methodName, Object.class); @@ -54,7 +55,8 @@ public static PyActorMethod of(String methodName) { * * @param methodName The name of this actor method * @param returnType Class of the return value of this actor method - * @param The type of the return value of this actor method Returns a python actor method. 
+ * @param The type of the return value of this actor method + * @return a python actor method. */ public static PyActorMethod of(String methodName, Class returnType) { return new PyActorMethod<>(methodName, returnType); diff --git a/java/api/src/main/java/io/ray/api/function/PyFunction.java b/java/api/src/main/java/io/ray/api/function/PyFunction.java index 119bba4e5be2..2119b0bbf310 100644 --- a/java/api/src/main/java/io/ray/api/function/PyFunction.java +++ b/java/api/src/main/java/io/ray/api/function/PyFunction.java @@ -49,7 +49,8 @@ private PyFunction(String moduleName, String functionName, Class returnType) * Create a python function. * * @param moduleName The full module name of this function - * @param functionName The name of this function Returns a python function. + * @param functionName The name of this function + * @return a python function. */ public static PyFunction of(String moduleName, String functionName) { return of(moduleName, functionName, Object.class); @@ -61,7 +62,8 @@ public static PyFunction of(String moduleName, String functionName) { * @param moduleName The full module name of this function * @param functionName The name of this function * @param returnType Class of the return value of this function - * @param Type of the return value of this function Returns a python function. + * @param Type of the return value of this function + * @return a python function. */ public static PyFunction of(String moduleName, String functionName, Class returnType) { return new PyFunction<>(moduleName, functionName, returnType); diff --git a/java/api/src/main/java/io/ray/api/id/BaseId.java b/java/api/src/main/java/io/ray/api/id/BaseId.java index 573f549b2fa3..ee91a77d63c4 100644 --- a/java/api/src/main/java/io/ray/api/id/BaseId.java +++ b/java/api/src/main/java/io/ray/api/id/BaseId.java @@ -52,7 +52,7 @@ public boolean isNil() { /** * Derived class should implement this function. * - *

Returns The length of this id in bytes. + * @return The length of this id in bytes. */ public abstract int size(); diff --git a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java index 29a13c115052..303239735586 100644 --- a/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java +++ b/java/api/src/main/java/io/ray/api/options/ActorCreationOptions.java @@ -50,7 +50,8 @@ public static class Builder { * this name via {@link Ray#getActor(java.lang.String)}. If you want create a named actor that * is accessible from all jobs, use {@link Builder#setGlobalName(java.lang.String)} instead. * - * @param name The name of the named actor. Returns self + * @param name The name of the named actor. + * @return self */ public Builder setName(String name) { this.name = name; @@ -63,7 +64,8 @@ public Builder setName(String name) { * {@link Ray#getGlobalActor(java.lang.String)}. If you want to create a named actor that is * only accessible from this job, use {@link Builder#setName(java.lang.String)} instead. * - * @param name The name of the named actor. Returns self + * @param name The name of the named actor. + * @return self */ public Builder setGlobalName(String name) { this.name = name; @@ -77,7 +79,8 @@ public Builder setGlobalName(String name) { * will be used. * * @param resourceName resource name - * @param resourceQuantity resource quantity Returns self + * @param resourceQuantity resource quantity + * @return self */ public Builder setResource(String resourceName, Double resourceQuantity) { this.resources.put(resourceName, resourceQuantity); @@ -89,7 +92,8 @@ public Builder setResource(String resourceName, Double resourceQuantity) { * be called multiple times. If the same resource is set multiple times, the latest quantity * will be used. * - * @param resources requirements for multiple resources. Returns self + * @param resources requirements for multiple resources. 
+ * @return self */ public Builder setResources(Map resources) { this.resources.putAll(resources); @@ -101,7 +105,8 @@ public Builder setResources(Map resources) { * unexpectedly. The minimum valid value is 0 (default), which indicates that the actor doesn't * need to be restarted. A value of -1 indicates that an actor should be restarted indefinitely. * - * @param maxRestarts max number of actor restarts Returns self + * @param maxRestarts max number of actor restarts + * @return self */ public Builder setMaxRestarts(int maxRestarts) { this.maxRestarts = maxRestarts; @@ -113,7 +118,8 @@ public Builder setMaxRestarts(int maxRestarts) { * *

Note, if this is set, this actor won't share Java worker with other actors or tasks. * - * @param jvmOptions JVM options for the Java worker that this actor is running in. Returns self + * @param jvmOptions JVM options for the Java worker that this actor is running in. + * @return self */ public Builder setJvmOptions(String jvmOptions) { this.jvmOptions = jvmOptions; @@ -126,8 +132,8 @@ public Builder setJvmOptions(String jvmOptions) { *

The max concurrency defaults to 1 for threaded execution. Note that the execution order is * not guaranteed when {@code max_concurrency > 1}. * - * @param maxConcurrency The max number of concurrent calls to allow for this actor. Returns - * self + * @param maxConcurrency The max number of concurrent calls to allow for this actor. + * @return self */ public Builder setMaxConcurrency(int maxConcurrency) { if (maxConcurrency <= 0) { @@ -142,7 +148,8 @@ public Builder setMaxConcurrency(int maxConcurrency) { * Set the placement group to place this actor in. * * @param group The placement group of the actor. - * @param bundleIndex The index of the bundle to place this actor in. Returns self + * @param bundleIndex The index of the bundle to place this actor in. + * @return self */ public Builder setPlacementGroup(PlacementGroup group, int bundleIndex) { this.group = group; diff --git a/java/api/src/main/java/io/ray/api/options/CallOptions.java b/java/api/src/main/java/io/ray/api/options/CallOptions.java index 233c30aa3fe2..37e474d55a33 100644 --- a/java/api/src/main/java/io/ray/api/options/CallOptions.java +++ b/java/api/src/main/java/io/ray/api/options/CallOptions.java @@ -22,7 +22,8 @@ public static class Builder { /** * Set a name for this task. * - * @param name task name Returns self + * @param name task name + * @return self */ public Builder setName(String name) { this.name = name; @@ -34,7 +35,8 @@ public Builder setName(String name) { * multiple times. If the same resource is set multiple times, the latest quantity will be used. * * @param name resource name - * @param value resource capacity Returns self + * @param value resource capacity + * @return self */ public Builder setResource(String name, Double value) { this.resources.put(name, value); @@ -45,7 +47,8 @@ public Builder setResource(String name, Double value) { * Set custom requirements for multiple resources. This method can be called multiple times. 
If * the same resource is set multiple times, the latest quantity will be used. * - * @param resources requirements for multiple resources. Returns self + * @param resources requirements for multiple resources. + * @return self */ public Builder setResources(Map resources) { this.resources.putAll(resources); diff --git a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java index 2f3eeb2a7160..53da3d48dae8 100644 --- a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java +++ b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java @@ -31,22 +31,24 @@ public interface RayRuntime { /** * Store an object in the object store. * - * @param obj The Java object to be stored. Returns A ObjectRef instance that represents the - * in-store object. + * @param obj The Java object to be stored. + * @return A ObjectRef instance that represents the in-store object. */ ObjectRef put(T obj); /** * Get an object from the object store. * - * @param objectRef The reference of the object to get. Returns The Java object. + * @param objectRef The reference of the object to get. + * @return The Java object. */ T get(ObjectRef objectRef); /** * Get a list of objects from the object store. * - * @param objectRefs The list of object references. Returns A list of Java objects. + * @param objectRefs The list of object references. + * @return A list of Java objects. */ List get(List> objectRefs); @@ -56,8 +58,8 @@ public interface RayRuntime { * * @param waitList A list of ObjectRef to wait for. * @param numReturns The number of objects that should be returned. - * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, - * one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @return Two lists, one containing locally available objects, one containing the rest. 
*/ WaitResult wait(List> waitList, int numReturns, int timeoutMs); @@ -87,7 +89,8 @@ public interface RayRuntime { * name specified. * * @param name The name of the named actor. - * @param global Whether the named actor is global. Returns ActorHandle to the actor. + * @param global Whether the named actor is global. + * @return ActorHandle to the actor. */ Optional getActor(String name, boolean global); @@ -104,7 +107,8 @@ public interface RayRuntime { * * @param func The remote function to run. * @param args The arguments of the remote function. - * @param options The options for this call. Returns The result object. + * @param options The options for this call. + * @return The result object. */ ObjectRef call(RayFunc func, Object[] args, CallOptions options); @@ -113,7 +117,8 @@ public interface RayRuntime { * * @param pyFunction The Python function. * @param args Arguments of the function. - * @param options The options for this call. Returns The result object. + * @param options The options for this call. + * @return The result object. */ ObjectRef call(PyFunction pyFunction, Object[] args, CallOptions options); @@ -122,7 +127,8 @@ public interface RayRuntime { * * @param actor A handle to the actor. * @param func The remote function to run, it must be a method of the given actor. - * @param args The arguments of the remote function. Returns The result object. + * @param args The arguments of the remote function. + * @return The result object. */ ObjectRef callActor(ActorHandle actor, RayFunc func, Object[] args); @@ -131,7 +137,8 @@ public interface RayRuntime { * * @param pyActor A handle to the actor. * @param pyActorMethod The actor method. - * @param args Arguments of the function. Returns The result object. + * @param args Arguments of the function. + * @return The result object. 
*/ ObjectRef callActor(PyActorHandle pyActor, PyActorMethod pyActorMethod, Object[] args); @@ -141,7 +148,8 @@ public interface RayRuntime { * @param actorFactoryFunc A remote function whose return value is the actor object. * @param args The arguments for the remote function. * @param The type of the actor object. - * @param options The options for creating actor. Returns A handle to the actor. + * @param options The options for creating actor. + * @return A handle to the actor. */ ActorHandle createActor( RayFunc actorFactoryFunc, Object[] args, ActorCreationOptions options); @@ -151,7 +159,8 @@ ActorHandle createActor( * * @param pyActorClass The Python actor class. * @param args Arguments of the actor constructor. - * @param options The options for creating actor. Returns A handle to the actor. + * @param options The options for creating actor. + * @return A handle to the actor. */ PyActorHandle createActor(PyActorClass pyActorClass, Object[] args, ActorCreationOptions options); @@ -170,14 +179,16 @@ PlacementGroup createPlacementGroup( /** * Wrap a {@link Runnable} with necessary context capture. * - * @param runnable The runnable to wrap. Returns The wrapped runnable. + * @param runnable The runnable to wrap. + * @return The wrapped runnable. */ Runnable wrapRunnable(Runnable runnable); /** * Wrap a {@link Callable} with necessary context capture. * - * @param callable The callable to wrap. Returns The wrapped callable. + * @param callable The callable to wrap. + * @return The wrapped callable. */ Callable wrapCallable(Callable callable); @@ -187,14 +198,15 @@ PlacementGroup createPlacementGroup( /** * Get a placement group by id. * - * @param id placement group id. Returns The placement group. + * @param id placement group id. + * @return The placement group. */ PlacementGroup getPlacementGroup(PlacementGroupId id); /** * Get all placement groups in this cluster. * - *

Returns All placement groups. + * @return All placement groups. */ List getAllPlacementGroups(); @@ -209,8 +221,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False - * otherwise. + * @param timeoutMs Timeout in milliseconds. + * @return True if the placement group is created. False otherwise. */ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); } diff --git a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java index b5fa486aa586..d00ea4f1195b 100644 --- a/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java +++ b/java/api/src/main/java/io/ray/api/runtimecontext/RuntimeContext.java @@ -21,7 +21,7 @@ public interface RuntimeContext { boolean wasCurrentActorRestarted(); /** - * Return true if Ray is running in single-process mode, false if Ray is running in cluster mode. + * Returns true if Ray is running in single-process mode, false if Ray is running in cluster mode. */ boolean isSingleProcess(); diff --git a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java index 1dd4b84f5c2b..85a46ad8b963 100644 --- a/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java +++ b/java/runtime/src/main/java/io/ray/runtime/actor/NativeActorHandle.java @@ -71,7 +71,7 @@ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundExcept /** * Serialize this actor handle to bytes. * - *

Returns the bytes of the actor handle + * @return the bytes of the actor handle */ public byte[] toBytes() { return nativeSerialize(actorId); @@ -80,7 +80,7 @@ public byte[] toBytes() { /** * Deserialize an actor handle from bytes. * - *

Returns the bytes of an actor handle + * @return the bytes of an actor handle */ public static NativeActorHandle fromBytes(byte[] bytes) { byte[] actorId = nativeDeserialize(bytes); diff --git a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java index d26a13dca193..c9ef7ce3bbe6 100644 --- a/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java +++ b/java/runtime/src/main/java/io/ray/runtime/functionmanager/FunctionManager.java @@ -69,7 +69,8 @@ public FunctionManager(List codeSearchPath) { * Get the RayFunction from a RayFunc instance (a lambda). * * @param jobId current job id. - * @param func The lambda. Returns A RayFunction object. + * @param func The lambda. + * @return A RayFunction object. */ public RayFunction getFunction(JobId jobId, RayFunc func) { JavaFunctionDescriptor functionDescriptor = RAY_FUNC_CACHE.get().get(func.getClass()); @@ -90,7 +91,8 @@ public RayFunction getFunction(JobId jobId, RayFunc func) { * Get the RayFunction from a function descriptor. * * @param jobId Current job id. - * @param functionDescriptor The function descriptor. Returns A RayFunction object. + * @param functionDescriptor The function descriptor. + * @return A RayFunction object. */ public RayFunction getFunction(JobId jobId, JavaFunctionDescriptor functionDescriptor) { JobFunctionTable jobFunctionTable = jobFunctionTables.get(jobId); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java index df34212e7eec..cc70bbd7e963 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/GcsClient.java @@ -35,7 +35,8 @@ public GcsClient(String redisAddress, String redisPassword) { /** * Get placement group by {@link PlacementGroupId}. * - * @param placementGroupId Id of placement group. 
Returns The placement group. + * @param placementGroupId Id of placement group. + * @return The placement group. */ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { byte[] result = globalStateAccessor.getPlacementGroupInfo(placementGroupId); @@ -45,7 +46,7 @@ public PlacementGroup getPlacementGroupInfo(PlacementGroupId placementGroupId) { /** * Get all placement groups in this cluster. * - *

Returns All placement groups. + * @return All placement groups. */ public List getAllPlacementGroupInfo() { List results = globalStateAccessor.getAllPlacementGroupInfo(); diff --git a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java index 77004a8493a4..811402994e4e 100644 --- a/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java +++ b/java/runtime/src/main/java/io/ray/runtime/gcs/RedisClient.java @@ -88,7 +88,7 @@ public byte[] get(byte[] key, byte[] field) { /** * Return the specified elements of the list stored at the specified key. * - *

Returns Multi bulk reply, specifically a list of elements in the specified range. + * @return Multi bulk reply, specifically a list of elements in the specified range. */ public List lrange(byte[] key, long start, long end) { try (Jedis jedis = jedisPool.getResource()) { diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java index 961cbfe9a9b8..80c39cf96f50 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metric.java @@ -54,7 +54,7 @@ public void record() { /** * Get the value to record and then reset. * - *

Returns latest updating value. + * @return latest updating value. */ protected abstract double getAndReset(); diff --git a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java index 85939ed79abb..f3af834f6715 100644 --- a/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java +++ b/java/runtime/src/main/java/io/ray/runtime/metric/Metrics.java @@ -111,7 +111,7 @@ public B tags(Map tags) { /** * Creates a metric by sub-class. * - *

Returns a metric + * @return a metric */ protected abstract M create(); diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java index 76576b969e20..51ae9bfd2b98 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java @@ -55,7 +55,8 @@ public class ObjectSerializer { * Deserialize an object from an {@link NativeRayObject} instance. * * @param nativeRayObject The object to deserialize. - * @param objectId The associated object ID of the object. Returns The deserialized object. + * @param objectId The associated object ID of the object. + * @return The deserialized object. */ public static Object deserialize( NativeRayObject nativeRayObject, ObjectId objectId, Class objectType) { @@ -110,7 +111,8 @@ public static Object deserialize( /** * Serialize an Java object to an {@link NativeRayObject} instance. * - * @param object The object to serialize. Returns The serialized object. + * @param object The object to serialize. + * @return The serialized object. */ public static NativeRayObject serialize(Object object) { if (object instanceof NativeRayObject) { diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java index df524af11c8a..8711811b24ad 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java @@ -26,7 +26,8 @@ public ObjectStore(WorkerContext workerContext) { /** * Put a raw object into object store. * - * @param obj The ray object. Returns Generated ID of the object. + * @param obj The ray object. + * @return Generated ID of the object. 
*/ public abstract ObjectId putRaw(NativeRayObject obj); @@ -41,7 +42,8 @@ public ObjectStore(WorkerContext workerContext) { /** * Serialize and put an object to the object store. * - * @param object The object to put. Returns Id of the object. + * @param object The object to put. + * @return Id of the object. */ public ObjectId put(Object object) { if (object instanceof NativeRayObject) { @@ -71,8 +73,8 @@ public void put(Object object, ObjectId objectId) { * Get a list of raw objects from the object store. * * @param objectIds IDs of the objects to get. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns Result list - * of objects data. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. + * @return Result list of objects data. */ public abstract List getRaw(List objectIds, long timeoutMs); @@ -80,7 +82,8 @@ public void put(Object object, ObjectId objectId) { * Get a list of objects from the object store. * * @param ids List of the object ids. - * @param Type of these objects. Returns A list of GetResult objects. + * @param Type of these objects. + * @return A list of GetResult objects. */ @SuppressWarnings("unchecked") public List get(List ids, Class elementType) { @@ -118,8 +121,8 @@ public List get(List ids, Class elementType) { * * @param objectIds IDs of the objects to wait for. * @param numObjects Number of objects that should appear. - * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. Returns A bitset - * that indicates each object has appeared or not. + * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. + * @return A bitset that indicates each object has appeared or not. */ public abstract List wait(List objectIds, int numObjects, long timeoutMs); @@ -129,8 +132,8 @@ public List get(List ids, Class elementType) { * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. 
- * @param timeoutMs The maximum time in milliseconds to wait before returning. Returns Two lists, - * one containing locally available objects, one containing the rest. + * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @return Two lists, one containing locally available objects, one containing the rest. */ public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { Preconditions.checkNotNull(waitList); @@ -185,7 +188,8 @@ public WaitResult wait(List> waitList, int numReturns, int t /** * Promote the given object to the underlying object store, and get the ownership info. * - * @param objectId The ID of the object to promote Returns the serialized ownership address + * @param objectId The ID of the object to promote + * @return the serialized ownership address */ public abstract byte[] promoteAndGetOwnershipInfo(ObjectId objectId); diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java index b08f7c9f5c0f..1d0d540848bf 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java @@ -53,8 +53,8 @@ public PlacementGroupState getState() { /** * Wait for the placement group to be ready within the specified time. * - * @param timeoutSeconds Timeout in seconds. Returns True if the placement group is created. False - * otherwise. + * @param timeoutSeconds Timeout in seconds. + * @return True if the placement group is created. False otherwise. */ public boolean wait(int timeoutSeconds) { return Ray.internal().waitPlacementGroupReady(id, timeoutSeconds); @@ -71,7 +71,8 @@ public static class Builder { /** * Set the Id of the placement group. * - * @param id Id of the placement group. Returns self. + * @param id Id of the placement group. + * @return self. 
*/ public Builder setId(PlacementGroupId id) { this.id = id; @@ -81,7 +82,8 @@ public Builder setId(PlacementGroupId id) { /** * Set the name of the placement group. * - * @param name Name of the placement group. Returns self. + * @param name Name of the placement group. + * @return self. */ public Builder setName(String name) { this.name = name; @@ -91,7 +93,8 @@ public Builder setName(String name) { /** * Set the bundles of the placement group. * - * @param bundles the bundles of the placement group. Returns self. + * @param bundles the bundles of the placement group. + * @return self. */ public Builder setBundles(List> bundles) { this.bundles = bundles; @@ -101,7 +104,8 @@ public Builder setBundles(List> bundles) { /** * Set the placement strategy of the placement group. * - * @param strategy the placement strategy of the placement group. Returns self. + * @param strategy the placement strategy of the placement group. + * @return self. */ public Builder setStrategy(PlacementStrategy strategy) { this.strategy = strategy; @@ -111,7 +115,8 @@ public Builder setStrategy(PlacementStrategy strategy) { /** * Set the placement state of the placement group. * - * @param state the state of the placement group. Returns self. + * @param state the state of the placement group. + * @return self. */ public Builder setState(PlacementGroupState state) { this.state = state; diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java index 75305ef1f4e2..8e9d03cc6407 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupUtils.java @@ -61,8 +61,8 @@ private static PlacementGroupState covertToUserSpecifiedState( /** * Generate a PlacementGroupImpl from placementGroupTableData protobuf data. * - * @param placementGroupTableData protobuf data. 
Returns placement group info {@link - * PlacementGroupImpl} + * @param placementGroupTableData protobuf data. + * @return placement group info {@link PlacementGroupImpl} */ private static PlacementGroupImpl generatePlacementGroupFromPbData( PlacementGroupTableData placementGroupTableData) { @@ -90,8 +90,8 @@ private static PlacementGroupImpl generatePlacementGroupFromPbData( /** * Generate a PlacementGroupImpl from byte array. * - * @param placementGroupByteArray bytes array from native method. Returns placement group info - * {@link PlacementGroupImpl} + * @param placementGroupByteArray bytes array from native method. + * @return placement group info {@link PlacementGroupImpl} */ public static PlacementGroupImpl generatePlacementGroupFromByteArray( byte[] placementGroupByteArray) { diff --git a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java index ca195d6ced11..e8a8351716d5 100644 --- a/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java +++ b/java/runtime/src/main/java/io/ray/runtime/task/TaskSubmitter.java @@ -21,7 +21,8 @@ public interface TaskSubmitter { * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. - * @param options Options for this task. Returns Ids of the return objects. + * @param options Options for this task. + * @return Ids of the return objects. */ List submitTask( FunctionDescriptor functionDescriptor, @@ -34,7 +35,8 @@ List submitTask( * * @param functionDescriptor The remote function that generates the actor object. * @param args Arguments of this task. - * @param options Options for this actor creation task. Returns Handle to the actor. + * @param options Options for this actor creation task. + * @return Handle to the actor. 
* @throws IllegalArgumentException if actor of specified name exists */ BaseActorHandle createActor( @@ -48,7 +50,8 @@ BaseActorHandle createActor( * @param functionDescriptor The remote function to execute. * @param args Arguments of this task. * @param numReturns Return object count. - * @param options Options for this task. Returns Ids of the return objects. + * @param options Options for this task. + * @return Ids of the return objects. */ List submitActorTask( BaseActorHandle actor, @@ -62,7 +65,8 @@ List submitActorTask( * * @param name Name of the placement group. * @param bundles Pre-allocated resource list. - * @param strategy Actor placement strategy. Returns A handle to the created placement group. + * @param strategy Actor placement strategy. + * @return A handle to the created placement group. */ PlacementGroup createPlacementGroup( String name, List> bundles, PlacementStrategy strategy); @@ -78,8 +82,8 @@ PlacementGroup createPlacementGroup( * Wait for the placement group to be ready within the specified time. * * @param id Id of placement group. - * @param timeoutMs Timeout in milliseconds. Returns True if the placement group is created. False - * otherwise. + * @param timeoutMs Timeout in milliseconds. + * @return True if the placement group is created. False otherwise. */ boolean waitPlacementGroupReady(PlacementGroupId id, int timeoutMs); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java index 85c327a446b7..f3282ed08c56 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/BinaryFileUtil.java @@ -21,7 +21,8 @@ public class BinaryFileUtil { * will be protected by a file lock. 
* * @param destDir a directory to extract resource file to - * @param fileName resource file name Returns extracted resource file + * @param fileName resource file name + * @return extracted resource file */ public static File getNativeFile(String destDir, String fileName) { final File dir = new File(destDir); diff --git a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java index 4f7bf2580af2..239568afa51b 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/IdUtil.java @@ -13,7 +13,7 @@ public class IdUtil { /** * Compute the actor ID of the task which created this object. * - *

Returns The actor ID of the task which created this object. + * @return The actor ID of the task which created this object. */ public static ActorId getActorIdFromObjectId(ObjectId objectId) { byte[] taskIdBytes = new byte[TaskId.LENGTH]; diff --git a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java index 0c7a93d27818..e9676d07b2f6 100644 --- a/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java +++ b/java/runtime/src/main/java/io/ray/runtime/util/ResourceUtil.java @@ -11,8 +11,8 @@ public class ResourceUtil { * Convert resources map to a string that is used for the command line argument of starting * raylet. * - * @param resources The resources map to be converted. Returns The starting-raylet command line - * argument, like "CPU,4,GPU,0". + * @param resources The resources map to be converted. + * @return The starting-raylet command line argument, like "CPU,4,GPU,0". */ public static String getResourcesStringFromMap(Map resources) { StringBuilder builder = new StringBuilder(); @@ -32,8 +32,9 @@ public static String getResourcesStringFromMap(Map resources) { /** * Parse the static resources configure field and convert to the resources map. * - * @param resources The static resources string to be parsed. Returns The map whose key represents - * the resource name and the value represents the resource quantity. + * @param resources The static resources string to be parsed. + * @return The map whose key represents the resource name and the value represents the resource + * quantity. * @throws IllegalArgumentException If the resources string's format does match, it will throw an * IllegalArgumentException. 
*/ diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java index fbfc4736e031..c12bdf87c48c 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/Function.java @@ -11,7 +11,7 @@ public interface Function extends Serializable { * storage, and load it back when in fail-over through. {@link * Function#loadCheckpoint(Serializable)}. * - *

Returns A serializable object which represents function state. + * @return A serializable object which represents function state. */ default Serializable saveCheckpoint() { return null; diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java index 877a93ae0e74..d60e335a9d1e 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/function/impl/FilterFunction.java @@ -14,8 +14,8 @@ public interface FilterFunction extends Function { /** * The filter function that evaluates the predicate. * - * @param value The value to be filtered. Returns True for values that should be retained, false - * for values to be filtered out. + * @param value The value to be filtered. + * @return True for values that should be retained, false for values to be filtered out. */ boolean filter(T value) throws Exception; } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java index 527f469c301a..80e9d92729bf 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/partition/Partition.java @@ -15,8 +15,8 @@ public interface Partition extends Function { * record. * * @param record The record. - * @param numPartition num of partitions Returns IDs of the downstream partitions that should - * receive the record. + * @param numPartition num of partitions + * @return IDs of the downstream partitions that should receive the record. 
*/ int[] partition(T record, int numPartition); } diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java index 698eab29d2e3..999057d5a8b7 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStream.java @@ -59,7 +59,8 @@ public DataStream(PythonDataStream referencedStream) { * Apply a map function to this stream. * * @param mapFunction The map function. - * @param Type of data returned by the map function. Returns A new DataStream. + * @param Type of data returned by the map function. + * @return A new DataStream. */ public DataStream map(MapFunction mapFunction) { return new DataStream<>(this, new MapOperator<>(mapFunction)); @@ -69,7 +70,8 @@ public DataStream map(MapFunction mapFunction) { * Apply a flat-map function to this stream. * * @param flatMapFunction The FlatMapFunction - * @param Type of data returned by the flatmap function. Returns A new DataStream + * @param Type of data returned by the flatmap function. + * @return A new DataStream */ public DataStream flatMap(FlatMapFunction flatMapFunction) { return new DataStream<>(this, new FlatMapOperator<>(flatMapFunction)); @@ -84,7 +86,8 @@ public DataStream filter(FilterFunction filterFunction) { * type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. Returns A new UnionStream. + * @param others The other DataStreams to union output with. + * @return A new UnionStream. */ @SafeVarargs public final DataStream union(DataStream stream, DataStream... others) { @@ -98,7 +101,8 @@ public final DataStream union(DataStream stream, DataStream... others) * Apply union transformations to this stream by merging {@link DataStream} outputs of the same * type with each other. 
* - * @param streams The DataStreams to union output with. Returns A new UnionStream. + * @param streams The DataStreams to union output with. + * @return A new UnionStream. */ public final DataStream union(List> streams) { if (this instanceof UnionStream) { @@ -115,7 +119,8 @@ public final DataStream union(List> streams) { * * @param other Another stream. * @param The type of the other stream data. - * @param The type of the data in the joined stream. Returns A new JoinStream. + * @param The type of the data in the joined stream. + * @return A new JoinStream. */ public JoinStream join(DataStream other) { return new JoinStream<>(this, other); @@ -129,7 +134,8 @@ public DataStream process() { /** * Apply a sink function and get a StreamSink. * - * @param sinkFunction The sink function. Returns A new StreamSink. + * @param sinkFunction The sink function. + * @return A new StreamSink. */ public DataStreamSink sink(SinkFunction sinkFunction) { return new DataStreamSink<>(this, new SinkOperator<>(sinkFunction)); @@ -139,7 +145,8 @@ public DataStreamSink sink(SinkFunction sinkFunction) { * Apply a key-by function to this stream. * * @param keyFunction the key function. - * @param The type of the key. Returns A new KeyDataStream. + * @param The type of the key. + * @return A new KeyDataStream. */ public KeyDataStream keyBy(KeyFunction keyFunction) { checkPartitionCall(); @@ -149,7 +156,7 @@ public KeyDataStream keyBy(KeyFunction keyFunction) { /** * Apply broadcast to this stream. * - *

Returns This stream. + * @return This stream. */ public DataStream broadcast() { checkPartitionCall(); @@ -159,7 +166,8 @@ public DataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. Returns This stream. + * @param partition The partitioning strategy. + * @return This stream. */ public DataStream partitionBy(Partition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java index 13de0b33bb4e..53dd2a09738a 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/DataStreamSource.java @@ -27,7 +27,8 @@ public static DataStreamSource fromSource( * * @param context Stream context. * @param values A collection of values. - * @param The type of source data. Returns A DataStreamSource. + * @param The type of source data. + * @return A DataStreamSource. */ public static DataStreamSource fromCollection( StreamingContext context, Collection values) { diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java index fb6431ef2da8..c50b232697e4 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/api/stream/KeyDataStream.java @@ -33,7 +33,8 @@ public KeyDataStream(PythonDataStream referencedStream) { /** * Apply a reduce function to this stream. * - * @param reduceFunction The reduce function. Returns A new DataStream. + * @param reduceFunction The reduce function. + * @return A new DataStream. 
*/ public DataStream reduce(ReduceFunction reduceFunction) { return new DataStream<>(this, new ReduceOperator(reduceFunction)); @@ -44,7 +45,8 @@ public DataStream reduce(ReduceFunction reduceFunction) { * * @param aggregateFunction The aggregate function * @param The type of aggregated intermediate data. - * @param The type of result data. Returns A new DataStream. + * @param The type of result data. + * @return A new DataStream. */ public DataStream aggregate(AggregateFunction aggregateFunction) { return new DataStream<>(this, null); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java index 6e40ee441c32..b192dbcc8a18 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/jobgraph/JobGraph.java @@ -43,7 +43,7 @@ public JobGraph( * Generate direct-graph(made up of a set of vertices and connected by edges) by current job graph * for simple log printing. * - *

Returns Digraph in string type. + * @return Digraph in string type. */ public String generateDigraph() { StringBuilder digraph = new StringBuilder(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java index 25b5873105a6..90f018ecdc89 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonDataStream.java @@ -51,7 +51,8 @@ public PythonDataStream map(String moduleName, String funcName) { /** * Apply a map function to this stream. * - * @param func The python MapFunction. Returns A new PythonDataStream. + * @param func The python MapFunction. + * @return A new PythonDataStream. */ public PythonDataStream map(PythonFunction func) { func.setFunctionInterface(FunctionInterface.MAP_FUNCTION); @@ -65,7 +66,8 @@ public PythonDataStream flatMap(String moduleName, String funcName) { /** * Apply a flat-map function to this stream. * - * @param func The python FlapMapFunction. Returns A new PythonDataStream + * @param func The python FlapMapFunction. + * @return A new PythonDataStream */ public PythonDataStream flatMap(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FLAT_MAP_FUNCTION); @@ -79,8 +81,9 @@ public PythonDataStream filter(String moduleName, String funcName) { /** * Apply a filter function to this stream. * - * @param func The python FilterFunction. Returns A new PythonDataStream that contains only the - * elements satisfying the given filter predicate. + * @param func The python FilterFunction. + * @return A new PythonDataStream that contains only the elements satisfying the given filter + * predicate. 
*/ public PythonDataStream filter(PythonFunction func) { func.setFunctionInterface(FunctionInterface.FILTER_FUNCTION); @@ -92,7 +95,8 @@ public PythonDataStream filter(PythonFunction func) { * same type with each other. * * @param stream The DataStream to union output with. - * @param others The other DataStreams to union output with. Returns A new UnionStream. + * @param others The other DataStreams to union output with. + * @return A new UnionStream. */ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... others) { List streams = new ArrayList<>(); @@ -105,7 +109,8 @@ public final PythonDataStream union(PythonDataStream stream, PythonDataStream... * Apply union transformations to this stream by merging {@link PythonDataStream} outputs of the * same type with each other. * - * @param streams The DataStreams to union output with. Returns A new UnionStream. + * @param streams The DataStreams to union output with. + * @return A new UnionStream. */ public final PythonDataStream union(List streams) { if (this instanceof PythonUnionStream) { @@ -124,7 +129,8 @@ public PythonStreamSink sink(String moduleName, String funcName) { /** * Apply a sink function and get a StreamSink. * - * @param func The python SinkFunction. Returns A new StreamSink. + * @param func The python SinkFunction. + * @return A new StreamSink. */ public PythonStreamSink sink(PythonFunction func) { func.setFunctionInterface(FunctionInterface.SINK_FUNCTION); @@ -138,7 +144,8 @@ public PythonKeyDataStream keyBy(String moduleName, String funcName) { /** * Apply a key-by function to this stream. * - * @param func the python keyFunction. Returns A new KeyDataStream. + * @param func the python keyFunction. + * @return A new KeyDataStream. */ public PythonKeyDataStream keyBy(PythonFunction func) { checkPartitionCall(); @@ -149,7 +156,7 @@ public PythonKeyDataStream keyBy(PythonFunction func) { /** * Apply broadcast to this stream. * - *

Returns This stream. + * @return This stream. */ public PythonDataStream broadcast() { checkPartitionCall(); @@ -159,7 +166,8 @@ public PythonDataStream broadcast() { /** * Apply a partition to this stream. * - * @param partition The partitioning strategy. Returns This stream. + * @param partition The partitioning strategy. + * @return This stream. */ public PythonDataStream partitionBy(PythonPartition partition) { checkPartitionCall(); diff --git a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java index 8116fd392923..078f84ac4a94 100644 --- a/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java +++ b/streaming/java/streaming-api/src/main/java/io/ray/streaming/python/stream/PythonKeyDataStream.java @@ -31,7 +31,8 @@ public PythonDataStream reduce(String moduleName, String funcName) { /** * Apply a reduce function to this stream. * - * @param func The reduce function. Returns A new DataStream. + * @param func The reduce function. + * @return A new DataStream. */ public PythonDataStream reduce(PythonFunction func) { func.setFunctionInterface(FunctionInterface.REDUCE_FUNCTION); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java index 0c555e7c5ada..2ec3b6dfb944 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/global/CommonConfig.java @@ -11,7 +11,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job id. Non-custom. * - *

Returns Job id with string type. + * @return Job id with string type. */ @DefaultValue(value = "default-job-id") @Key(value = JOB_ID) @@ -20,7 +20,7 @@ public interface CommonConfig extends Config { /** * Ray streaming job name. Non-custom. * - *

Returns Job name with string type. + * @return Job name with string type. */ @DefaultValue(value = "default-job-name") @Key(value = JOB_NAME) diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java index bc2fc2bd3662..79189431a2ba 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/config/master/SchedulerConfig.java @@ -11,7 +11,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker initiation. Default is: 10000ms(10s). * - *

Returns timeout ms + * @return timeout ms */ @Key(WORKER_INITIATION_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") @@ -20,7 +20,7 @@ public interface SchedulerConfig extends Config { /** * The timeout ms of worker starting. Default is: 10000ms(10s). * - *

Returns timeout ms + * @return timeout ms */ @Key(WORKER_STARTING_WAIT_TIMEOUT_MS) @DefaultValue(value = "10000") diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java index faf8703905be..83b62696e6ba 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/context/ContextBackend.java @@ -12,14 +12,15 @@ public interface ContextBackend { /** * check if key exists in state * - *

Returns true if exists + * @return true if exists */ boolean exists(final String key) throws Exception; /** * get content by key * - * @param key key Returns the StateBackend + * @param key key + * @return the StateBackend */ byte[] get(final String key) throws Exception; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java index b0d3b522ed10..2852e0f99141 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionGraph.java @@ -156,7 +156,7 @@ public AtomicInteger getExecutionVertexIdGenerator() { /** * Get all execution vertices from current execution graph. * - *

Returns all execution vertices. + * @return all execution vertices. */ public List getAllExecutionVertices() { return executionJobVertexMap.values().stream() @@ -168,7 +168,7 @@ public List getAllExecutionVertices() { /** * Get all execution vertices whose status is 'TO_ADD' from current execution graph. * - *

Returns all added execution vertices. + * @return all added execution vertices. */ public List getAllAddedExecutionVertices() { return executionJobVertexMap.values().stream() @@ -181,7 +181,8 @@ public List getAllAddedExecutionVertices() { /** * Get specified execution vertex from current execution graph by execution vertex id. * - * @param executionVertexId execution vertex id. Returns the specified execution vertex. + * @param executionVertexId execution vertex id. + * @return the specified execution vertex. */ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertexId) { if (executionVertexMap.containsKey(executionVertexId)) { @@ -193,7 +194,8 @@ public ExecutionVertex getExecutionVertexByExecutionVertexId(int executionVertex /** * Get specified execution vertex from current execution graph by actor id. * - * @param actorId the actor id of execution vertex. Returns the specified execution vertex. + * @param actorId the actor id of execution vertex. + * @return the specified execution vertex. */ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { return actorIdExecutionVertexMap.get(actorId); @@ -202,7 +204,8 @@ public ExecutionVertex getExecutionVertexByActorId(ActorId actorId) { /** * Get specified actor by actor id. * - * @param actorId the actor id of execution vertex. Returns the specified actor handle. + * @param actorId the actor id of execution vertex. + * @return the specified actor handle. 
*/ public Optional getActorById(ActorId actorId) { return getAllActors().stream().filter(actor -> actor.getId().equals(actorId)).findFirst(); @@ -212,7 +215,8 @@ public Optional getActorById(ActorId actorId) { * Get the peer actor in the other side of channelName of a given actor * * @param actor actor in this side - * @param channelName the channel name Returns the peer actor in the other side + * @param channelName the channel name + * @return the peer actor in the other side */ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { Set set = getActorsByChannelId(channelName); @@ -229,7 +233,8 @@ public BaseActorHandle getPeerActor(BaseActorHandle actor, String channelName) { /** * Get actors in both sides of a channelId * - * @param channelId the channelId Returns actors in both sides + * @param channelId the channelId + * @return actors in both sides */ public Set getActorsByChannelId(String channelId) { return channelGroupedActors.getOrDefault(channelId, Sets.newHashSet()); @@ -238,7 +243,7 @@ public Set getActorsByChannelId(String channelId) { /** * Get all actors by graph. * - *

Returns actor list + * @return actor list */ public List getAllActors() { return getActorsFromJobVertices(getExecutionJobVertexList()); @@ -247,7 +252,7 @@ public List getAllActors() { /** * Get source actors by graph. * - *

Returns actor list + * @return actor list */ public List getSourceActors() { List executionJobVertices = @@ -261,7 +266,7 @@ public List getSourceActors() { /** * Get transformation and sink actors by graph. * - *

Returns actor list + * @return actor list */ public List getNonSourceActors() { List executionJobVertices = @@ -278,7 +283,7 @@ public List getNonSourceActors() { /** * Get sink actors by graph. * - *

Returns actor list + * @return actor list */ public List getSinkActors() { List executionJobVertices = @@ -292,7 +297,8 @@ public List getSinkActors() { /** * Get actors according to job vertices. * - * @param executionJobVertices specified job vertices Returns actor list + * @param executionJobVertices specified job vertices + * @return actor list */ public List getActorsFromJobVertices( List executionJobVertices) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java index 0aa426672db6..cf869c0c4f2a 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/graph/executiongraph/ExecutionJobVertex.java @@ -109,7 +109,7 @@ public String getExecutionJobVertexName() { /** * e.g. 1-SourceOperator * - *

Returns operator name with index + * @return operator name with index */ public String getExecutionJobVertexNameWithIndex() { return executionJobVertexId + "-" + executionJobVertexName; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java index b0dec4aef0c0..9b07d131f7c9 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/core/resource/Resources.java @@ -24,7 +24,7 @@ public Resources() {} /** * Get registered containers, the container list is read-only. * - *

Returns container list. + * @return container list. */ public ImmutableList getRegisteredContainers() { return ImmutableList.copyOf(registerContainers); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java index a1dd5b6bc14b..fd672978a4f2 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/JobMaster.java @@ -101,7 +101,7 @@ private void loadMasterCheckpoint() { /** * Init JobMaster. To initiate or recover other components(like metrics and extra coordinators). * - *

Returns init result + * @return init result */ public Boolean init(boolean isRecover) { LOG.info("Initializing job master, isRecover={}.", isRecover); @@ -136,7 +136,8 @@ public Boolean init(boolean isRecover) { * * * @param jobMasterActor JobMaster actor - * @param jobGraph logical plan Returns submit result + * @param jobGraph logical plan + * @return submit result */ public boolean submitJob(ActorHandle jobMasterActor, JobGraph jobGraph) { LOG.info("Begin submitting job using logical plan: {}.", jobGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java index ce8dd474157a..b563917d97b4 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/graphmanager/GraphManager.java @@ -19,21 +19,22 @@ public interface GraphManager { /** * Build execution graph from job graph. * - * @param jobGraph logical plan of streaming job. Returns physical plan of streaming job. + * @param jobGraph logical plan of streaming job. + * @return physical plan of streaming job. */ ExecutionGraph buildExecutionGraph(JobGraph jobGraph); /** * Get job graph. * - *

Returns the job graph. + * @return the job graph. */ JobGraph getJobGraph(); /** * Get execution graph. * - *

Returns the execution graph. + * @return the execution graph. */ ExecutionGraph getExecutionGraph(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java index 43671eea1b28..fbe3f696aa59 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/ResourceManager.java @@ -10,7 +10,7 @@ public interface ResourceManager extends ResourceAssignStrategy { /** * Get registered containers, the container list is read-only. * - *

Returns the registered container list + * @return the registered container list */ ImmutableList getRegisteredContainers(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java index 8df20790cb90..9ce131d2599c 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/ResourceAssignStrategy.java @@ -13,7 +13,8 @@ public interface ResourceAssignStrategy { * Assign {@link Container} for {@link ExecutionVertex} * * @param containers registered container - * @param executionGraph execution graph Returns allocating view + * @param executionGraph execution graph + * @return allocating view */ ResourceAssignmentView assignResource(List containers, ExecutionGraph executionGraph); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java index 74b646c67364..48f2366cd37d 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/resourcemanager/strategy/impl/PipelineFirstStrategy.java @@ -42,8 +42,8 @@ public class PipelineFirstStrategy implements ResourceAssignStrategy { * Assign resource to each execution vertex in the given execution graph. 
* * @param containers registered containers - * @param executionGraph execution graph Returns allocating map, key is container ID, value is - * list of vertextId, and contains vertices + * @param executionGraph execution graph + * @return allocating map, key is container ID, value is list of vertextId, and contains vertices */ @Override public ResourceAssignmentView assignResource( @@ -133,7 +133,8 @@ private void updateContainerCapacity(List containers, int capacity) { * Find a container which matches required resource * * @param requiredResource required resource - * @param containers registered containers Returns container that matches the required resource + * @param containers registered containers + * @return container that matches the required resource */ private Container findMatchedContainer( Map requiredResource, List containers) { @@ -159,7 +160,8 @@ private Container findMatchedContainer( * Check if current container has enough resource * * @param requiredResource required resource - * @param container container Returns true if matches, false else + * @param container container + * @return true if matches, false else */ private boolean hasEnoughResource(Map requiredResource, Container container) { LOG.info("Check resource for index: {}, container: {}", currentContainerIndex, container); @@ -200,7 +202,8 @@ private boolean hasEnoughResource(Map requiredResource, Containe /** * Forward to next container * - * @param containers registered container list Returns next container in the list + * @param containers registered container list + * @return next container in the list */ private Container forwardToNextContainer(List containers) { this.currentContainerIndex = (this.currentContainerIndex + 1) % containers.size(); @@ -210,7 +213,8 @@ private Container forwardToNextContainer(List containers) { /** * Get current container * - * @param containers registered container Returns current container to allocate actor + * @param containers registered container + * 
@return current container to allocate actor */ private Container getCurrentContainer(List containers) { return containers.get(currentContainerIndex); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java index 962c0bdfa92b..d0fb60d54878 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobScheduler.java @@ -8,7 +8,8 @@ public interface JobScheduler { /** * Schedule streaming job using the physical plan. * - * @param executionGraph physical plan Returns scheduling result + * @param executionGraph physical plan + * @return scheduling result */ boolean scheduleJob(ExecutionGraph executionGraph); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java index 6309bb334e32..039715ccbefd 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/JobSchedulerImpl.java @@ -95,7 +95,8 @@ private void initAndStart(ExecutionGraph executionGraph) { /** * Create JobWorker actors according to the physical plan. * - * @param executionGraph physical plan Returns actor creation result + * @param executionGraph physical plan + * @return actor creation result */ public boolean createWorkers(ExecutionGraph executionGraph) { LOG.info("Begin creating workers."); @@ -148,7 +149,8 @@ public boolean startWorkers(ExecutionGraph executionGraph, long checkpointId) { /** * Build workers context. 
* - * @param executionGraph execution graph Returns vertex to worker context map + * @param executionGraph execution graph + * @return vertex to worker context map */ protected Map buildWorkersContext( ExecutionGraph executionGraph) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java index f5c4be5f7ee1..3cd3984b2043 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/master/scheduler/controller/WorkerLifecycleController.java @@ -36,7 +36,8 @@ public boolean createWorkers(List executionVertices) { /** * Create JobWorker actor according to the execution vertex. * - * @param executionVertex target execution vertex Returns creation result + * @param executionVertex target execution vertex + * @return creation result */ private boolean createWorker(ExecutionVertex executionVertex) { LOG.info( @@ -84,7 +85,8 @@ private boolean createWorker(ExecutionVertex executionVertex) { * Using context to init JobWorker. * * @param vertexToContextMap target JobWorker actor - * @param timeout timeout for waiting, unit: ms Returns initiation result + * @param timeout timeout for waiting, unit: ms + * @return initiation result */ public boolean initWorkers( Map vertexToContextMap, int timeout) { @@ -120,7 +122,8 @@ public boolean initWorkers( * Start JobWorkers to run task. 
* * @param executionGraph physical plan - * @param timeout timeout for waiting, unit: ms Returns starting result + * @param timeout timeout for waiting, unit: ms + * @return starting result */ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId, int timeout) { LOG.info("Begin starting workers."); @@ -150,7 +153,8 @@ public boolean startWorkers(ExecutionGraph executionGraph, long lastCheckpointId /** * Stop and destroy JobWorkers' actor. * - * @param executionVertices target vertices Returns destroy result + * @param executionVertices target vertices + * @return destroy result */ public boolean destroyWorkers(List executionVertices) { return asyncBatchExecute(this::destroyWorker, executionVertices); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java index 5a5475350d65..6cd788138883 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/rpc/RemoteCallWorker.java @@ -25,7 +25,8 @@ public class RemoteCallWorker { * Call JobWorker actor to init. * * @param actor target JobWorker actor - * @param context JobWorker's context Returns init result + * @param context JobWorker's context + * @return init result */ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerContext context) { LOG.info("Call worker to initiate, actor: {}, context: {}.", actor.getId(), context); @@ -50,7 +51,8 @@ public static ObjectRef initWorker(BaseActorHandle actor, JobWorkerCont * Call JobWorker actor to start. 
* * @param actor target JobWorker actor - * @param checkpointId checkpoint ID to be rollback Returns start result + * @param checkpointId checkpoint ID to be rollback + * @return start result */ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) { LOG.info("Call worker to start, actor: {}.", actor.getId()); @@ -79,7 +81,8 @@ public static ObjectRef rollback(BaseActorHandle actor, final Long checkpointId) /** * Call JobWorker actor to destroy without reconstruction. * - * @param actor target JobWorker actor Returns destroy result + * @param actor target JobWorker actor + * @return destroy result */ public static Boolean shutdownWithoutReconstruction(BaseActorHandle actor) { LOG.info("Call worker to shutdown without reconstruction, actor is {}.", actor.getId()); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java index 17ab4fe1ec4a..ff3c62fee11c 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/DataReader.java @@ -115,7 +115,8 @@ private static native long createDataReaderNative( /** * Read message from input channels, if timeout, return null. 
* - * @param timeoutMillis timeout Returns message or null + * @param timeoutMillis timeout + * @return message or null */ public ChannelMessage read(long timeoutMillis) { if (buf.isEmpty()) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java index d3a4b8d71773..731031d62a9b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/transfer/channel/ChannelId.java @@ -86,7 +86,8 @@ public static String genRandomIdStr() { * Generate channel name, which will be {@link ChannelId#ID_LENGTH} character * * @param fromTaskId upstream task id - * @param toTaskId downstream task id Returns channel name + * @param toTaskId downstream task id + * @return channel name */ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { /* @@ -116,7 +117,8 @@ public static String genIdStr(int fromTaskId, int toTaskId, long ts) { } /** - * @param id hex string representation of channel id Returns bytes representation of channel id + * @param id hex string representation of channel id + * @return bytes representation of channel id */ public static byte[] idStrToBytes(String id) { byte[] idBytes = BaseEncoding.base16().decode(id.toUpperCase()); @@ -125,7 +127,8 @@ public static byte[] idStrToBytes(String id) { } /** - * @param id bytes representation of channel id Returns hex string representation of channel id + * @param id bytes representation of channel id + * @return hex string representation of channel id */ public static String idBytesToStr(byte[] id) { assert id.length == ChannelId.ID_LENGTH; diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java 
b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java index 07fda18a6c5a..29ac29f4d51e 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/EnvUtil.java @@ -36,7 +36,7 @@ public static void loadNativeLibraries() { /** * Execute an external command. * - *

Returns Whether the command succeeded. + * @return Whether the command succeeded. */ public static boolean executeCommand(List command, int waitTimeoutSeconds) { try { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java index effafcc540a0..324e1ab9dcd9 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/Platform.java @@ -77,7 +77,10 @@ public static void wrapDirectBuffer(ByteBuffer buffer, long address, int size) { buffer.clear(); } - /** @param buffer a DirectBuffer backed by off-heap memory Returns address of off-heap memory */ + /** + * @param buffer a DirectBuffer backed by off-heap memory + * @return address of off-heap memory + */ public static long getAddress(ByteBuffer buffer) { return ((DirectBuffer) buffer).address(); } diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java index a97a2f5bab3b..b3243d69f449 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/RayUtils.java @@ -15,7 +15,7 @@ public class RayUtils { /** * Get all node info from GCS * - *

Returns node info list + * @return node info list */ public static List getAllNodeInfo() { if (Ray.getRuntimeContext().isSingleProcess()) { @@ -28,7 +28,7 @@ public static List getAllNodeInfo() { /** * Get all alive node info map * - *

Returns node info map, key is unique node id , value is node info + * @return node info map, key is unique node id , value is node info */ public static Map getAliveNodeInfoMap() { return getAllNodeInfo().stream() diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java index bc04a1ded0f6..13a75f8ebc7b 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ReflectionUtils.java @@ -20,7 +20,7 @@ public static Method findMethod(Class cls, String methodName) { /** * For covariant return type, return the most specific method. * - *

Returns all methods named by {@code methodName}, + * @return all methods named by {@code methodName}, */ public static List findMethods(Class cls, String methodName) { List> classes = new ArrayList<>(); diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java index b8336cd145be..b00b6ee96b85 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/util/ResourceUtil.java @@ -52,8 +52,8 @@ public static void logProcessMemoryDetail() { } /** - * Returns jvm heap usage ratio. note that one of the survivor space is not include in total - * memory while calculating this ratio. + * @return jvm heap usage ratio. note that one of the survivor space is not include in total + * memory while calculating this ratio. */ public static double getJvmHeapUsageRatio() { Runtime runtime = Runtime.getRuntime(); @@ -61,8 +61,8 @@ public static double getJvmHeapUsageRatio() { } /** - * Returns jvm heap usage(in bytes). note that this value doesn't include one of the survivor - * space. + * @return jvm heap usage(in bytes). note that this value doesn't include one of the survivor + * space. */ public static long getJvmHeapUsageInBytes() { Runtime runtime = Runtime.getRuntime(); @@ -95,8 +95,8 @@ public static double getProcessCpuUsage() { } /** - * Returns the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` - * to get cpu usage by default, and use MXBean if any exception raised. + * @return the system cpu usage. This value is a double in the [0.0,1.0] We will try to use `vsar` + * to get cpu usage by default, and use MXBean if any exception raised. 
*/ public static double getSystemCpuUsage() { double cpuUsage = 0.0; @@ -109,10 +109,10 @@ public static double getSystemCpuUsage() { } /** - * Returns the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] - * interval. A value of 0.0 means that all CPUs were idle during the recent period of time - * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time - * during the recent period being observed + * @return the "recent cpu usage" for the whole system. This value is a double in the [0.0,1.0] + * interval. A value of 0.0 means that all CPUs were idle during the recent period of time + * observed, while a value of 1.0 means that all CPUs were actively running 100% of the time + * during the recent period being observed */ public static double getSystemCpuUtilByMXBean() { return osmxb.getSystemCpuLoad(); @@ -144,7 +144,7 @@ public static double getSystemCpuUtilByVsar() throws Exception { return cpuUsageFromVsar; } - /** Returnss the system load average for the last minute */ + /** Returns the system load average for the last minute */ public static double getSystemLoadAverage() { return osmxb.getSystemLoadAverage(); } @@ -158,7 +158,8 @@ public static int getCpuCores() { * Get containers by hostname of address * * @param containers container list - * @param containerHosts container hostname or address set Returns matched containers + * @param containerHosts container hostname or address set + * @return matched containers */ public static List getContainersByHostname( List containers, Collection containerHosts) { @@ -174,7 +175,8 @@ public static List getContainersByHostname( /** * Get container by hostname * - * @param hostName container hostname Returns container + * @param hostName container hostname + * @return container */ public static Optional getContainerByHostname( List containers, String hostName) { @@ -188,7 +190,8 @@ public static Optional getContainerByHostname( /** * Get container by id * 
- * @param containerID container id Returns container + * @param containerID container id + * @return container */ public static Optional getContainerById( List containers, ContainerId containerID) { diff --git a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java index 5a6554802bc3..15200c65633e 100644 --- a/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java +++ b/streaming/java/streaming-runtime/src/main/java/io/ray/streaming/runtime/worker/JobWorker.java @@ -137,8 +137,8 @@ public Boolean init(JobWorkerContext workerContext) { /** * Start worker's stream tasks with specific checkpoint ID. * - *

Returns a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link - * ChannelCreationStatus} of each input queue. + * @return a {@link CallResult} with {@link ChannelRecoverInfo}, contains {@link + * ChannelCreationStatus} of each input queue. */ public CallResult rollback(Long checkpointId, Long startRollbackTs) { synchronized (initialStateChangeLock) { diff --git a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java index 5fe774e20b22..eb48f1691a12 100644 --- a/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java +++ b/streaming/java/streaming-runtime/src/test/java/io/ray/streaming/runtime/util/Mockitools.java @@ -49,8 +49,8 @@ public static List mockGetAllNodeInfo() { /** * Mock get node info map * - * @param nodeInfos all node infos fetched from GCS Returns node info map, key is node unique id, - * value is node info + * @param nodeInfos all node infos fetched from GCS + * @return node info map, key is node unique id, value is node info */ public static Map mockGetNodeInfoMap(List nodeInfos) { return nodeInfos.stream() diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java index 10f99c0b6b2f..921ea8598b43 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/KeyGroupAssignment.java @@ -50,8 +50,8 @@ public static KeyGroup getKeyGroup(int maxParallelism, int parallelism, int inde * Assigning the key to a key-group index. * * @param key the key to assign. - * @param maxParallelism the maximum parallelism. Returns the key-group index to which the given - * key is assigned. 
+ * @param maxParallelism the maximum parallelism. + * @return the key-group index to which the given key is assigned. */ public static int assignKeyGroupIndexForKey(Object key, int maxParallelism) { return Math.abs(key.hashCode() % maxParallelism); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java index 933081af5383..a632d21d0728 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/MapState.java @@ -28,7 +28,8 @@ public interface MapState extends UnaryState> { /** * Returns the current value associated with the given key. * - * @param key The key of the mapping Returns The value of the mapping with the given key + * @param key The key of the mapping + * @return The value of the mapping with the given key */ V get(K key); @@ -64,8 +65,8 @@ public interface MapState extends UnaryState> { /** * Returns whether there exists the given mapping. * - * @param key The key of the mapping Returns True if there exists a mapping whose key equals to - * the given key + * @param key The key of the mapping + * @return True if there exists a mapping whose key equals to the given key */ default boolean contains(K key) { return get().containsKey(key); @@ -74,7 +75,7 @@ default boolean contains(K key) { /** * Returns all the mappings in the state * - *

Returns An iterable view of all the key-value pairs in the state. + * @return An iterable view of all the key-value pairs in the state. */ default Iterable> entries() { return get().entrySet(); @@ -83,7 +84,7 @@ default Iterable> entries() { /** * Returns all the keys in the state * - *

Returns An iterable view of all the keys in the state. + * @return An iterable view of all the keys in the state. */ default Iterable keys() { return get().keySet(); @@ -92,7 +93,7 @@ default Iterable keys() { /** * Returns all the values in the state. * - *

Returns An iterable view of all the values in the state. + * @return An iterable view of all the values in the state. */ default Iterable values() { return get().values(); @@ -101,7 +102,7 @@ default Iterable values() { /** * Iterates over all the mappings in the state. * - *

Returns An iterator over all the mappings in the state + * @return An iterator over all the mappings in the state */ default Iterator> iterator() { return get().entrySet().iterator(); diff --git a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java index 5c250b594973..637b573144b8 100644 --- a/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java +++ b/streaming/java/streaming-state/src/main/java/io/ray/streaming/state/keystate/state/UnaryState.java @@ -24,7 +24,7 @@ public interface UnaryState extends State { /** * get the value in state * - *

Returns the value in state + * @return the value in state */ O get(); } From a82fa80f7b00863d1732d7e74ba6b63b383f7a90 Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Thu, 21 Jan 2021 10:15:18 -0700 Subject: [PATCH 006/245] Inline small objects in GetObjectStatus response. (#13309) --- python/ray/_raylet.pyx | 7 +-- python/ray/includes/libcoreworker.pxd | 3 +- python/ray/tests/test_advanced.py | 37 ++++++++++++++ src/ray/core_worker/core_worker.cc | 48 +++++++++++++----- src/ray/core_worker/core_worker.h | 4 +- src/ray/core_worker/future_resolver.cc | 69 +++++++++++++++++--------- src/ray/core_worker/future_resolver.h | 1 + src/ray/protobuf/core_worker.proto | 12 +++++ 8 files changed, 140 insertions(+), 41 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8ba80852fb40..4b5f9deeef1a 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -898,16 +898,17 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref): + def object_exists(self, ObjectRef object_ref, memory_store_only=False): cdef: c_bool has_object + c_bool is_in_plasma CObjectID c_object_id = object_ref.native() with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().Contains( - c_object_id, &has_object)) + c_object_id, &has_object, &is_in_plasma)) - return has_object + return has_object and (not memory_store_only or not is_in_plasma) cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index f1acad1fadd8..637dbd750020 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,7 +183,8 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus Contains(const CObjectID 
&object_id, c_bool *has_object) + CRayStatus Contains(const CObjectID &object_id, c_bool *has_object, + c_bool *is_in_plasma) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t timeout_ms, c_vector[c_bool] *results, c_bool fetch_local) diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index 6df746fdcd91..8f607009ee49 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -521,6 +521,43 @@ def method(self): assert ray.worker.global_worker.core_worker.object_exists(x_id) +@pytest.mark.skipif(client_test_enabled(), reason="internal api") +def test_future_resolution_skip_plasma(ray_start_cluster): + cluster = ray_start_cluster + # Disable worker caching so worker leases are not reused; set object + # inlining size threshold and enable storing of small objects in in-memory + # object store so the borrowed ref is inlined. + cluster.add_node( + num_cpus=1, + resources={"pin_head": 1}, + _system_config={ + "worker_lease_timeout_milliseconds": 0, + "max_direct_call_object_size": 100 * 1024, + "put_small_object_in_memory_store": True, + }, + ) + cluster.add_node(num_cpus=1, resources={"pin_worker": 1}) + ray.init(address=cluster.address) + + @ray.remote(resources={"pin_head": 1}) + def f(x): + return x + 1 + + @ray.remote(resources={"pin_worker": 1}) + def g(x): + borrowed_ref = x[0] + f_ref = f.remote(borrowed_ref) + # borrowed_ref should be inlined on future resolution and shouldn't be + # in Plasma. 
+ assert ray.worker.global_worker.core_worker.object_exists( + borrowed_ref, memory_store_only=True) + return ray.get(f_ref) * 2 + + one = ray.put(1) + g_ref = g.remote([one]) + assert ray.get(g_ref) == 4 + + if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 21fc462a7af6..dfbe8ef2ccd3 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1058,7 +1058,8 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m return Status::OK(); } -Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { +Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, + bool *is_in_plasma) { bool found = false; bool in_plasma = false; found = memory_store_->Contains(object_id, &in_plasma); @@ -1066,6 +1067,9 @@ Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { RAY_RETURN_NOT_OK(plasma_store_provider_->Contains(object_id, &found)); } *has_object = found; + if (is_in_plasma != nullptr) { + *is_in_plasma = found && in_plasma; + } return Status::OK(); } @@ -2091,25 +2095,43 @@ void CoreWorker::HandleGetObjectStatus(const rpc::GetObjectStatusRequest &reques send_reply_callback(Status::OK(), nullptr, nullptr); } else { RAY_CHECK(owner_address.worker_id() == request.owner_worker_id()); + bool is_freed = reference_counter_->IsPlasmaObjectFreed(object_id); - if (reference_counter_->IsPlasmaObjectFreed(object_id)) { - reply->set_status(rpc::GetObjectStatusReply::FREED); - } else { - reply->set_status(rpc::GetObjectStatusReply::CREATED); - } // Send the reply once the value has become available. The value is // guaranteed to become available eventually because we own the object and // its ref count is > 0. - // TODO(swang): We could probably just send the object value if it is small - // enough and we have it local. 
- memory_store_->GetAsync(object_id, - [send_reply_callback](std::shared_ptr obj) { - send_reply_callback(Status::OK(), nullptr, nullptr); - }); + memory_store_->GetAsync(object_id, [reply, send_reply_callback, + is_freed](std::shared_ptr obj) { + if (is_freed) { + reply->set_status(rpc::GetObjectStatusReply::FREED); + } else { + // If obj is the concrete object value, it is small, so we + // send the object back to the caller in the GetObjectStatus + // reply, bypassing a Plasma put and object transfer. If obj + // is an indicator that the object is in Plasma, we set an + // in_plasma indicator on the message, and the caller will + // have to facilitate a Plasma object transfer to get the + // object value. + auto *object = reply->mutable_object(); + if (obj->HasData()) { + const auto &data = obj->GetData(); + object->set_data(data->Data(), data->Size()); + } + if (obj->HasMetadata()) { + const auto &metadata = obj->GetMetadata(); + object->set_metadata(metadata->Data(), metadata->Size()); + } + for (const auto &nested_id : obj->GetNestedIds()) { + object->add_nested_inlined_ids(nested_id.Binary()); + } + reply->set_status(rpc::GetObjectStatusReply::CREATED); + } + send_reply_callback(Status::OK(), nullptr, nullptr); + }); } RemoveLocalReference(object_id); -} +} // namespace ray void CoreWorker::HandleWaitForActorOutOfScope( const rpc::WaitForActorOutOfScopeRequest &request, diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 088ba346a70c..3002b9003630 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -559,8 +559,10 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] object_id ID of the objects to check for. /// \param[out] has_object Whether or not the object is present. + /// \param[out] is_in_plasma Whether or not the object is in Plasma. /// \return Status. 
- Status Contains(const ObjectID &object_id, bool *has_object); + Status Contains(const ObjectID &object_id, bool *has_object, + bool *is_in_plasma = nullptr); /// Wait for a list of objects to appear in the object store. /// Duplicate object ids are supported, and `num_objects` includes duplicate ids in this diff --git a/src/ray/core_worker/future_resolver.cc b/src/ray/core_worker/future_resolver.cc index 8a1cc3f078ef..c625507cdbb5 100644 --- a/src/ray/core_worker/future_resolver.cc +++ b/src/ray/core_worker/future_resolver.cc @@ -28,30 +28,53 @@ void FutureResolver::ResolveFutureAsync(const ObjectID &object_id, rpc::GetObjectStatusRequest request; request.set_object_id(object_id.Binary()); request.set_owner_worker_id(owner_address.worker_id()); - conn->GetObjectStatus( - request, - [this, object_id](const Status &status, const rpc::GetObjectStatusReply &reply) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id - << " that was deserialized: " << status.ToString(); - } + conn->GetObjectStatus(request, [this, object_id]( + const Status &status, + const rpc::GetObjectStatusReply &reply) { + if (!status.ok()) { + RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id + << " that was deserialized: " << status.ToString(); + } - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { - // The owner is gone or the owner replied that the object has gone - // out of scope (this is an edge case in the distributed ref counting - // protocol where a borrower dies before it can notify the owner of - // another borrower). Store an error so that an exception will be - // thrown immediately when the worker tries to get the value. - RAY_UNUSED(in_memory_store_->Put( - RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); - } else { - // We can now try to fetch the object via plasma. 
If the owner later - // fails or the object is released, the raylet will eventually store - // an error in plasma on our behalf. - RAY_UNUSED(in_memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), - object_id)); - } - }); + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { + // The owner is gone or the owner replied that the object has gone + // out of scope (this is an edge case in the distributed ref counting + // protocol where a borrower dies before it can notify the owner of + // another borrower). Store an error so that an exception will be + // thrown immediately when the worker tries to get the value. + RAY_UNUSED(in_memory_store_->Put( + RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); + } else if (reply.status() == rpc::GetObjectStatusReply::CREATED) { + // The object is either an indicator that the object is in Plasma, or + // the object has been returned directly in the reply. In either + // case, we put the corresponding RayObject into the in-memory store. + // If the owner later fails or the object is released, the raylet + // will eventually store an error in Plasma on our behalf. 
+ const auto &data = reply.object().data(); + std::shared_ptr data_buffer; + if (data.size() > 0) { + RAY_LOG(DEBUG) << "Object returned directly in GetObjectStatus reply, putting " + << object_id << " in memory store"; + data_buffer = std::make_shared( + const_cast(reinterpret_cast(data.data())), + data.size()); + } else { + RAY_LOG(DEBUG) << "Object not returned directly in GetObjectStatus reply, " + << object_id << " will have to be fetched from Plasma"; + } + const auto &metadata = reply.object().metadata(); + std::shared_ptr metadata_buffer; + if (metadata.size() > 0) { + metadata_buffer = std::make_shared( + const_cast(reinterpret_cast(metadata.data())), + metadata.size()); + } + auto inlined_ids = + IdVectorFromProtobuf(reply.object().nested_inlined_ids()); + RAY_UNUSED(in_memory_store_->Put( + RayObject(data_buffer, metadata_buffer, inlined_ids), object_id)); + } + }); } } // namespace ray diff --git a/src/ray/core_worker/future_resolver.h b/src/ray/core_worker/future_resolver.h index be504a582f3d..b774434b71da 100644 --- a/src/ray/core_worker/future_resolver.h +++ b/src/ray/core_worker/future_resolver.h @@ -16,6 +16,7 @@ #include +#include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/rpc/worker/core_worker_client.h" diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 799530d274e9..43dfaa45bbe0 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -132,6 +132,15 @@ message GetObjectStatusRequest { bytes object_id = 2; } +message RayObject { + // Data of the object. + bytes data = 1; + // Metadata of the object. + bytes metadata = 2; + // ObjectIDs that were nested in data. This is only set for inlined objects. 
+ repeated bytes nested_inlined_ids = 3; +} + message GetObjectStatusReply { enum ObjectStatus { CREATED = 0; @@ -139,6 +148,9 @@ message GetObjectStatusReply { FREED = 2; } ObjectStatus status = 1; + // The Ray object: either a concrete value, an in-Plasma indicator, or an + // exception. + RayObject object = 2; } message WaitForActorOutOfScopeRequest { From 68038741ac2e1892db2456fed71083996613c884 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Thu, 21 Jan 2021 09:16:02 -0800 Subject: [PATCH 007/245] [serve] Refactor BackendState to use ReplicaState classes (#13406) --- python/ray/serve/backend_state.py | 533 +++++++++++++++++------------ python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 + 4 files changed, 327 insertions(+), 217 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 673c4b2cfbc8..4aad2671ea4e 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,7 +1,8 @@ import asyncio -from asyncio.futures import Future from collections import defaultdict -from typing import Dict, Any, List, Optional, Set, Tuple +from enum import Enum +import time +from typing import Dict, List, Optional, Tuple import ray import ray.cloudpickle as pickle @@ -17,7 +18,6 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey -from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,6 +30,150 @@ _RESOURCE_CHECK_ENABLED = True +class ReplicaState(Enum): + SHOULD_START = 1 + STARTING = 2 + RUNNING = 3 + SHOULD_STOP = 4 + STOPPING = 5 + STOPPED = 6 + + +class BackendReplica: + def __init__(self, controller_name: str, detached: bool, + replica_tag: ReplicaTag, backend_tag: BackendTag): + self._actor_name = 
format_actor_name(replica_tag, controller_name) + self._controller_name = controller_name + self._detached = detached + self._replica_tag = replica_tag + self._backend_tag = backend_tag + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._state = ReplicaState.SHOULD_START + + def __get_state__(self): + clean_dict = self.__dict__.copy() + del clean_dict["_actor_handle"] + del clean_dict["_startup_obj_ref"] + del clean_dict["_drain_obj_ref"] + return clean_dict + + def __set_state__(self, d): + self.__dict__ = d + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._recover_from_checkpoint() + + def _recover_from_checkpoint(self): + if self._state == ReplicaState.STARTING: + # We do not need to pass in the class here because the actor + # creation has already been started if this class was checkpointed + # in the STARTING state. + self.start() + elif self._state == ReplicaState.RUNNING: + # Fetch actor handles for all backend replicas in the system. + # The actors must exist if this class was checkpointed in the + # RUNNING state. 
+ self._actor_handle = ray.get_actor(self._actor_name) + elif self._state == ReplicaState.STOPPING: + self.stop() + + def start(self, backend_info: Optional[BackendInfo]): + assert self._state in { + ReplicaState.SHOULD_START, ReplicaState.STARTING + }, (f"State must be {ReplicaState.SHOULD_START} or " + f"{ReplicaState.STARTING}, *not* {self._state}") + try: + self._actor_handle = ray.get_actor(self._actor_name) + except ValueError: + logger.debug("Starting replica '{}' for backend '{}'.".format( + self._replica_tag, self._backend_tag)) + self._actor_handle = ray.remote(backend_info.worker_class).options( + name=self._actor_name, + lifetime="detached" if self._detached else None, + max_restarts=-1, + max_task_retries=-1, + **backend_info.replica_config.ray_actor_options).remote( + self._backend_tag, self._replica_tag, + backend_info.replica_config.actor_init_args, + backend_info.backend_config, self._controller_name) + self._startup_obj_ref = self._actor_handle.ready.remote() + self._state = ReplicaState.STARTING + + def check_started(self): + if self._state == ReplicaState.RUNNING: + return True + assert self._state == ReplicaState.STARTING, ( + f"State must be {ReplicaState.STARTING}, *not* {self._state}") + ready, _ = ray.wait([self._startup_obj_ref], timeout=0) + if len(ready) == 1: + self._state = ReplicaState.RUNNING + return True + return False + + def set_should_stop(self, graceful_shutdown_timeout_s: Duration): + self._state = ReplicaState.SHOULD_STOP + self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s + + def stop(self): + # We need to handle transitions from: + # SHOULD_START -> SHOULD_STOP -> STOPPING + # This means that the replica_handle may not have been created. 
+ + assert self._state in { + ReplicaState.SHOULD_STOP, ReplicaState.STOPPING + }, (f"State must be {ReplicaState.SHOULD_STOP} or " + f"{ReplicaState.STOPPING}, *not* {self._state}") + + def drain_actor(actor_name): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. + try: + replica = ray.get_actor(actor_name) + except ValueError: + return None + return replica.drain_pending_queries.remote() + + self._state = ReplicaState.STOPPING + self._drain_obj_ref = drain_actor(self._actor_name) + self._shutdown_deadline = time.time( + ) + self._graceful_shutdown_timeout_s + + def check_stopped(self): + if self._state == ReplicaState.STOPPED: + return True + assert self._state == ReplicaState.STOPPING, ( + f"State must be {ReplicaState.STOPPING}, *not* {self._state}") + + try: + replica = ray.get_actor(self._actor_name) + except ValueError: + self._state = ReplicaState.STOPPED + return True + + ready, _ = ray.wait([self._drain_obj_ref], timeout=0) + timeout_passed = time.time() > self._shutdown_deadline + + if len(ready) == 1 or timeout_passed: + if timeout_passed: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{self._actor_name} did not shutdown after " + f"{self._graceful_shutdown_timeout_s}s, force-killing.") + + ray.kill(replica, no_restart=True) + self._state = ReplicaState.STOPPED + return True + return False + + def get_actor_handle(self): + assert self._state == ReplicaState.RUNNING, ( + f"State must be {ReplicaState.RUNNING}, *not* {self._state}") + return self._actor_handle + + class BackendState: """Manages all state for backends in the system. @@ -46,79 +190,65 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - # Non-checkpointed state. 
- self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag, ActorHandle]] = dict() - self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag]] = dict() - - # Checkpointed state. - self.backends: Dict[BackendTag, BackendInfo] = dict() - self.backend_replicas: Dict[BackendTag, Dict[ - ReplicaTag, ActorHandle]] = defaultdict(dict) + self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ + BackendReplica]]] = defaultdict(lambda: defaultdict(list)) + self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() + self._target_replicas: Dict[BackendTag, int] = defaultdict(int) self.backend_goals: Dict[BackendTag, GoalId] = dict() - self.backend_replicas_to_start: Dict[BackendTag, List[ - ReplicaTag]] = defaultdict(list) - self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ - ReplicaTag, Duration]]] = defaultdict(list) - self.backends_to_remove: List[BackendTag] = list() + + # Un-Checkpointed state. + self.pending_goals: Dict[GoalId, asyncio.Event] = dict() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backend_to_remove, - pending_goal_ids) = pickle.loads(checkpoint) + (self._replicas, self._backend_metadata, self._target_replicas, + self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) - # Fetch actor handles for all backend replicas in the system. - # All of these backend_replicas are guaranteed to already exist - # because they would not be written to a checkpoint in - # self.backend_replicas until they were created. 
- for backend_tag, replica_dict in self.backend_replicas.items(): - for replica_tag in replica_dict.keys(): - replica_name = format_actor_name(replica_tag, - self._controller_name) - self.backend_replicas[backend_tag][ - replica_tag] = ray.get_actor(replica_name) - self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps( - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backends_to_remove, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps((self._replicas, self._backend_metadata, + self._target_replicas, self.backend_goals, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) + def get_running_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return { + backend_tag: { + backend_replica._replica_tag: + backend_replica.get_actor_handle() + for backend_replica in state_to_replica_dict[ + ReplicaState.RUNNING] + } + for backend_tag, state_to_replica_dict in self._replicas.items() + } + def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in self.backend_replicas.items() + for backend_tag, replica_dict in + self.get_running_replica_handles().items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self.backends.items() + for tag, info in self._backend_metadata.items() } - def get_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return self.backend_replicas - def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return self.backends.get(backend_tag) + 
return self._backend_metadata.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -126,7 +256,11 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self.backends[backend_tag] = backend_info + self._backend_metadata[backend_tag] = backend_info + self._target_replicas[ + backend_tag] = backend_info.backend_config.num_replicas + else: + self._target_replicas[backend_tag] = 0 self.backend_goals[backend_tag] = new_goal_id @@ -136,31 +270,25 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self.backends.get(backend_tag) + backend_info = self._backend_metadata.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica = create_backend_replica(replica_config.func_or_class) + backend_replica_class = create_backend_replica( + replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica, + worker_class=backend_replica_class, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) - try: - self.scale_backend_replicas(backend_tag, - backend_config.num_replicas) - except RayServeException as e: - del self.backends[backend_tag] - raise e - # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. @@ -175,20 +303,15 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. 
We should validate that the # specified backend exists on the client. - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: return None - # Scale its replicas down to 0. - self.scale_backend_replicas(backend_tag, 0, force_kill) - - # Remove the backend's metadata. - del self.backends[backend_tag] - - # Add the intention to remove the backend from the routers. - self.backends_to_remove.append(backend_tag) - new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) + if force_kill: + self._backend_metadata[ + backend_tag].backend_config.\ + experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -197,20 +320,18 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self.backends[backend_tag].backend_config + stored_backend_config = self._backend_metadata[ + backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self.backends[backend_tag].backend_config = updated_config + self._backend_metadata[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self.backends[backend_tag]) - - # Scale the replicas with the new configuration. 
- self.scale_backend_replicas(backend_tag, updated_config.num_replicas) + backend_tag, self._backend_metadata[backend_tag]) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -260,31 +381,38 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - num_replicas: int, - force_kill: bool = False, - ) -> None: + ) -> bool: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds the intention to start/stop them to self.backend_replicas_to_start - and self.backend_replicas_to_stop. The caller is responsible for then - first writing a checkpoint and then actually starting/stopping the - intended replicas. This avoids inconsistencies with starting/stopping a - replica and then crashing before writing a checkpoint. + adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. + The caller is responsible for then first writing a checkpoint and then + actually starting/stopping the intended replicas. This avoids + inconsistencies with starting/stopping a replica and then crashing + before writing a checkpoint. 
""" + num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self.backends + assert (backend_tag in self._backend_metadata ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = len(self.backend_replicas[backend_tag]) + current_num_replicas = sum([ + len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), + len(self._replicas[backend_tag][ReplicaState.STARTING]), + len(self._replicas[backend_tag][ReplicaState.RUNNING]), + ]) + delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self.backends[backend_tag] - if delta_num_replicas > 0: + backend_info: BackendInfo = self._backend_metadata[backend_tag] + if delta_num_replicas == 0: + return False + + elif delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -292,10 +420,11 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - raise RayServeException( + logger.error( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. To fix this, consider scaling to replica to " + "to be added. This is not a problem if the cluster is " + "autoscaling. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -305,154 +434,132 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self.backend_replicas_to_start[backend_tag].append(replica_tag) + self._replicas[backend_tag][ReplicaState.SHOULD_START].append( + BackendReplica(self._controller_name, self._detached, + replica_tag, backend_tag)) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert len( - self.backend_replicas[backend_tag]) >= delta_num_replicas - replicas_copy = self.backend_replicas.copy() + assert self._target_replicas[backend_tag] >= delta_num_replicas + for _ in range(-delta_num_replicas): - replica_tag, _ = replicas_copy[backend_tag].popitem() + replica_state_dict = self._replicas[backend_tag] + list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ + or replica_state_dict[ReplicaState.STARTING] \ + or replica_state_dict[ReplicaState.RUNNING] + + assert len(list_to_use), replica_state_dict + replica_to_stop = list_to_use.pop() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - if force_kill: - graceful_timeout_s = 0 - self.backend_replicas_to_stop[backend_tag].append(( - replica_tag, - graceful_timeout_s, - )) - - def _start_pending_replicas(self): - for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ - items(): - for replica_tag in replicas_to_create: - replica_handle = self._start_backend_replica( - backend_tag, replica_tag) - ready_future = replica_handle.ready.remote().as_future() - self.currently_starting_replicas[ready_future] = ( - backend_tag, replica_tag, replica_handle) - - def _stop_pending_replicas(self): - for backend_tag, replicas_to_stop in ( - self.backend_replicas_to_stop.items()): - for replica_tag, shutdown_timeout in replicas_to_stop: - replica_name = format_actor_name(replica_tag, - self._controller_name) - - async def kill_actor(replica_name_to_use): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(replica_name_to_use) - except ValueError: - return - - try: - await asyncio.wait_for( - replica.drain_pending_queries.remote(), - timeout=shutdown_timeout) - except asyncio.TimeoutError: - # Graceful period passed, kill it forcefully. 
- logger.debug( - f"{replica_name_to_use} did not shutdown after " - f"{shutdown_timeout}s, killing.") - finally: - ray.kill(replica, no_restart=True) - - self.currently_stopping_replicas[asyncio.ensure_future( - kill_actor(replica_name))] = (backend_tag, replica_tag) - - async def _check_currently_starting_replicas(self) -> int: - """Returns the number of pending replicas waiting to start""" - in_flight: Set[Future[Any]] = set() - - if self.currently_starting_replicas: - done, in_flight = await asyncio.wait( - list(self.currently_starting_replicas.keys()), timeout=0) - for fut in done: - (backend_tag, replica_tag, - replica_handle) = self.currently_starting_replicas.pop(fut) - self.backend_replicas[backend_tag][ - replica_tag] = replica_handle - - backend = self.backend_replicas_to_start.get(backend_tag) - if backend: - try: - backend.remove(replica_tag) - except ValueError: - pass - if len(backend) == 0: - del self.backend_replicas_to_start[backend_tag] - - async def _check_currently_stopping_replicas(self) -> int: - """Returns the number of replicas waiting to stop""" - in_flight: Set[Future[Any]] = set() - - if self.currently_stopping_replicas: - done_stopping, in_flight = await asyncio.wait( - list(self.currently_stopping_replicas.keys()), timeout=0) - for fut in done_stopping: - (backend_tag, - replica_tag) = self.currently_stopping_replicas.pop(fut) - - backend_to_stop = self.backend_replicas_to_stop.get( - backend_tag) - - if backend_to_stop: - try: - backend_to_stop.remove(replica_tag) - except ValueError: - pass - if len(backend_to_stop) == 0: - del self.backend_replicas_to_stop[backend_tag] - - backend = self.backend_replicas.get(backend_tag) - if backend: - try: - del backend[replica_tag] - except KeyError: - pass - - if len(self.backend_replicas[backend_tag]) == 0: - del self.backend_replicas[backend_tag] + + replica_to_stop.set_should_stop(graceful_timeout_s) + self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( + replica_to_stop) + + return 
True + + def scale_all_backends(self): + checkpoint_needed = False + for backend_tag, num_replicas in list(self._target_replicas.items()): + checkpoint_needed = (checkpoint_needed + or self.scale_backend_replicas(backend_tag)) + if num_replicas == 0: + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if checkpoint_needed: + self._checkpoint() + + def _pop_replicas_of_state(self, state: ReplicaState + ) -> List[Tuple[ReplicaState, BackendTag]]: + replicas = [] + for backend_tag, state_to_replica_dict in self._replicas.items(): + if state in state_to_replica_dict: + replicas.extend( + (replica, backend_tag) + for replica in state_to_replica_dict.pop(state)) + + return replicas def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self.backend_replicas.keys()).union( - set(self.backends.keys())) + all_tags = set(self._replicas.keys()).union( + set(self._backend_metadata.keys())) for backend_tag in all_tags: - desired_info = self.backends.get(backend_tag) - existing_info = self.backend_replicas.get(backend_tag) + desired_num_replicas = self._target_replicas.get(backend_tag) + state_dict = self._replicas.get(backend_tag, {}) + existing_info = state_dict.get(ReplicaState.RUNNING, []) + + # If we have pending ops, the current goal is *not* ready + if (state_dict.get(ReplicaState.SHOULD_START) + or state_dict.get(ReplicaState.STARTING) + or state_dict.get(ReplicaState.SHOULD_STOP) + or state_dict.get(ReplicaState.STOPPING)): + continue + + # TODO(ilr): FIX # Check for deleting - if (not desired_info or - desired_info.backend_config.num_replicas == 0) and \ + if (not desired_num_replicas or + desired_num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append(self.backend_goals.get(backend_tag)) + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) # Check for a non-zero number of backends - if desired_info and existing_info and 
desired_info.backend_config.\ - num_replicas == len(existing_info): - completed_goals.append(self.backend_goals.get(backend_tag)) + if (desired_num_replicas and existing_info) \ + and desired_num_replicas == len(existing_info): + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: + self.scale_all_backends() + for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - self._start_pending_replicas() - self._stop_pending_replicas() - - num_starting = len(self.currently_starting_replicas) - num_stopping = len(self.currently_stopping_replicas) - - await self._check_currently_starting_replicas() - await self._check_currently_stopping_replicas() - - if (len(self.currently_starting_replicas) != num_starting) or \ - (len(self.currently_stopping_replicas) != num_stopping): + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_START): + replica_state.start(self._backend_metadata[backend_tag]) + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_STOP): + replica_state.stop() + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + transition_triggered = False + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STARTING): + if replica_state.check_started(): + self._replicas[backend_tag][ReplicaState.RUNNING].append( + replica_state) + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STOPPING): + if replica_state.check_stopped(): + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + for backend_tag in list(self._replicas.keys()): + if not 
any(self._replicas[backend_tag]): + del self._replicas[backend_tag] + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if transition_triggered: self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 205af81b065a..41a1eca08ae8 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, PositiveFloat, PositiveInt, validator +from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 + experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index a3c75c711878..b5c65111a8f9 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_replica_handles() + return self.backend_state.get_running_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_replica_handles( + for replica_dict in self.backend_state.get_running_replica_handles( ).values(): for replica in replica_dict.values(): 
ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index 202b01386059..a35f7e54b361 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,6 +683,9 @@ def f(): client.create_endpoint("endpoint", backend="backend") +# This error is only printed because creation is run in the control loop, not +# in the API path. +@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance From 87ca102c9300ce48106515f1a66a431b2fd9e25e Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Thu, 21 Jan 2021 10:00:37 -0800 Subject: [PATCH 008/245] [Kubernetes] Unit test for cluster launch and teardown using K8s Operator (#13437) --- ci/travis/ci.sh | 1 + python/ray/tests/BUILD | 3 +- .../ray/tests/test_k8s_operator_examples.py | 150 ++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 python/ray/tests/test_k8s_operator_examples.py diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index a403a4a9f522..d9c679bc7218 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -164,6 +164,7 @@ test_python() { -python/ray/tests:test_stress # timeout -python/ray/tests:test_stress_sharded # timeout -python/ray/tests:test_k8s_cluster_launcher + -python/ray/tests:test_k8s_operator_examples ) fi if [ 0 -lt "${#args[@]}" ]; then # Any targets to test? 
diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 0f2709c82fc0..7f4c61bb1cfb 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -111,8 +111,9 @@ py_test_module_list( py_test_module_list( files = [ "test_k8s_cluster_launcher.py", + "test_k8s_operator_examples.py", ], - size = "small", + size = "medium", extra_srcs = SRCS, deps = ["//:ray_lib"], tags = ["kubernetes"] diff --git a/python/ray/tests/test_k8s_operator_examples.py b/python/ray/tests/test_k8s_operator_examples.py new file mode 100644 index 000000000000..6ca2aca370b2 --- /dev/null +++ b/python/ray/tests/test_k8s_operator_examples.py @@ -0,0 +1,150 @@ +"""Tests launch and teardown of multiple Ray clusters using Kubernetes +operator.""" +import sys +import os +import subprocess +import tempfile +import time +import unittest + +import kubernetes +import pytest +import yaml + +IMAGE_ENV = "KUBERNETES_OPERATOR_TEST_IMAGE" +IMAGE = os.getenv(IMAGE_ENV, "rayproject/ray:nightly") +NAMESPACE = "test-k8s-operator-examples" + + +def retry_until_true(f): + # Retry 60 times with 1 second delay between attempts. + def f_with_retries(*args, **kwargs): + for _ in range(60): + if f(*args, **kwargs): + return + else: + time.sleep(1) + pytest.fail("The condition wasn't met before the timeout expired.") + + return f_with_retries + + +@retry_until_true +def wait_for_pods(n): + client = kubernetes.client.CoreV1Api() + pods = client.list_namespaced_pod(namespace=NAMESPACE).items + # Double-check that the correct image is use. 
+ for pod in pods: + assert pod.spec.containers[0].image == IMAGE + return len(pods) == n + + +@retry_until_true +def wait_for_logs(): + """Check if logs indicate presence of nodes of types "head-node" and + "worker-nodes" in the "example-cluster" cluster.""" + cmd = f"kubectl -n {NAMESPACE} logs ray-operator-pod"\ + "| grep ^example-cluster: | tail -n 100" + log_tail = subprocess.check_output(cmd, shell=True).decode() + return ("head-node" in log_tail) and ("worker-nodes" in log_tail) + + +def operator_configs_directory(): + here = os.path.realpath(__file__) + ray_python_root = os.path.dirname(os.path.dirname(here)) + relative_path = "autoscaler/kubernetes/operator_configs" + return os.path.join(ray_python_root, relative_path) + + +def get_operator_config_path(file_name): + return os.path.join(operator_configs_directory(), file_name) + + +class KubernetesOperatorTest(unittest.TestCase): + def test_examples(self): + with tempfile.NamedTemporaryFile("w+") as example_cluster_file, \ + tempfile.NamedTemporaryFile("w+") as example_cluster2_file,\ + tempfile.NamedTemporaryFile("w+") as operator_file: + + # Get paths to operator configs + example_cluster_config_path = get_operator_config_path( + "example_cluster.yaml") + example_cluster2_config_path = get_operator_config_path( + "example_cluster2.yaml") + operator_config_path = get_operator_config_path("operator.yaml") + self.crd_path = get_operator_config_path("cluster_crd.yaml") + + # Load operator configs + example_cluster_config = yaml.safe_load( + open(example_cluster_config_path).read()) + example_cluster2_config = yaml.safe_load( + open(example_cluster2_config_path).read()) + operator_config = list( + yaml.safe_load_all(open(operator_config_path).read())) + + # Fill image fields + podTypes = example_cluster_config["spec"]["podTypes"] + podTypes2 = example_cluster2_config["spec"]["podTypes"] + pod_configs = ([operator_config[-1]] + [ + podType["podConfig"] for podType in podTypes + ] + [podType["podConfig"] for 
podType in podTypes2]) + for pod_config in pod_configs: + pod_config["spec"]["containers"][0]["image"] = IMAGE + + # Dump to temporary files + yaml.dump(example_cluster_config, example_cluster_file) + yaml.dump(example_cluster2_config, example_cluster2_file) + yaml.dump_all(operator_config, operator_file) + files = [ + example_cluster_file, example_cluster2_file, operator_file + ] + for file in files: + file.flush() + + # Apply CR + cmd = f"kubectl apply -f {self.crd_path}" + subprocess.check_call(cmd, shell=True) + + # Create namespace + cmd = f"kubectl create namespace {NAMESPACE}" + subprocess.check_call(cmd, shell=True) + + # Start operator and two clusters + for file in files: + cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}" + subprocess.check_call(cmd, shell=True) + + # Check that autoscaling respects minWorkers by waiting for + # six pods in the namespace. + wait_for_pods(6) + + # Check that logging output looks normal (two workers connected to + # ray cluster example-cluster.) + wait_for_logs() + + # Delete the second cluster + cmd = f"kubectl -n {NAMESPACE} delete -f"\ + f"{example_cluster2_file.name}" + subprocess.check_call(cmd, shell=True) + + # Four pods remain + wait_for_pods(4) + + # Delete the first cluster + cmd = f"kubectl -n {NAMESPACE} delete -f"\ + f"{example_cluster_file.name}" + subprocess.check_call(cmd, shell=True) + + # Only operator pod remains. + wait_for_pods(1) + + def __del__(self): + cmd = f"kubectl delete -f {self.crd_path}" + subprocess.check_call(cmd, shell=True) + cmd = f"kubectl delete namespace {NAMESPACE}" + subprocess.check_call(cmd, shell=True) + + +if __name__ == "__main__": + kubernetes.config.load_kube_config() + sys.exit(pytest.main(["-v", __file__])) From 20acc3b05e093d5bc6dbd83a70bf9b1d6c144434 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Thu, 21 Jan 2021 16:10:34 -0800 Subject: [PATCH 009/245] Revert "Inline small objects in GetObjectStatus response. 
(#13309)" (#13615) This reverts commit a82fa80f7b00863d1732d7e74ba6b63b383f7a90. --- python/ray/_raylet.pyx | 7 ++- python/ray/includes/libcoreworker.pxd | 3 +- python/ray/tests/test_advanced.py | 37 -------------- src/ray/core_worker/core_worker.cc | 48 +++++------------- src/ray/core_worker/core_worker.h | 4 +- src/ray/core_worker/future_resolver.cc | 69 +++++++++----------------- src/ray/core_worker/future_resolver.h | 1 - src/ray/protobuf/core_worker.proto | 12 ----- 8 files changed, 41 insertions(+), 140 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 4b5f9deeef1a..8ba80852fb40 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -898,17 +898,16 @@ cdef class CoreWorker: return RayObjectsToDataMetadataPairs(results) - def object_exists(self, ObjectRef object_ref, memory_store_only=False): + def object_exists(self, ObjectRef object_ref): cdef: c_bool has_object - c_bool is_in_plasma CObjectID c_object_id = object_ref.native() with nogil: check_status(CCoreWorkerProcess.GetCoreWorker().Contains( - c_object_id, &has_object, &is_in_plasma)) + c_object_id, &has_object)) - return has_object and (not memory_store_only or not is_in_plasma) + return has_object cdef _create_put_buffer(self, shared_ptr[CBuffer] &metadata, size_t data_size, ObjectRef object_ref, diff --git a/python/ray/includes/libcoreworker.pxd b/python/ray/includes/libcoreworker.pxd index 637dbd750020..f1acad1fadd8 100644 --- a/python/ray/includes/libcoreworker.pxd +++ b/python/ray/includes/libcoreworker.pxd @@ -183,8 +183,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil: CRayStatus Get(const c_vector[CObjectID] &ids, int64_t timeout_ms, c_vector[shared_ptr[CRayObject]] *results, c_bool plasma_objects_only) - CRayStatus Contains(const CObjectID &object_id, c_bool *has_object, - c_bool *is_in_plasma) + CRayStatus Contains(const CObjectID &object_id, c_bool *has_object) CRayStatus Wait(const c_vector[CObjectID] &object_ids, int num_objects, int64_t 
timeout_ms, c_vector[c_bool] *results, c_bool fetch_local) diff --git a/python/ray/tests/test_advanced.py b/python/ray/tests/test_advanced.py index 8f607009ee49..6df746fdcd91 100644 --- a/python/ray/tests/test_advanced.py +++ b/python/ray/tests/test_advanced.py @@ -521,43 +521,6 @@ def method(self): assert ray.worker.global_worker.core_worker.object_exists(x_id) -@pytest.mark.skipif(client_test_enabled(), reason="internal api") -def test_future_resolution_skip_plasma(ray_start_cluster): - cluster = ray_start_cluster - # Disable worker caching so worker leases are not reused; set object - # inlining size threshold and enable storing of small objects in in-memory - # object store so the borrowed ref is inlined. - cluster.add_node( - num_cpus=1, - resources={"pin_head": 1}, - _system_config={ - "worker_lease_timeout_milliseconds": 0, - "max_direct_call_object_size": 100 * 1024, - "put_small_object_in_memory_store": True, - }, - ) - cluster.add_node(num_cpus=1, resources={"pin_worker": 1}) - ray.init(address=cluster.address) - - @ray.remote(resources={"pin_head": 1}) - def f(x): - return x + 1 - - @ray.remote(resources={"pin_worker": 1}) - def g(x): - borrowed_ref = x[0] - f_ref = f.remote(borrowed_ref) - # borrowed_ref should be inlined on future resolution and shouldn't be - # in Plasma. 
- assert ray.worker.global_worker.core_worker.object_exists( - borrowed_ref, memory_store_only=True) - return ray.get(f_ref) * 2 - - one = ray.put(1) - g_ref = g.remote([one]) - assert ray.get(g_ref) == 4 - - if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index dfbe8ef2ccd3..21fc462a7af6 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1058,8 +1058,7 @@ Status CoreWorker::Get(const std::vector &ids, const int64_t timeout_m return Status::OK(); } -Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, - bool *is_in_plasma) { +Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object) { bool found = false; bool in_plasma = false; found = memory_store_->Contains(object_id, &in_plasma); @@ -1067,9 +1066,6 @@ Status CoreWorker::Contains(const ObjectID &object_id, bool *has_object, RAY_RETURN_NOT_OK(plasma_store_provider_->Contains(object_id, &found)); } *has_object = found; - if (is_in_plasma != nullptr) { - *is_in_plasma = found && in_plasma; - } return Status::OK(); } @@ -2095,43 +2091,25 @@ void CoreWorker::HandleGetObjectStatus(const rpc::GetObjectStatusRequest &reques send_reply_callback(Status::OK(), nullptr, nullptr); } else { RAY_CHECK(owner_address.worker_id() == request.owner_worker_id()); - bool is_freed = reference_counter_->IsPlasmaObjectFreed(object_id); + if (reference_counter_->IsPlasmaObjectFreed(object_id)) { + reply->set_status(rpc::GetObjectStatusReply::FREED); + } else { + reply->set_status(rpc::GetObjectStatusReply::CREATED); + } // Send the reply once the value has become available. The value is // guaranteed to become available eventually because we own the object and // its ref count is > 0. 
- memory_store_->GetAsync(object_id, [reply, send_reply_callback, - is_freed](std::shared_ptr obj) { - if (is_freed) { - reply->set_status(rpc::GetObjectStatusReply::FREED); - } else { - // If obj is the concrete object value, it is small, so we - // send the object back to the caller in the GetObjectStatus - // reply, bypassing a Plasma put and object transfer. If obj - // is an indicator that the object is in Plasma, we set an - // in_plasma indicator on the message, and the caller will - // have to facilitate a Plasma object transfer to get the - // object value. - auto *object = reply->mutable_object(); - if (obj->HasData()) { - const auto &data = obj->GetData(); - object->set_data(data->Data(), data->Size()); - } - if (obj->HasMetadata()) { - const auto &metadata = obj->GetMetadata(); - object->set_metadata(metadata->Data(), metadata->Size()); - } - for (const auto &nested_id : obj->GetNestedIds()) { - object->add_nested_inlined_ids(nested_id.Binary()); - } - reply->set_status(rpc::GetObjectStatusReply::CREATED); - } - send_reply_callback(Status::OK(), nullptr, nullptr); - }); + // TODO(swang): We could probably just send the object value if it is small + // enough and we have it local. + memory_store_->GetAsync(object_id, + [send_reply_callback](std::shared_ptr obj) { + send_reply_callback(Status::OK(), nullptr, nullptr); + }); } RemoveLocalReference(object_id); -} // namespace ray +} void CoreWorker::HandleWaitForActorOutOfScope( const rpc::WaitForActorOutOfScopeRequest &request, diff --git a/src/ray/core_worker/core_worker.h b/src/ray/core_worker/core_worker.h index 3002b9003630..088ba346a70c 100644 --- a/src/ray/core_worker/core_worker.h +++ b/src/ray/core_worker/core_worker.h @@ -559,10 +559,8 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler { /// /// \param[in] object_id ID of the objects to check for. /// \param[out] has_object Whether or not the object is present. - /// \param[out] is_in_plasma Whether or not the object is in Plasma. 
/// \return Status. - Status Contains(const ObjectID &object_id, bool *has_object, - bool *is_in_plasma = nullptr); + Status Contains(const ObjectID &object_id, bool *has_object); /// Wait for a list of objects to appear in the object store. /// Duplicate object ids are supported, and `num_objects` includes duplicate ids in this diff --git a/src/ray/core_worker/future_resolver.cc b/src/ray/core_worker/future_resolver.cc index c625507cdbb5..8a1cc3f078ef 100644 --- a/src/ray/core_worker/future_resolver.cc +++ b/src/ray/core_worker/future_resolver.cc @@ -28,53 +28,30 @@ void FutureResolver::ResolveFutureAsync(const ObjectID &object_id, rpc::GetObjectStatusRequest request; request.set_object_id(object_id.Binary()); request.set_owner_worker_id(owner_address.worker_id()); - conn->GetObjectStatus(request, [this, object_id]( - const Status &status, - const rpc::GetObjectStatusReply &reply) { - if (!status.ok()) { - RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id - << " that was deserialized: " << status.ToString(); - } + conn->GetObjectStatus( + request, + [this, object_id](const Status &status, const rpc::GetObjectStatusReply &reply) { + if (!status.ok()) { + RAY_LOG(WARNING) << "Error retrieving the value of object ID " << object_id + << " that was deserialized: " << status.ToString(); + } - if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { - // The owner is gone or the owner replied that the object has gone - // out of scope (this is an edge case in the distributed ref counting - // protocol where a borrower dies before it can notify the owner of - // another borrower). Store an error so that an exception will be - // thrown immediately when the worker tries to get the value. 
- RAY_UNUSED(in_memory_store_->Put( - RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); - } else if (reply.status() == rpc::GetObjectStatusReply::CREATED) { - // The object is either an indicator that the object is in Plasma, or - // the object has been returned directly in the reply. In either - // case, we put the corresponding RayObject into the in-memory store. - // If the owner later fails or the object is released, the raylet - // will eventually store an error in Plasma on our behalf. - const auto &data = reply.object().data(); - std::shared_ptr data_buffer; - if (data.size() > 0) { - RAY_LOG(DEBUG) << "Object returned directly in GetObjectStatus reply, putting " - << object_id << " in memory store"; - data_buffer = std::make_shared( - const_cast(reinterpret_cast(data.data())), - data.size()); - } else { - RAY_LOG(DEBUG) << "Object not returned directly in GetObjectStatus reply, " - << object_id << " will have to be fetched from Plasma"; - } - const auto &metadata = reply.object().metadata(); - std::shared_ptr metadata_buffer; - if (metadata.size() > 0) { - metadata_buffer = std::make_shared( - const_cast(reinterpret_cast(metadata.data())), - metadata.size()); - } - auto inlined_ids = - IdVectorFromProtobuf(reply.object().nested_inlined_ids()); - RAY_UNUSED(in_memory_store_->Put( - RayObject(data_buffer, metadata_buffer, inlined_ids), object_id)); - } - }); + if (!status.ok() || reply.status() == rpc::GetObjectStatusReply::OUT_OF_SCOPE) { + // The owner is gone or the owner replied that the object has gone + // out of scope (this is an edge case in the distributed ref counting + // protocol where a borrower dies before it can notify the owner of + // another borrower). Store an error so that an exception will be + // thrown immediately when the worker tries to get the value. + RAY_UNUSED(in_memory_store_->Put( + RayObject(rpc::ErrorType::OBJECT_UNRECONSTRUCTABLE), object_id)); + } else { + // We can now try to fetch the object via plasma. 
If the owner later + // fails or the object is released, the raylet will eventually store + // an error in plasma on our behalf. + RAY_UNUSED(in_memory_store_->Put(RayObject(rpc::ErrorType::OBJECT_IN_PLASMA), + object_id)); + } + }); } } // namespace ray diff --git a/src/ray/core_worker/future_resolver.h b/src/ray/core_worker/future_resolver.h index b774434b71da..be504a582f3d 100644 --- a/src/ray/core_worker/future_resolver.h +++ b/src/ray/core_worker/future_resolver.h @@ -16,7 +16,6 @@ #include -#include "ray/common/grpc_util.h" #include "ray/common/id.h" #include "ray/core_worker/store_provider/memory_store/memory_store.h" #include "ray/rpc/worker/core_worker_client.h" diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 43dfaa45bbe0..799530d274e9 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -132,15 +132,6 @@ message GetObjectStatusRequest { bytes object_id = 2; } -message RayObject { - // Data of the object. - bytes data = 1; - // Metadata of the object. - bytes metadata = 2; - // ObjectIDs that were nested in data. This is only set for inlined objects. - repeated bytes nested_inlined_ids = 3; -} - message GetObjectStatusReply { enum ObjectStatus { CREATED = 0; @@ -148,9 +139,6 @@ message GetObjectStatusReply { FREED = 2; } ObjectStatus status = 1; - // The Ray object: either a concrete value, an in-Plasma indicator, or an - // exception. 
- RayObject object = 2; } message WaitForActorOutOfScopeRequest { From ccc901f6620bb4a542fc96afbc733665fa9a3016 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Thu, 21 Jan 2021 16:38:51 -0800 Subject: [PATCH 010/245] add 3.8 (#13608) --- .../linux-py3.8-requirements_tune.txt | 864 ++++++++++++++++++ 1 file changed, 864 insertions(+) create mode 100644 python/requirements/linux-py3.8-requirements_tune.txt diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt new file mode 100644 index 000000000000..36dbb1dce9ad --- /dev/null +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -0,0 +1,864 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile requirements_tune.in +# +--find-links https://download.pytorch.org/whl/torch_stable.html + +absl-py==0.11.0 + # via tensorboard +alembic==1.4.1 + # via + # mlflow + # optuna +argon2-cffi==20.1.0 + # via notebook +async-generator==1.10 + # via nbclient +atari-py==0.2.6 + # via + # -c ../requirements.txt + # gym +attrs==20.3.0 + # via + # cmd2 + # jsonschema + # pytest +autocfg==0.0.6 + # via gluoncv +autogluon.core==0.0.16b20210121 + # via gluoncv +autograd==1.3 + # via autogluon.core +ax-platform==0.1.19 ; python_version >= "3.7" + # via -r requirements_tune.in +azure-core==1.10.0 + # via azure-storage-blob +azure-storage-blob==12.7.1 + # via mlflow +backcall==0.2.0 + # via ipython +bayesian-optimization==1.2.0 + # via + # -r requirements_tune.in + # nevergrad +bcrypt==3.2.0 + # via paramiko +bleach==3.2.2 + # via nbconvert +bokeh==2.2.3 + # via dask +boto3==1.16.57 + # via + # -c ../requirements.txt + # autogluon.core + # smart-open +botocore==1.19.57 + # via + # boto3 + # s3transfer +botorch==0.3.3 + # via ax-platform +cachetools==4.2.0 + # via google-auth +certifi==2020.12.5 + # via + # kubernetes + # msrest + # requests + # sentry-sdk +cffi==1.14.4 + # via + # argon2-cffi + # bcrypt + # cryptography + # pynacl 
+chardet==4.0.0 + # via requests +click==7.1.2 + # via + # -c ../requirements.txt + # databricks-cli + # distributed + # flask + # mlflow + # sacremoses + # wandb +cliff==3.6.0 + # via optuna +cloudpickle==1.6.0 + # via + # dask + # distributed + # gym + # hyperopt + # mlflow + # tensorflow-probability +cma==3.0.3 + # via nevergrad +cmaes==0.7.0 + # via optuna +cmd2==1.4.0 + # via cliff +colorama==0.4.4 + # via + # -c ../requirements.txt + # cmd2 +colorlog==4.7.2 + # via optuna +configparser==5.0.1 + # via wandb +configspace==0.4.10 + # via + # -r requirements_tune.in + # autogluon.core + # hpbandster +cryptography==3.3.1 + # via + # azure-storage-blob + # paramiko +cycler==0.10.0 + # via matplotlib +cython==0.29.0 + # via + # -c ../requirements.txt + # autogluon.core + # configspace +dask[complete]==2021.1.0 + # via + # -c ../requirements.txt + # autogluon.core + # distributed +databricks-cli==0.14.1 + # via mlflow +dataclasses==0.6 + # via torch +decorator==4.4.2 + # via + # ipython + # networkx + # paramz + # tensorflow-probability +decord==0.4.2 + # via gluoncv +defusedxml==0.6.0 + # via nbconvert +dill==0.3.3 + # via autogluon.core +distributed==2021.1.0 + # via + # autogluon.core + # dask +dm-tree==0.1.5 + # via + # -c ../requirements.txt + # tensorflow-probability +docker-pycreds==0.4.0 + # via wandb +docker==4.4.1 + # via mlflow +dragonfly-opt==0.1.6 + # via -r requirements_tune.in +entrypoints==0.3 + # via + # mlflow + # nbconvert +filelock==3.0.12 + # via + # -c ../requirements.txt + # transformers +flask==1.1.2 + # via + # -c ../requirements.txt + # mlflow + # prometheus-flask-exporter +fsspec==0.8.5 + # via + # dask + # pytorch-lightning +future==0.18.2 + # via + # autograd + # dragonfly-opt + # hyperopt + # pyglet + # pytorch-lightning + # torch +gast==0.4.0 + # via tensorflow-probability +gitdb==4.0.5 + # via gitpython +gitpython==3.1.12 + # via + # mlflow + # wandb +gluoncv==0.9.1 + # via -r requirements_tune.in +google-auth-oauthlib==0.4.2 + # via 
tensorboard +google-auth==1.24.0 + # via + # google-auth-oauthlib + # kubernetes + # tensorboard +gpy==1.9.9 + # via -r requirements_tune.in +gpytorch==1.3.1 + # via botorch +graphviz==0.8.4 + # via + # autogluon.core + # mxnet +grpcio==1.35.0 + # via + # -c ../requirements.txt + # tensorboard +gunicorn==20.0.4 + # via mlflow +gym[atari]==0.18.0 + # via + # -c ../requirements.txt + # -r requirements_tune.in +h5py==3.1.0 + # via + # -r requirements_tune.in + # keras +heapdict==1.0.1 + # via zict +hpbandster==0.7.4 + # via -r requirements_tune.in +hyperopt==0.2.5 + # via -r requirements_tune.in +idna==2.10 + # via requests +ipykernel==5.4.3 + # via + # ipywidgets + # jupyter + # jupyter-console + # notebook + # qtconsole +ipython-genutils==0.2.0 + # via + # nbformat + # notebook + # qtconsole + # traitlets +ipython==7.19.0 + # via + # ipykernel + # ipywidgets + # jupyter-console +ipywidgets==7.6.3 + # via jupyter +isodate==0.6.0 + # via msrest +itsdangerous==1.1.0 + # via flask +jedi==0.18.0 + # via ipython +jinja2==2.11.2 + # via + # ax-platform + # bokeh + # flask + # nbconvert + # notebook +jmespath==0.10.0 + # via + # boto3 + # botocore +joblib==1.0.0 + # via + # optuna + # sacremoses + # scikit-learn + # scikit-optimize +jsonschema==3.2.0 + # via + # -c ../requirements.txt + # nbformat +jupyter-client==6.1.11 + # via + # ipykernel + # jupyter-console + # nbclient + # notebook + # qtconsole +jupyter-console==6.2.0 + # via jupyter +jupyter-core==4.7.0 + # via + # jupyter-client + # nbconvert + # nbformat + # notebook + # qtconsole +jupyter==1.0.0 + # via -r requirements_tune.in +jupyterlab-pygments==0.1.2 + # via nbconvert +jupyterlab-widgets==1.0.0 + # via ipywidgets +keras==2.4.3 + # via -r requirements_tune.in +kiwisolver==1.3.1 + # via matplotlib +kubernetes==12.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +lightgbm==3.1.1 + # via -r requirements_tune.in +locket==0.2.1 + # via partd +mako==1.1.4 + # via alembic +markdown==3.3.3 + # via 
tensorboard +markupsafe==1.1.1 + # via + # jinja2 + # mako +matplotlib==3.3.3 + # via + # -r requirements_tune.in + # autogluon.core + # gluoncv + # zoopt +mistune==0.8.4 + # via nbconvert +mlflow==1.13.1 + # via -r requirements_tune.in +more-itertools==8.6.0 + # via pytest +msgpack==1.0.2 + # via + # -c ../requirements.txt + # distributed +msrest==0.6.19 + # via azure-storage-blob +mxnet==1.7.0.post1 + # via -r requirements_tune.in +nbclient==0.5.1 + # via nbconvert +nbconvert==6.0.7 + # via + # jupyter + # notebook +nbformat==5.1.2 + # via + # ipywidgets + # nbclient + # nbconvert + # notebook +nest-asyncio==1.4.3 + # via nbclient +netifaces==0.10.9 + # via hpbandster +networkx==2.5 + # via + # -c ../requirements.txt + # hyperopt +nevergrad==0.4.2.post5 + # via -r requirements_tune.in +notebook==6.2.0 + # via + # jupyter + # widgetsnbextension +numpy==1.19.5 + # via + # -c ../requirements.txt + # atari-py + # autogluon.core + # autograd + # bayesian-optimization + # bokeh + # cma + # cmaes + # configspace + # dask + # decord + # dragonfly-opt + # gluoncv + # gpy + # gym + # h5py + # hpbandster + # hyperopt + # keras + # lightgbm + # matplotlib + # mlflow + # mxnet + # nevergrad + # opencv-python + # optuna + # pandas + # paramz + # patsy + # pytorch-lightning + # scikit-learn + # scikit-optimize + # scipy + # statsmodels + # tensorboard + # tensorboardx + # tensorflow-probability + # torch + # torchvision + # transformers + # xgboost + # zoopt +oauthlib==3.1.0 + # via requests-oauthlib +opencv-python==4.5.1.48 + # via + # gluoncv + # gym +optuna==2.3.0 + # via -r requirements_tune.in +packaging==20.8 + # via + # bleach + # bokeh + # optuna + # pytest + # transformers +pandas==1.0.5 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # dask + # gluoncv + # mlflow + # statsmodels +pandocfilters==1.4.3 + # via nbconvert +paramiko==2.7.2 + # via autogluon.core +paramz==0.9.5 + # via gpy +parso==0.8.1 + # via jedi +partd==1.1.0 + # via dask 
+patsy==0.5.1 + # via statsmodels +pbr==5.5.1 + # via + # cliff + # stevedore +pexpect==4.8.0 + # via + # -c ../requirements.txt + # ipython +pickleshare==0.7.5 + # via ipython +pillow==7.2.0 ; platform_system != "Windows" + # via + # -c ../requirements.txt + # bokeh + # gluoncv + # gym + # matplotlib + # torchvision +plotly==4.14.3 + # via ax-platform +pluggy==0.13.1 + # via pytest +portalocker==2.0.0 + # via gluoncv +prettytable==0.7.2 + # via cliff +prometheus-client==0.9.0 + # via + # -c ../requirements.txt + # notebook + # prometheus-flask-exporter +prometheus-flask-exporter==0.18.1 + # via mlflow +promise==2.3 + # via wandb +prompt-toolkit==3.0.11 + # via + # ipython + # jupyter-console +protobuf==3.14.0 + # via + # -c ../requirements.txt + # mlflow + # tensorboard + # tensorboardx + # wandb +psutil==5.8.0 + # via + # distributed + # wandb +ptyprocess==0.7.0 + # via + # pexpect + # terminado +py==1.10.0 + # via pytest +pyaml==20.4.0 + # via scikit-optimize +pyasn1-modules==0.2.8 + # via google-auth +pyasn1==0.4.8 + # via + # pyasn1-modules + # rsa +pycparser==2.20 + # via cffi +pyglet==1.5.0 + # via gym +pygments==2.7.4 + # via + # -c ../requirements.txt + # ipython + # jupyter-console + # jupyterlab-pygments + # nbconvert + # qtconsole +pynacl==1.4.0 + # via paramiko +pyparsing==2.4.7 + # via + # cliff + # configspace + # matplotlib + # packaging +pyperclip==1.8.1 + # via cmd2 +pyro4==4.80 + # via hpbandster +pyrsistent==0.17.3 + # via jsonschema +pytest-remotedata==0.3.2 + # via -r requirements_tune.in +pytest==5.4.3 + # via + # -c ../requirements.txt + # autogluon.core + # pytest-remotedata +python-dateutil==2.8.1 + # via + # alembic + # bokeh + # botocore + # jupyter-client + # kubernetes + # matplotlib + # mlflow + # pandas + # wandb +python-editor==1.0.4 + # via alembic +pytorch-lightning-bolts==0.2.5 + # via -r requirements_tune.in +pytorch-lightning==1.0.3 + # via + # -r requirements_tune.in + # pytorch-lightning-bolts +pytz==2020.5 + # via pandas 
+pyyaml==5.4.1 + # via + # -c ../requirements.txt + # autocfg + # bokeh + # cliff + # dask + # distributed + # gluoncv + # keras + # kubernetes + # mlflow + # pyaml + # pytorch-lightning + # wandb + # yacs +pyzmq==21.0.1 + # via + # jupyter-client + # notebook + # qtconsole +qtconsole==5.0.1 + # via jupyter +qtpy==1.9.0 + # via qtconsole +querystring-parser==1.2.4 + # via mlflow +regex==2020.11.13 + # via + # sacremoses + # transformers +requests-oauthlib==1.3.0 + # via + # google-auth-oauthlib + # kubernetes + # msrest +requests==2.25.1 + # via + # -c ../requirements.txt + # autogluon.core + # azure-core + # databricks-cli + # docker + # gluoncv + # kubernetes + # mlflow + # msrest + # mxnet + # requests-oauthlib + # sigopt + # tensorboard + # transformers + # wandb +retrying==1.3.3 + # via plotly +rsa==4.7 + # via google-auth +s3transfer==0.3.4 + # via boto3 +sacremoses==0.0.43 + # via transformers +scikit-learn==0.22.2 + # via + # -c ../requirements.txt + # -r requirements_tune.in + # autogluon.core + # ax-platform + # bayesian-optimization + # gpytorch + # lightgbm + # scikit-optimize +scikit-optimize==0.8.1 + # via + # -r requirements_tune.in + # autogluon.core +scipy==1.4.1 + # via + # -c ../requirements.txt + # autogluon.core + # ax-platform + # bayesian-optimization + # botorch + # dragonfly-opt + # gluoncv + # gpy + # gpytorch + # gym + # hpbandster + # hyperopt + # keras + # lightgbm + # optuna + # paramz + # scikit-learn + # scikit-optimize + # statsmodels + # xgboost +send2trash==1.5.0 + # via notebook +sentencepiece==0.1.95 + # via transformers +sentry-sdk==0.19.5 + # via wandb +serpent==1.30.2 + # via + # hpbandster + # pyro4 +shortuuid==1.0.1 + # via wandb +sigopt==5.7.0 + # via -r requirements_tune.in +six==1.15.0 + # via + # absl-py + # argon2-cffi + # atari-py + # azure-core + # bcrypt + # bleach + # cryptography + # cycler + # databricks-cli + # dm-tree + # docker + # docker-pycreds + # dragonfly-opt + # google-auth + # gpy + # grpcio + # 
hyperopt + # isodate + # jsonschema + # kubernetes + # mlflow + # paramz + # patsy + # plotly + # promise + # protobuf + # pynacl + # pytest-remotedata + # python-dateutil + # querystring-parser + # retrying + # sacremoses + # tensorboard + # tensorboardx + # tensorflow-probability + # wandb + # websocket-client +smart_open==4.0.1 + # via + # -c ../requirements.txt + # -r requirements_tune.in +smmap==3.0.4 + # via gitdb +sortedcontainers==2.3.0 + # via distributed +sqlalchemy==1.3.22 + # via + # alembic + # mlflow + # optuna +sqlparse==0.4.1 + # via mlflow +statsmodels==0.12.1 + # via hpbandster +stevedore==3.3.0 + # via cliff +subprocess32==3.5.4 + # via wandb +tabulate==0.8.7 + # via + # -c ../requirements.txt + # databricks-cli +tblib==1.7.0 + # via distributed +tensorboard-plugin-wit==1.8.0 + # via tensorboard +tensorboard==2.4.1 + # via pytorch-lightning +tensorboardx==2.1 + # via + # -c ../requirements.txt + # gluoncv +tensorflow-probability==0.11.1 + # via -r requirements_tune.in +terminado==0.9.2 + # via notebook +testpath==0.4.4 + # via nbconvert +timm==0.3.2 + # via -r requirements_tune.in +tokenizers==0.8.1.rc2 + # via transformers +toolz==0.11.1 + # via + # dask + # distributed + # partd +torch==1.7.0+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # botorch + # gpytorch + # pytorch-lightning + # pytorch-lightning-bolts + # timm + # torchvision +torchvision==0.8.1+cpu ; sys_platform != "darwin" + # via + # -r requirements_tune.in + # timm +tornado==6.1 + # via + # autogluon.core + # bokeh + # distributed + # ipykernel + # jupyter-client + # notebook + # terminado +tqdm==4.56.0 + # via + # autogluon.core + # gluoncv + # hyperopt + # optuna + # pytorch-lightning + # sacremoses + # transformers +traitlets==5.0.5 + # via + # ipykernel + # ipython + # ipywidgets + # jupyter-client + # jupyter-core + # nbclient + # nbconvert + # nbformat + # notebook + # qtconsole +transformers==3.1 + # via -r requirements_tune.in +typeguard==2.10.0 + # 
via ax-platform +typing-extensions==3.7.4.3 + # via + # bokeh + # nevergrad + # torch +typing==3.7.4.3 + # via configspace +urllib3==1.26.2 + # via + # botocore + # kubernetes + # requests + # sentry-sdk +wandb==0.10.12 + # via -r requirements_tune.in +watchdog==1.0.2 + # via wandb +wcwidth==0.2.5 + # via + # cmd2 + # prompt-toolkit + # pytest +webencodings==0.5.1 + # via bleach +websocket-client==0.57.0 + # via + # docker + # kubernetes +werkzeug==1.0.1 + # via + # -c ../requirements.txt + # flask + # tensorboard +wheel==0.36.2 + # via + # lightgbm + # tensorboard +widgetsnbextension==3.5.1 + # via ipywidgets +xgboost==1.3.0.post0 + # via -r requirements_tune.in +yacs==0.1.8 + # via gluoncv +zict==2.0.0 + # via distributed +zoopt==0.4.1 + # via -r requirements_tune.in + +# The following packages are considered to be unsafe in a requirements file: +# setuptools From 0998d69968608012ca6cdd1ee166961df1aa0f0b Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 21 Jan 2021 16:46:42 -0800 Subject: [PATCH 011/245] [core] Admission control for pulling objects to the local node (#13514) * Admission control, TODO: tests, object size * Unit tests for admission control and some bug fixes * Add object size to object table, only activate pull if object size is known * Some fixes, reset timer on eviction * doc * update * Trigger OOM from the pull manager * don't spam * doc * Update src/ray/object_manager/pull_manager.cc Co-authored-by: Eric Liang * Remove useless tests * Fix test * osx build * Skip broken test * tests * Skip failing tests Co-authored-by: Eric Liang --- .travis.yml | 11 +- BUILD.bazel | 24 - python/ray/tests/test_object_manager.py | 83 +++ python/ray/tests/test_object_spilling.py | 61 +++ python/ray/tests/test_reconstruction.py | 3 + src/ray/core_worker/core_worker.cc | 1 + src/ray/core_worker/reference_count.cc | 9 + src/ray/core_worker/reference_count.h | 6 + src/ray/gcs/accessor.h | 2 +- .../gcs/gcs_client/service_based_accessor.cc | 4 + 
.../gcs/gcs_client/service_based_accessor.h | 2 +- .../test/global_state_accessor_test.cc | 2 +- .../test/service_based_gcs_client_test.cc | 2 +- src/ray/gcs/gcs_server/gcs_object_manager.cc | 7 +- src/ray/gcs/gcs_server/gcs_object_manager.h | 1 + src/ray/object_manager/object_directory.cc | 41 +- src/ray/object_manager/object_directory.h | 8 +- src/ray/object_manager/object_manager.cc | 60 ++- .../ownership_based_object_directory.cc | 11 +- .../object_manager/plasma/eviction_policy.h | 2 + src/ray/object_manager/plasma/store.h | 7 + src/ray/object_manager/plasma/store_runner.h | 9 +- src/ray/object_manager/pull_manager.cc | 231 +++++++- src/ray/object_manager/pull_manager.h | 96 +++- .../test/object_manager_stress_test.cc | 453 ---------------- .../test/object_manager_test.cc | 496 ------------------ .../object_manager/test/pull_manager_test.cc | 318 +++++++++-- src/ray/protobuf/core_worker.proto | 1 + src/ray/protobuf/gcs.proto | 4 + src/ray/protobuf/gcs_service.proto | 2 + src/ray/raylet/reconstruction_policy.cc | 2 +- src/ray/raylet/reconstruction_policy_test.cc | 4 +- .../raylet/test/local_object_manager_test.cc | 5 +- src/ray/test/run_object_manager_tests.sh | 43 -- 34 files changed, 873 insertions(+), 1138 deletions(-) delete mode 100644 src/ray/object_manager/test/object_manager_stress_test.cc delete mode 100644 src/ray/object_manager/test/object_manager_test.cc delete mode 100755 src/ray/test/run_object_manager_tests.sh diff --git a/.travis.yml b/.travis.yml index 36e49aaa74ef..5170ed0864b8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -78,7 +78,9 @@ matrix: - . ./ci/travis/ci.sh build script: # Run all C++ unit tests with ASAN enabled. ASAN adds too much overhead to run Python tests. - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all + # NOTE: core_worker_test is out-of-date and should already be covered by + # Python tests. 
+ - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -core_worker_test - os: osx osx_image: xcode7 @@ -435,11 +437,10 @@ matrix: script: - . ./ci/travis/ci.sh test_cpp script: - # raylet integration tests (core_worker_tests included in bazel tests below) - - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh - # cc bazel tests (w/o RLlib) - - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... + # NOTE: core_worker_test is out-of-date and should already covered by Python + # tests. + - ./ci/suppress_output bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test # ray serve tests - if [ $RAY_CI_SERVE_AFFECTED == "1" ]; then ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only python/ray/serve/...; fi diff --git a/BUILD.bazel b/BUILD.bazel index a863727ecd95..c1745e468852 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1365,30 +1365,6 @@ cc_library( ], ) -cc_binary( - name = "object_manager_test", - testonly = 1, - srcs = ["src/ray/object_manager/test/object_manager_test.cc"], - copts = COPTS, - deps = [ - ":object_manager", - "//src/ray/protobuf:common_cc_proto", - "@com_google_googletest//:gtest_main", - ], -) - -cc_binary( - name = "object_manager_stress_test", - testonly = 1, - srcs = ["src/ray/object_manager/test/object_manager_stress_test.cc"], - copts = COPTS, - deps = [ - ":object_manager", - "//src/ray/protobuf:common_cc_proto", - "@com_google_googletest//:gtest_main", - ], -) - cc_library( name = "platform_shims", srcs = [] + select({ diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index b29b9caa228f..e38733f62d7e 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -296,6 +296,89 @@ def driver(): ray.get(driver.remote()) 
+@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") +@pytest.mark.timeout(30) +def test_pull_bundles_admission_control(shutdown_only): + cluster = Cluster() + object_size = int(6e6) + num_objects = 10 + num_tasks = 10 + # Head node can fit all of the objects at once. + cluster.add_node( + num_cpus=0, + object_store_memory=2 * num_tasks * num_objects * object_size) + cluster.wait_for_nodes() + ray.init(address=cluster.address) + + # Worker node can only fit 1 task at a time. + cluster.add_node( + num_cpus=1, object_store_memory=1.5 * num_objects * object_size) + cluster.wait_for_nodes() + + @ray.remote + def foo(*args): + return + + args = [] + for _ in range(num_tasks): + task_args = [ + ray.put(np.zeros(object_size, dtype=np.uint8)) + for _ in range(num_objects) + ] + args.append(task_args) + + tasks = [foo.remote(*task_args) for task_args in args] + ray.get(tasks) + + +@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") +@pytest.mark.timeout(30) +def test_pull_bundles_admission_control_dynamic(shutdown_only): + # This test is the same as test_pull_bundles_admission_control, except that + # the object store's capacity starts off higher and is later consumed + # dynamically by concurrent workers. + cluster = Cluster() + object_size = int(6e6) + num_objects = 10 + num_tasks = 10 + # Head node can fit all of the objects at once. + cluster.add_node( + num_cpus=0, + object_store_memory=2 * num_tasks * num_objects * object_size) + cluster.wait_for_nodes() + ray.init(address=cluster.address) + + # Worker node can fit 2 tasks at a time. 
+ cluster.add_node( + num_cpus=1, object_store_memory=2.5 * num_objects * object_size) + cluster.wait_for_nodes() + + @ray.remote + def foo(*args): + return + + @ray.remote + def allocate(*args): + return np.zeros(object_size, dtype=np.uint8) + + args = [] + for _ in range(num_tasks): + task_args = [ + ray.put(np.zeros(object_size, dtype=np.uint8)) + for _ in range(num_objects) + ] + args.append(task_args) + + tasks = [foo.remote(*task_args) for task_args in args] + allocated = [allocate.remote() for _ in range(num_objects)] + ray.get(tasks) + del allocated + + if __name__ == "__main__": import pytest import sys diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 10b1da77306a..745eb3bafc1d 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -648,5 +648,66 @@ def test_release_during_plasma_fetch(tmp_path, shutdown_only): do_test_release_resource(tmp_path, expect_released=True) +@pytest.mark.skip( + reason="This hangs due to a deadlock between a worker getting its " + "arguments and the node pulling arguments for the next task queued.") +@pytest.mark.skipif( + platform.system() == "Windows", reason="Failing on Windows.") +@pytest.mark.timeout(30) +def test_spill_objects_on_object_transfer(object_spilling_config, + ray_start_cluster): + # This test checks that objects get spilled to make room for transferred + # objects. + cluster = ray_start_cluster + object_size = int(1e7) + num_objects = 10 + num_tasks = 10 + # Head node can fit all of the objects at once. + cluster.add_node( + num_cpus=0, + object_store_memory=2 * num_tasks * num_objects * object_size, + _system_config={ + "max_io_workers": 1, + "automatic_object_spilling_enabled": True, + "object_store_full_delay_ms": 100, + "object_spilling_config": object_spilling_config, + "min_spilling_size": 0 + }) + cluster.wait_for_nodes() + ray.init(address=cluster.address) + + # Worker node can fit 1 tasks at a time. 
+ cluster.add_node( + num_cpus=1, object_store_memory=1.5 * num_objects * object_size) + cluster.wait_for_nodes() + + @ray.remote + def foo(*args): + return + + @ray.remote + def allocate(*args): + return np.zeros(object_size, dtype=np.uint8) + + # Allocate some objects that must be spilled to make room for foo's + # arguments. + allocated = [allocate.remote() for _ in range(num_objects)] + ray.get(allocated) + print("done allocating") + + args = [] + for _ in range(num_tasks): + task_args = [ + ray.put(np.zeros(object_size, dtype=np.uint8)) + for _ in range(num_objects) + ] + args.append(task_args) + + # Check that tasks scheduled to the worker node have enough room after + # spilling. + tasks = [foo.remote(*task_args) for task_args in args] + ray.get(tasks) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/tests/test_reconstruction.py b/python/ray/tests/test_reconstruction.py index f5eed1e8fb23..1cd1f133a911 100644 --- a/python/ray/tests/test_reconstruction.py +++ b/python/ray/tests/test_reconstruction.py @@ -372,6 +372,7 @@ def probe(): raise e.as_instanceof_cause() +@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_multiple_downstream_tasks(ray_start_cluster, reconstruction_enabled): config = { @@ -436,6 +437,7 @@ def dependent_task(x): raise e.as_instanceof_cause() +@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.parametrize("reconstruction_enabled", [False, True]) def test_reconstruction_chain(ray_start_cluster, reconstruction_enabled): config = { @@ -487,6 +489,7 @@ def dependent_task(x): raise e.as_instanceof_cause() +@pytest.mark.skip(reason="This hangs due to a deadlock in admission control.") @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_reconstruction_stress(ray_start_cluster): config = { diff --git 
a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 21fc462a7af6..f7e473eca5a2 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -2213,6 +2213,7 @@ void CoreWorker::HandleGetObjectLocationsOwner( } else { status = Status::ObjectNotFound("Object " + object_id.Hex() + " not found"); } + reply->set_object_size(reference_counter_->GetObjectSize(object_id)); send_reply_callback(status, nullptr, nullptr); } diff --git a/src/ray/core_worker/reference_count.cc b/src/ray/core_worker/reference_count.cc index c638f831dbed..ba2e20994e44 100644 --- a/src/ray/core_worker/reference_count.cc +++ b/src/ray/core_worker/reference_count.cc @@ -948,6 +948,15 @@ absl::optional> ReferenceCounter::GetObjectLocations return it->second.locations; } +size_t ReferenceCounter::GetObjectSize(const ObjectID &object_id) const { + absl::MutexLock lock(&mutex_); + auto it = object_id_refs_.find(object_id); + if (it == object_id_refs_.end()) { + return 0; + } + return it->second.object_size; +} + void ReferenceCounter::HandleObjectSpilled(const ObjectID &object_id) { absl::MutexLock lock(&mutex_); auto it = object_id_refs_.find(object_id); diff --git a/src/ray/core_worker/reference_count.h b/src/ray/core_worker/reference_count.h index caceabc53ab5..9c0576393fb3 100644 --- a/src/ray/core_worker/reference_count.h +++ b/src/ray/core_worker/reference_count.h @@ -397,6 +397,12 @@ class ReferenceCounter : public ReferenceCounterInterface, absl::optional> GetObjectLocations( const ObjectID &object_id) LOCKS_EXCLUDED(mutex_); + /// Get an object's size. This will return 0 if the object is out of scope. + /// + /// \param[in] object_id The object whose size to get. + /// \return Object size, or 0 if the object is out of scope. + size_t GetObjectSize(const ObjectID &object_id) const; + /// Handle an object has been spilled to external storage. 
/// /// This notifies the primary raylet that the object is safe to release and diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index 83dc3de3ca46..ab0704bcadd7 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -297,7 +297,7 @@ class ObjectInfoAccessor { /// \param callback Callback that will be called after object has been added to GCS. /// \return Status virtual Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - const StatusCallback &callback) = 0; + size_t object_size, const StatusCallback &callback) = 0; /// Add spilled location of object to GCS asynchronously. /// diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index f9380b78ee12..dfa192320976 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1070,6 +1070,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncGetAll( Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, + size_t object_size, const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object location, object id = " << object_id << ", node id = " << node_id @@ -1077,6 +1078,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i rpc::AddObjectLocationRequest request; request.set_object_id(object_id.Binary()); request.set_node_id(node_id.Binary()); + request.set_size(object_size); auto operation = [this, request, object_id, node_id, callback](const SequencerDoneCallback &done_callback) { @@ -1171,11 +1173,13 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations( rpc::ObjectLocationChange update; update.set_is_add(true); update.set_node_id(loc.manager()); + update.set_size(result->size()); notification.push_back(update); } if (!result->spilled_url().empty()) { rpc::ObjectLocationChange update; update.set_spilled_url(result->spilled_url()); + 
update.set_size(result->size()); notification.push_back(update); } subscribe(object_id, notification); diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index b498e0acfd46..2d362976dd22 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -323,7 +323,7 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { Status AsyncGetAll(const MultiItemCallback &callback) override; Status AsyncAddLocation(const ObjectID &object_id, const NodeID &node_id, - const StatusCallback &callback) override; + size_t object_size, const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, const StatusCallback &callback) override; diff --git a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc index 7af602808fc7..e896beccb6f5 100644 --- a/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc +++ b/src/ray/gcs/gcs_client/test/global_state_accessor_test.cc @@ -283,7 +283,7 @@ TEST_F(GlobalStateAccessorTest, TestObjectTable) { NodeID node_id = NodeID::FromRandom(); std::promise promise; RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, + object_id, node_id, 0, [&promise](Status status) { promise.set_value(status.ok()); })); WaitReady(promise.get_future(), timeout_ms_); } diff --git a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc index 3b0f731bbccd..3b1a6a69ad7a 100644 --- a/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc +++ b/src/ray/gcs/gcs_client/test/service_based_gcs_client_test.cc @@ -450,7 +450,7 @@ class ServiceBasedGcsClientTest : public ::testing::Test { bool AddLocation(const ObjectID &object_id, const NodeID &node_id) { std::promise promise; 
RAY_CHECK_OK(gcs_client_->Objects().AsyncAddLocation( - object_id, node_id, + object_id, node_id, 0, [&promise](Status status) { promise.set_value(status.ok()); })); return WaitReady(promise.get_future(), timeout_ms_); } diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.cc b/src/ray/gcs/gcs_server/gcs_object_manager.cc index b5cc8f765113..73971ed7f18f 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_object_manager.cc @@ -51,6 +51,7 @@ void GcsObjectManager::HandleGetAllObjectLocations( object_table_data.set_manager(node_id.Binary()); object_location_info.add_locations()->CopyFrom(object_table_data); } + object_location_info.set_size(item.second.object_size); reply->add_object_location_info_list()->CopyFrom(object_location_info); } RAY_LOG(DEBUG) << "Finished getting all object locations."; @@ -78,7 +79,8 @@ void GcsObjectManager::HandleAddObjectLocation( RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id; } - auto on_done = [this, object_id, node_id, spilled_url, reply, + size_t size = request.size(); + auto on_done = [this, object_id, node_id, spilled_url, size, reply, send_reply_callback](const Status &status) { if (status.ok()) { rpc::ObjectLocationChange notification; @@ -89,6 +91,7 @@ void GcsObjectManager::HandleAddObjectLocation( if (!spilled_url.empty()) { notification.set_spilled_url(spilled_url); } + notification.set_size(size); RAY_CHECK_OK(gcs_pub_sub_->Publish(OBJECT_CHANNEL, object_id.Hex(), notification.SerializeAsString(), nullptr)); RAY_LOG(DEBUG) << "Finished adding object location, job id = " @@ -107,6 +110,7 @@ void GcsObjectManager::HandleAddObjectLocation( }; absl::MutexLock lock(&mutex_); + object_to_locations_[object_id].object_size = size; const auto object_data = GenObjectLocationInfo(object_id); Status status = gcs_table_storage_->ObjectTable().Put(object_id, object_data, on_done); if (!status.ok()) { @@ -287,6 +291,7 @@ const ObjectLocationInfo 
GcsObjectManager::GenObjectLocationInfo( object_data.add_locations()->set_manager(node_id.Binary()); } object_data.set_spilled_url(it->second.spilled_url); + object_data.set_size(it->second.object_size); } return object_data; } diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.h b/src/ray/gcs/gcs_server/gcs_object_manager.h index bd21bfd1b977..2afff0816850 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.h +++ b/src/ray/gcs/gcs_server/gcs_object_manager.h @@ -65,6 +65,7 @@ class GcsObjectManager : public rpc::ObjectInfoHandler { struct LocationSet { absl::flat_hash_set locations; std::string spilled_url = ""; + size_t object_size = 0; }; /// Add a location of objects. diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index 189cc0dd7d4b..ccfda7f5a37c 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -31,13 +31,21 @@ using ray::rpc::ObjectTableData; /// object table entries up to but not including this notification. bool UpdateObjectLocations(const std::vector &location_updates, std::shared_ptr gcs_client, - std::unordered_set *node_ids, - std::string *spilled_url) { + std::unordered_set *node_ids, std::string *spilled_url, + size_t *object_size) { // location_updates contains the updates of locations of the object. // with GcsChangeMode, we can determine whether the update mode is // addition or deletion. bool isUpdated = false; for (const auto &update : location_updates) { + // The size can be 0 if the update was a deletion. This assumes that an + // object's size is always greater than 0. + // TODO(swang): If that's not the case, we should use a flag to check + // whether the size is set instead. 
+ if (update.size() > 0) { + *object_size = update.size(); + } + if (!update.node_id().empty()) { NodeID node_id = NodeID::FromBinary(update.node_id()); if (update.is_add() && 0 == node_ids->count(node_id)) { @@ -73,9 +81,10 @@ bool UpdateObjectLocations(const std::vector &locatio ray::Status ObjectDirectory::ReportObjectAdded( const ObjectID &object_id, const NodeID &node_id, const object_manager::protocol::ObjectInfoT &object_info) { - RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id; + size_t size = object_info.data_size + object_info.metadata_size; + RAY_LOG(DEBUG) << "Reporting object added to GCS " << object_id << " size " << size; ray::Status status = - gcs_client_->Objects().AsyncAddLocation(object_id, node_id, nullptr); + gcs_client_->Objects().AsyncAddLocation(object_id, node_id, size, nullptr); return status; } @@ -119,14 +128,14 @@ void ObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { // If the subscribed object has the removed node as a location, update // its locations with an empty update so that the location will be removed. UpdateObjectLocations({}, gcs_client_, &listener.second.current_object_locations, - &listener.second.spilled_url); + &listener.second.spilled_url, &listener.second.object_size); // Re-call all the subscribed callbacks for the object, since its // locations have changed. for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.spilled_url); + listener.second.spilled_url, listener.second.object_size); } } } @@ -157,7 +166,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // Update entries for this object. 
if (!UpdateObjectLocations(object_notifications, gcs_client_, &it->second.current_object_locations, - &it->second.spilled_url)) { + &it->second.spilled_url, &it->second.object_size)) { return; } // Copy the callbacks so that the callbacks can unsubscribe without interrupting @@ -171,7 +180,7 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url); + it->second.spilled_url, it->second.object_size); } }; status = gcs_client_->Objects().AsyncSubscribeToLocations( @@ -189,8 +198,9 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; auto &spilled_url = listener_state.spilled_url; - io_service_.post([callback, locations, spilled_url, object_id]() { - callback(object_id, locations, spilled_url); + auto object_size = it->second.object_size; + io_service_.post([callback, locations, spilled_url, object_size, object_id]() { + callback(object_id, locations, spilled_url, object_size); }); } return status; @@ -223,8 +233,9 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, // cached locations. 
auto &locations = it->second.current_object_locations; auto &spilled_url = it->second.spilled_url; - io_service_.post([callback, object_id, spilled_url, locations]() { - callback(object_id, locations, spilled_url); + auto object_size = it->second.object_size; + io_service_.post([callback, object_id, spilled_url, locations, object_size]() { + callback(object_id, locations, spilled_url, object_size); }); } else { // We do not have any locations cached due to a concurrent @@ -252,10 +263,12 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, std::unordered_set node_ids; std::string spilled_url; - UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url); + size_t object_size = 0; + UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url, + &object_size); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, node_ids, spilled_url); + callback(object_id, node_ids, spilled_url, object_size); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 3ce15882bfea..8f06888aee23 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -41,9 +41,9 @@ struct RemoteConnectionInfo { }; /// Callback for object location notifications. -using OnLocationsFound = - std::function &, const std::string &)>; +using OnLocationsFound = std::function &, + const std::string &, size_t object_size)>; class ObjectDirectoryInterface { public: @@ -185,6 +185,8 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_set current_object_locations; /// The location where this object has been spilled, if any. std::string spilled_url = ""; + /// The size of the object. + size_t object_size = 0; /// This flag will get set to true if received any notification of the object. 
/// It means current_object_locations is up-to-date with GCS. It /// should never go back to false once set to true. If this is true, and diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index d82a5fb0d069..467ea25675e9 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -73,18 +73,6 @@ ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ boost::posix_time::milliseconds(config.timer_freq_ms)) { RAY_CHECK(config_.rpc_service_threads_number > 0); - const auto &object_is_local = [this](const ObjectID &object_id) { - return local_objects_.count(object_id) != 0; - }; - const auto &send_pull_request = [this](const ObjectID &object_id, - const NodeID &client_id) { - SendPullRequest(object_id, client_id); - }; - const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; - pull_manager_.reset(new PullManager(self_node_id_, object_is_local, send_pull_request, - restore_spilled_object_, get_time, - config.pull_timeout_ms)); - push_manager_.reset(new PushManager(/* max_chunks_in_flight= */ std::max( static_cast(1L), static_cast(config_.max_bytes_in_flight / config_.object_chunk_size)))); @@ -99,14 +87,40 @@ ObjectManager::ObjectManager(asio::io_service &main_service, const NodeID &self_ main_service, config_.store_socket_name); } + const auto &object_is_local = [this](const ObjectID &object_id) { + return local_objects_.count(object_id) != 0; + }; + const auto &send_pull_request = [this](const ObjectID &object_id, + const NodeID &client_id) { + SendPullRequest(object_id, client_id); + }; + const auto &get_time = []() { return absl::GetCurrentTimeNanos() / 1e9; }; + int64_t available_memory = config.object_store_memory; + if (available_memory < 0) { + available_memory = 0; + } + pull_manager_.reset(new PullManager( + self_node_id_, object_is_local, send_pull_request, restore_spilled_object_, + get_time, config.pull_timeout_ms, 
available_memory, + [spill_objects_callback, object_store_full_callback]() { + // TODO(swang): This copies the out-of-memory handling in the + // CreateRequestQueue. It would be nice to unify these. + if (object_store_full_callback) { + object_store_full_callback(); + } + + static_cast(spill_objects_callback()); + })); + store_notification_->SubscribeObjAdded( [this](const object_manager::protocol::ObjectInfoT &object_info) { HandleObjectAdded(object_info); }); store_notification_->SubscribeObjDeleted([this](const ObjectID &oid) { - // TODO(swang): We may want to force the pull manager to fetch this object - // again, in case it was needed by an active pull request. NotifyDirectoryObjectDeleted(oid); + // Ask the pull manager to fetch this object again as soon as possible, if + // it was needed by an active pull request. + pull_manager_->ResetRetryTimer(oid); }); // Start object manager rpc server and send & receive request threads @@ -206,8 +220,8 @@ uint64_t ObjectManager::Pull(const std::vector &object_ref const auto &callback = [this](const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url) { - pull_manager_->OnLocationChange(object_id, client_ids, spilled_url); + const std::string &spilled_url, size_t object_size) { + pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, object_size); }; for (const auto &ref : objects_to_locate) { @@ -499,7 +513,7 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &lookup_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url) { + const std::string &spilled_url, size_t object_size) { auto &wait_state = active_wait_requests_.find(wait_id)->second; // Note that the object is guaranteed to be added to local_objects_ before // the notification is triggered. 
@@ -540,7 +554,7 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { wait_id, object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &subscribe_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url) { + const std::string &spilled_url, size_t object_size) { auto object_id_wait_state = active_wait_requests_.find(wait_id); if (object_id_wait_state == active_wait_requests_.end()) { // Depending on the timing of calls to the object directory, we @@ -822,6 +836,16 @@ void ObjectManager::Tick(const boost::system::error_code &e) { << ". Please file a bug report on here: " "https://github.com/ray-project/ray/issues"; + // Request the current available memory from the object + // store. + if (plasma::plasma_store_runner) { + plasma::plasma_store_runner->GetAvailableMemoryAsync([this](size_t available_memory) { + main_service_->post([this, available_memory]() { + pull_manager_->UpdatePullsBasedOnAvailableMemory(available_memory); + }); + }); + } + pull_manager_->Tick(); auto interval = boost::posix_time::milliseconds(config_.timer_freq_ms); diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index df11a4bb750f..efc37b3e8d8c 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -126,6 +126,10 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( return; } + if (reply.object_size() > 0) { + it->second.object_size = reply.object_size(); + } + std::unordered_set node_ids; for (auto const &node_id : reply.node_ids()) { node_ids.emplace(NodeID::FromBinary(node_id)); @@ -141,7 +145,8 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( for (const auto &callback_pair : callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
- callback_pair.second(object_id, it->second.current_object_locations, ""); + callback_pair.second(object_id, it->second.current_object_locations, "", + it->second.object_size); } } @@ -208,7 +213,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. " << "LookupLocations returns an empty list of locations."; io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), ""); + callback(object_id, std::unordered_set(), "", 0); }); return Status::OK(); } @@ -229,7 +234,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( node_ids.emplace(NodeID::FromBinary(node_id)); } FilterRemovedNodes(gcs_client_, &node_ids); - callback(object_id, node_ids, ""); + callback(object_id, node_ids, "", reply.object_size()); }); return Status::OK(); } diff --git a/src/ray/object_manager/plasma/eviction_policy.h b/src/ray/object_manager/plasma/eviction_policy.h index 91788bb34ca5..d20d0b51eeb7 100644 --- a/src/ray/object_manager/plasma/eviction_policy.h +++ b/src/ray/object_manager/plasma/eviction_policy.h @@ -196,6 +196,8 @@ class EvictionPolicy { /// Returns debugging information for this eviction policy. 
virtual std::string DebugString() const; + int64_t GetPinnedMemoryBytes() const { return pinned_memory_bytes_; } + protected: /// Returns the size of the object int64_t GetObjectSize(const ObjectID &object_id) const; diff --git a/src/ray/object_manager/plasma/store.h b/src/ray/object_manager/plasma/store.h index ec338d388514..2ad3aad261c7 100644 --- a/src/ray/object_manager/plasma/store.h +++ b/src/ray/object_manager/plasma/store.h @@ -33,6 +33,7 @@ #include "ray/object_manager/plasma/connection.h" #include "ray/object_manager/plasma/create_request_queue.h" #include "ray/object_manager/plasma/plasma.h" +#include "ray/object_manager/plasma/plasma_allocator.h" #include "ray/object_manager/plasma/protocol.h" #include "ray/object_manager/plasma/quota_aware_policy.h" @@ -209,6 +210,12 @@ class PlasmaStore { /// Process queued requests to create an object. void ProcessCreateRequests(); + void GetAvailableMemory(std::function callback) const { + size_t available = + PlasmaAllocator::GetFootprintLimit() - eviction_policy_.GetPinnedMemoryBytes(); + callback(available); + } + private: PlasmaError HandleCreateObjectRequest(const std::shared_ptr &client, const std::vector &message, diff --git a/src/ray/object_manager/plasma/store_runner.h b/src/ray/object_manager/plasma/store_runner.h index 3edd70350cc2..7ac7be59bbc5 100644 --- a/src/ray/object_manager/plasma/store_runner.h +++ b/src/ray/object_manager/plasma/store_runner.h @@ -1,8 +1,7 @@ #pragma once -#include - #include +#include #include "absl/synchronization/mutex.h" #include "ray/object_manager/notification/object_store_notification_manager.h" @@ -23,6 +22,10 @@ class PlasmaStoreRunner { } bool IsPlasmaObjectSpillable(const ObjectID &object_id); + void GetAvailableMemoryAsync(std::function callback) const { + main_service_.post([this, callback]() { store_->GetAvailableMemory(callback); }); + } + private: void Shutdown(); absl::Mutex store_runner_mutex_; @@ -30,7 +33,7 @@ class PlasmaStoreRunner { int64_t system_memory_; 
bool hugepages_enabled_; std::string plasma_directory_; - boost::asio::io_service main_service_; + mutable boost::asio::io_service main_service_; std::unique_ptr store_; std::shared_ptr listener_; }; diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 289ad13eb5cc..1ebf9214a707 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -8,13 +8,16 @@ PullManager::PullManager( NodeID &self_node_id, const std::function object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms) + const std::function get_time, int pull_timeout_ms, + size_t num_bytes_available, std::function object_store_full_callback) : self_node_id_(self_node_id), object_is_local_(object_is_local), send_pull_request_(send_pull_request), restore_spilled_object_(restore_spilled_object), get_time_(get_time), pull_timeout_ms_(pull_timeout_ms), + num_bytes_available_(num_bytes_available), + object_store_full_callback_(object_store_full_callback), gen_(std::chrono::high_resolution_clock::now().time_since_epoch().count()) {} uint64_t PullManager::Pull(const std::vector &object_ref_bundle, @@ -39,33 +42,224 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b it->second.bundle_request_ids.insert(bundle_it->first); } + // We have a new request. Activate the new request, if the + // current available memory allows it. + UpdatePullsBasedOnAvailableMemory(num_bytes_available_); + return bundle_it->first; } +bool PullManager::ActivateNextPullBundleRequest( + const std::map>::iterator + &next_request_it) { + // Check that we have sizes for all of the objects in the bundle. If not, we + // should not activate the bundle, since it may put us over the available + // capacity. 
+ for (const auto &ref : next_request_it->second) { + auto obj_id = ObjectRefToId(ref); + const auto it = object_pull_requests_.find(obj_id); + RAY_CHECK(it != object_pull_requests_.end()); + if (!it->second.object_size_set) { + // NOTE(swang): The size could be 0 if we haven't received size + // information yet. If we receive the size later on, we will update the + // total bytes being pulled then. + RAY_LOG(DEBUG) << "No size for " << obj_id << ", canceling activation for pull " + << next_request_it->first; + return false; + } + } + + // Activate the bundle. + for (const auto &ref : next_request_it->second) { + auto obj_id = ObjectRefToId(ref); + bool start_pull = active_object_pull_requests_.count(obj_id) == 0; + active_object_pull_requests_[obj_id].insert(next_request_it->first); + if (start_pull) { + RAY_LOG(DEBUG) << "Activating pull for object " << obj_id; + // This is the first bundle request in the queue to require this object. + // Add the size to the number of bytes being pulled. + auto it = object_pull_requests_.find(obj_id); + RAY_CHECK(it != object_pull_requests_.end()); + num_bytes_being_pulled_ += it->second.object_size; + } + } + + // Update the pointer to the last pull request that we are actively pulling. 
+ RAY_CHECK(next_request_it->first > highest_req_id_being_pulled_); + highest_req_id_being_pulled_ = next_request_it->first; + return true; +} + +void PullManager::DeactivatePullBundleRequest( + const std::map>::iterator &request_it) { + for (const auto &ref : request_it->second) { + auto obj_id = ObjectRefToId(ref); + RAY_CHECK(active_object_pull_requests_[obj_id].erase(request_it->first)); + if (active_object_pull_requests_[obj_id].empty()) { + RAY_LOG(DEBUG) << "Deactivating pull for object " << obj_id; + auto it = object_pull_requests_.find(obj_id); + RAY_CHECK(it != object_pull_requests_.end()); + num_bytes_being_pulled_ -= it->second.object_size; + active_object_pull_requests_.erase(obj_id); + } + } + + // If this was the last active request, update the pointer to its + // predecessor, if one exists. + if (highest_req_id_being_pulled_ == request_it->first) { + if (request_it == pull_request_bundles_.begin()) { + highest_req_id_being_pulled_ = 0; + } else { + highest_req_id_being_pulled_ = std::prev(request_it)->first; + } + } +} + +void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) { + if (num_bytes_available_ != num_bytes_available) { + RAY_LOG(DEBUG) << "Updating pulls based on available memory: " << num_bytes_available; + } + num_bytes_available_ = num_bytes_available; + uint64_t prev_highest_req_id_being_pulled = highest_req_id_being_pulled_; + + std::unordered_set object_ids_to_pull; + // While there is available capacity, activate the next pull request. + while (num_bytes_being_pulled_ < num_bytes_available_) { + // Get the next pull request in the queue. + const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); + auto next_request_it = last_request_it; + if (next_request_it == pull_request_bundles_.end()) { + // No requests are active. Get the first request in the queue. 
+ next_request_it = pull_request_bundles_.begin(); + } else { + next_request_it++; + } + + if (next_request_it == pull_request_bundles_.end()) { + // No requests in the queue. + break; + } + + RAY_LOG(DEBUG) << "Activating request " << next_request_it->first + << " num bytes being pulled: " << num_bytes_being_pulled_ + << " num bytes available: " << num_bytes_available_; + // There is another pull bundle request that we could try, and there is + // enough space. Activate the next pull bundle request in the queue. + if (!ActivateNextPullBundleRequest(next_request_it)) { + // This pull bundle request could not be activated, due to lack of object + // size information. Wait until we have object size information before + // activating this pull bundle. + break; + } + } + + std::unordered_set object_ids_to_cancel; + // While the total bytes requested is over the available capacity, deactivate + // the last pull request, ordered by request ID. + while (num_bytes_being_pulled_ > num_bytes_available_) { + RAY_LOG(DEBUG) << "Deactivating request " << highest_req_id_being_pulled_ + << " num bytes being pulled: " << num_bytes_being_pulled_ + << " num bytes available: " << num_bytes_available_; + const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); + RAY_CHECK(last_request_it != pull_request_bundles_.end()); + DeactivatePullBundleRequest(last_request_it); + } + + TriggerOutOfMemoryHandlingIfNeeded(); + + if (highest_req_id_being_pulled_ > prev_highest_req_id_being_pulled) { + // There are newly activated requests. Start pulling objects for the newly + // activated requests. + // NOTE(swang): We could also just wait for the next timer tick to pull the + // objects, but this would add a delay of up to one tick for any bundles of + // multiple objects, even when we are not under memory pressure. + Tick(); + } +} + +void PullManager::TriggerOutOfMemoryHandlingIfNeeded() { + if (pull_request_bundles_.empty()) { + // No requests queued. 
+ return; + } + + const auto head = pull_request_bundles_.begin(); + if (highest_req_id_being_pulled_ >= head->first) { + // At least one request is being actively pulled, so there is currently + // enough space. + return; + } + + // No requests are being pulled. Check whether this is because we don't have + // object size information yet. + size_t num_bytes_needed = 0; + for (const auto &ref : head->second) { + auto obj_id = ObjectRefToId(ref); + const auto it = object_pull_requests_.find(obj_id); + RAY_CHECK(it != object_pull_requests_.end()); + if (!it->second.object_size_set) { + // We're not pulling the first request because we don't have size + // information. Wait for the size information before triggering OOM + return; + } + num_bytes_needed += it->second.object_size; + } + + // The first request in the queue is not being pulled due to lack of space. + // Trigger out-of-memory handling to try to make room. + // TODO(swang): This can hang if no room can be made. We should return an + // error for requests whose total size is larger than the capacity of the + // memory store. + if (get_time_() - last_oom_reported_ms_ > 30000) { + RAY_LOG(WARNING) + << "There is not enough memory to pull objects needed by a queued task or " + "a worker blocked in ray.get or ray.wait. " + << "Need " << num_bytes_needed << " bytes, but only " << num_bytes_available_ + << " bytes are available on this node. " + << "This job may hang if no memory can be freed through garbage collection or " + "object spilling. See " + "https://docs.ray.io/en/master/memory-management.html for more information. 
" + "Please file a GitHub issue if you see this message repeatedly."; + last_oom_reported_ms_ = get_time_(); + } + object_store_full_callback_(); +} + std::vector PullManager::CancelPull(uint64_t request_id) { - std::vector objects_to_cancel; RAY_LOG(DEBUG) << "Cancel pull request " << request_id; auto bundle_it = pull_request_bundles_.find(request_id); RAY_CHECK(bundle_it != pull_request_bundles_.end()); + // If the pull request was being actively pulled, deactivate it now. + if (bundle_it->first <= highest_req_id_being_pulled_) { + DeactivatePullBundleRequest(bundle_it); + } + + // Erase this pull request. + std::vector object_ids_to_cancel; for (const auto &ref : bundle_it->second) { auto obj_id = ObjectRefToId(ref); auto it = object_pull_requests_.find(obj_id); RAY_CHECK(it != object_pull_requests_.end()); - RAY_CHECK(it->second.bundle_request_ids.erase(request_id)); + RAY_CHECK(it->second.bundle_request_ids.erase(bundle_it->first)); if (it->second.bundle_request_ids.empty()) { object_pull_requests_.erase(it); - objects_to_cancel.push_back(obj_id); + object_ids_to_cancel.push_back(obj_id); } } - pull_request_bundles_.erase(bundle_it); - return objects_to_cancel; + + // We need to update the pulls in case there is another request(s) after this + // request that can now be activated. We do this after erasing the cancelled + // request to avoid reactivating it again. + UpdatePullsBasedOnAvailableMemory(num_bytes_available_); + + return object_ids_to_cancel; } void PullManager::OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url) { + const std::string &spilled_url, size_t object_size) { // Exit if the Pull request has already been fulfilled or canceled. auto it = object_pull_requests_.find(object_id); if (it == object_pull_requests_.end()) { @@ -77,6 +271,14 @@ void PullManager::OnLocationChange(const ObjectID &object_id, // before. 
it->second.client_locations = std::vector(client_ids.begin(), client_ids.end()); it->second.spilled_url = spilled_url; + + if (!it->second.object_size_set) { + RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size + << ", num bytes being pulled is now " << num_bytes_being_pulled_; + it->second.object_size = object_size; + it->second.object_size_set = true; + UpdatePullsBasedOnAvailableMemory(num_bytes_available_); + } RAY_LOG(DEBUG) << "OnLocationChange " << spilled_url << " num clients " << client_ids.size(); @@ -87,10 +289,11 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) { if (object_is_local_(object_id)) { return; } - auto it = object_pull_requests_.find(object_id); - if (it == object_pull_requests_.end()) { + if (active_object_pull_requests_.count(object_id) == 0) { return; } + auto it = object_pull_requests_.find(object_id); + RAY_CHECK(it != object_pull_requests_.end()); auto &request = it->second; if (request.next_pull_time > get_time_()) { return; @@ -174,6 +377,14 @@ bool PullManager::PullFromRandomLocation(const ObjectID &object_id) { return true; } +void PullManager::ResetRetryTimer(const ObjectID &object_id) { + auto it = object_pull_requests_.find(object_id); + if (it != object_pull_requests_.end()) { + it->second.next_pull_time = get_time_(); + it->second.num_retries = 0; + } +} + void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { const auto time = get_time_(); auto retry_timeout_len = (pull_timeout_ms_ / 1000.) 
* (1UL << request.num_retries); @@ -184,7 +395,7 @@ void PullManager::UpdateRetryTimer(ObjectPullRequest &request) { } void PullManager::Tick() { - for (auto &pair : object_pull_requests_) { + for (auto &pair : active_object_pull_requests_) { const auto &object_id = pair.first; TryToMakeObjectLocal(object_id); } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 6364ae34a68d..e4a662eb6306 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -40,9 +40,14 @@ class PullManager { NodeID &self_node_id, const std::function object_is_local, const std::function send_pull_request, const RestoreSpilledObjectCallback restore_spilled_object, - const std::function get_time, int pull_timeout_ms); - - /// Begin a new pull request for a bundle of objects. + const std::function get_time, int pull_timeout_ms, + size_t num_bytes_available, std::function object_store_full_callback); + + /// Add a new pull request for a bundle of objects. The objects in the + /// request will get pulled once: + /// 1. Their sizes are known. + /// 2. Their total size, together with the total size of all requests + /// preceding this one, is within the capacity of the local object store. /// /// \param object_refs The bundle of objects that must be made local. /// \param objects_to_locate The objects whose new locations the caller @@ -51,6 +56,15 @@ class PullManager { uint64_t Pull(const std::vector &object_ref_bundle, std::vector *objects_to_locate); + /// Update the pull requests that are currently being pulled, according to + /// the current capacity. The PullManager will choose the objects to pull by + /// taking the longest contiguous prefix of the request queue whose total + /// size is less than the given capacity. + /// + /// \param num_bytes_available The number of bytes that are currently + /// available to store objects pulled from another node. 
+ void UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available); + /// Called when the available locations for a given object change. /// /// \param object_id The ID of the object which is now available in a new location. @@ -60,7 +74,7 @@ class PullManager { /// non-empty, the object may no longer be on any node. void OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url); + const std::string &spilled_url, size_t object_size); /// Cancel an existing pull request. /// @@ -73,6 +87,13 @@ class PullManager { /// existing objects from other nodes if necessary. void Tick(); + /// Call to reset the retry timer for an object that is actively being + /// pulled. This should be called for objects that were evicted but that may + /// still be needed on this node. + /// + /// \param object_id The object ID to reset. + void ResetRetryTimer(const ObjectID &object_id); + /// The number of ongoing object pulls. int NumActiveRequests() const; @@ -89,6 +110,11 @@ class PullManager { std::string spilled_url; double next_pull_time; uint8_t num_retries; + bool object_size_set = false; + size_t object_size = 0; + // All bundle requests that haven't been canceled yet that require this + // object. This includes bundle requests whose objects are not actively + // being pulled. absl::flat_hash_set bundle_request_ids; }; @@ -112,6 +138,22 @@ class PullManager { /// \param request The request to update the retry time of. void UpdateRetryTimer(ObjectPullRequest &request); + /// Activate the next pull request in the queue. This will start pulls for + /// any objects in the request that are not already being pulled. + bool ActivateNextPullBundleRequest( + const std::map>::iterator + &next_request_it); + + /// Deactivate a pull request in the queue. This cancels any pull or restore + /// operations for the object. 
+ void DeactivatePullBundleRequest( + const std::map>::iterator &request_it); + + /// Trigger out-of-memory handling if the first request in the queue needs + /// more space than the bytes available. This is needed to make room for the + /// request. + void TriggerOutOfMemoryHandlingIfNeeded(); + /// See the constructor's arguments. NodeID self_node_id_; const std::function object_is_local_; @@ -124,13 +166,51 @@ class PullManager { /// cancel. Start at 1 because 0 means null. uint64_t next_req_id_ = 1; - std::unordered_map> pull_request_bundles_; - - /// The objects that this object manager is currently trying to fetch from - /// remote object managers. + /// The currently active pull requests. Each request is a bundle of objects + /// that must be made local. The key is the ID that was assigned to that + /// request, which can be used by the caller to cancel the request. + std::map> pull_request_bundles_; + + /// The total number of bytes that we are currently pulling. This is the + /// total size of the objects requested that we are actively pulling. To + /// avoid starvation, this is always less than the available capacity in the + /// local object store. + size_t num_bytes_being_pulled_ = 0; + + /// The total number of bytes that is available to store objects that we are + /// pulling. + size_t num_bytes_available_; + + /// Triggered when the first request in the queue can't be pulled due to + /// out-of-memory. This callback should try to make more bytes available. + std::function object_store_full_callback_; + + /// The last time OOM was reported. Track this so we don't spam warnings when + /// the object store is full. + uint64_t last_oom_reported_ms_ = 0; + + /// A pointer to the highest request ID whose objects we are currently + /// pulling. We always pull a contiguous prefix of the active pull requests. + /// This means that all requests with a lower ID are either already canceled + /// or their objects are also being pulled. 
+ uint64_t highest_req_id_being_pulled_ = 0; + + /// The objects that this object manager has been asked to fetch from remote + /// object managers. std::unordered_map object_pull_requests_; + /// The objects that we are currently fetching. This is a subset of the + /// objects that we have been asked to fetch. The total size of these objects + /// is the number of bytes that we are currently pulling, and it must be less + /// than the bytes available. + absl::flat_hash_map> + active_object_pull_requests_; + /// Internally maintained random number generator. std::mt19937_64 gen_; + + friend class PullManagerTest; + friend class PullManagerTestWithCapacity; + friend class PullManagerWithAdmissionControlTest; }; } // namespace ray diff --git a/src/ray/object_manager/test/object_manager_stress_test.cc b/src/ray/object_manager/test/object_manager_stress_test.cc deleted file mode 100644 index 8896ba9968db..000000000000 --- a/src/ray/object_manager/test/object_manager_stress_test.cc +++ /dev/null @@ -1,453 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "ray/common/common_protocol.h" -#include "ray/common/status.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_client/service_based_gcs_client.h" -#include "ray/object_manager/object_manager.h" -#include "ray/util/filesystem.h" -#include "src/ray/protobuf/common.pb.h" - -extern "C" { -#include "hiredis/hiredis.h" -} - -namespace ray { - -using rpc::GcsNodeInfo; - -static inline bool flushall_redis(void) { - redisContext *context = redisConnect("127.0.0.1", 6379); - if (context == nullptr || context->err) { - return false; - } - freeReplyObject(redisCommand(context, "FLUSHALL")); - freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); - freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); - redisFree(context); - - redisContext *shard_context = redisConnect("127.0.0.1", 6380); - if (shard_context == nullptr || shard_context->err) { - return false; - } - freeReplyObject(redisCommand(shard_context, "FLUSHALL")); - redisFree(shard_context); - - return true; -} - -int64_t current_time_ms() { - std::chrono::milliseconds ms_since_epoch = - std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()); - return ms_since_epoch.count(); -} - -class MockServer { - public: - MockServer(boost::asio::io_service &main_service, - const ObjectManagerConfig &object_manager_config, - std::shared_ptr gcs_client) - : node_id_(NodeID::FromRandom()), - config_(object_manager_config), - gcs_client_(gcs_client), - object_manager_(main_service, node_id_, object_manager_config, - std::make_shared(main_service, gcs_client_), - nullptr) { - RAY_CHECK_OK(RegisterGcs(main_service)); - } - - ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } - - private: - ray::Status RegisterGcs(boost::asio::io_service &io_service) { - auto object_manager_port = object_manager_.GetServerPort(); - GcsNodeInfo node_info; - 
node_info.set_node_id(node_id_.Binary()); - node_info.set_node_manager_address("127.0.0.1"); - node_info.set_node_manager_port(object_manager_port); - node_info.set_object_manager_port(object_manager_port); - - ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); - std::this_thread::sleep_for(std::chrono::milliseconds(5000)); - return status; - } - - friend class StressTestObjectManager; - - NodeID node_id_; - ObjectManagerConfig config_; - std::shared_ptr gcs_client_; - ObjectManager object_manager_; -}; - -class TestObjectManagerBase : public ::testing::Test { - public: - void SetUp() { - WaitForCondition(flushall_redis, 7000); - - // start store - socket_name_1 = TestSetupUtil::StartObjectStore(); - socket_name_2 = TestSetupUtil::StartObjectStore(); - - unsigned int pull_timeout_ms = 1000; - uint64_t object_chunk_size = static_cast(std::pow(10, 3)); - int push_timeout_ms = 10000; - - // start first server - gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); - gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", - /*is_test_client=*/false); - gcs_client_1 = std::make_shared(client_options); - RAY_CHECK_OK(gcs_client_1->Connect(main_service)); - ObjectManagerConfig om_config_1; - om_config_1.store_socket_name = socket_name_1; - om_config_1.pull_timeout_ms = pull_timeout_ms; - om_config_1.object_chunk_size = object_chunk_size; - om_config_1.push_timeout_ms = push_timeout_ms; - om_config_1.object_manager_port = 0; - om_config_1.rpc_service_threads_number = 3; - server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); - - // start second server - gcs_client_2 = std::make_shared(client_options); - RAY_CHECK_OK(gcs_client_2->Connect(main_service)); - ObjectManagerConfig om_config_2; - om_config_2.store_socket_name = socket_name_2; - om_config_2.pull_timeout_ms = pull_timeout_ms; - om_config_2.object_chunk_size = object_chunk_size; - om_config_2.push_timeout_ms = push_timeout_ms; - 
om_config_2.object_manager_port = 0; - om_config_2.rpc_service_threads_number = 3; - server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); - - // connect to stores. - RAY_CHECK_OK(client1.Connect(socket_name_1)); - RAY_CHECK_OK(client2.Connect(socket_name_2)); - } - - void TearDown() { - Status client1_status = client1.Disconnect(); - Status client2_status = client2.Disconnect(); - ASSERT_TRUE(client1_status.ok() && client2_status.ok()); - - gcs_client_1->Disconnect(); - gcs_client_2->Disconnect(); - - this->server1.reset(); - this->server2.reset(); - - TestSetupUtil::StopObjectStore(socket_name_1); - TestSetupUtil::StopObjectStore(socket_name_2); - - if (!gcs_server_socket_name_.empty()) { - TestSetupUtil::StopGcsServer(gcs_server_socket_name_); - } - } - - ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { - ObjectID object_id = ObjectID::FromRandom(); - RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; - uint8_t metadata[] = {5}; - int64_t metadata_size = sizeof(metadata); - uint64_t retry_with_request_id = 0; - std::shared_ptr data; - RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, - metadata_size, &retry_with_request_id, &data)); - RAY_CHECK(retry_with_request_id == 0); - RAY_CHECK_OK(client.Seal(object_id)); - return object_id; - } - - void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; - - void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; - - protected: - std::thread p; - boost::asio::io_service main_service; - std::shared_ptr gcs_client_1; - std::shared_ptr gcs_client_2; - std::unique_ptr server1; - std::unique_ptr server2; - - plasma::PlasmaClient client1; - plasma::PlasmaClient client2; - std::vector v1; - std::vector v2; - - std::string gcs_server_socket_name_; - std::string socket_name_1; - std::string socket_name_2; -}; - -class StressTestObjectManager : public TestObjectManagerBase { - public: - enum class 
TransferPattern { - PUSH_A_B, - PUSH_B_A, - BIDIRECTIONAL_PUSH, - PULL_A_B, - PULL_B_A, - BIDIRECTIONAL_PULL, - BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE, - }; - - int async_loop_index = -1; - size_t num_expected_objects; - - std::vector async_loop_patterns = { - TransferPattern::PUSH_A_B, - TransferPattern::PUSH_B_A, - TransferPattern::BIDIRECTIONAL_PUSH, - TransferPattern::PULL_A_B, - TransferPattern::PULL_B_A, - TransferPattern::BIDIRECTIONAL_PULL, - TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE}; - - int num_connected_clients = 0; - - NodeID node_id_1; - NodeID node_id_2; - - int64_t start_time; - - void WaitConnections() { - node_id_1 = gcs_client_1->Nodes().GetSelfId(); - node_id_2 = gcs_client_2->Nodes().GetSelfId(); - RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( - [this](const NodeID &node_id, const GcsNodeInfo &data) { - if (node_id == node_id_1 || node_id == node_id_2) { - num_connected_clients += 1; - } - if (num_connected_clients == 4) { - StartTests(); - } - }, - nullptr)); - RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( - [this](const NodeID &node_id, const GcsNodeInfo &data) { - if (node_id == node_id_1 || node_id == node_id_2) { - num_connected_clients += 1; - } - if (num_connected_clients == 4) { - StartTests(); - } - }, - nullptr)); - } - - void StartTests() { - TestConnections(); - AddTransferTestHandlers(); - TransferTestNext(); - } - - void AddTransferTestHandlers() { - ray::Status status = ray::Status::OK(); - status = server1->object_manager_.SubscribeObjAdded( - [this](const object_manager::protocol::ObjectInfoT &object_info) { - object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); - if (v1.size() == num_expected_objects && v1.size() == v2.size()) { - TransferTestComplete(); - } - }); - RAY_CHECK_OK(status); - status = server2->object_manager_.SubscribeObjAdded( - [this](const object_manager::protocol::ObjectInfoT &object_info) { - 
object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); - if (v2.size() == num_expected_objects && v1.size() == v2.size()) { - TransferTestComplete(); - } - }); - RAY_CHECK_OK(status); - } - - void TransferTestNext() { - async_loop_index += 1; - if ((size_t)async_loop_index < async_loop_patterns.size()) { - TransferPattern pattern = async_loop_patterns[async_loop_index]; - TransferTestExecute(100, 3 * std::pow(10, 3) - 1, pattern); - } else { - main_service.stop(); - } - } - - plasma::ObjectBuffer GetObject(plasma::PlasmaClient &client, ObjectID &object_id) { - plasma::ObjectBuffer object_buffer; - RAY_CHECK_OK(client.Get(&object_id, 1, 0, &object_buffer)); - return object_buffer; - } - - void CompareObjects(ObjectID &object_id_1, ObjectID &object_id_2) { - plasma::ObjectBuffer object_buffer_1 = GetObject(client1, object_id_1); - plasma::ObjectBuffer object_buffer_2 = GetObject(client2, object_id_2); - uint8_t *data_1 = const_cast(object_buffer_1.data->Data()); - uint8_t *data_2 = const_cast(object_buffer_2.data->Data()); - ASSERT_EQ(object_buffer_1.data->Size(), object_buffer_2.data->Size()); - ASSERT_EQ(object_buffer_1.metadata->Size(), object_buffer_2.metadata->Size()); - int64_t total_size = object_buffer_1.data->Size() + object_buffer_1.metadata->Size(); - RAY_LOG(DEBUG) << "total_size " << total_size; - for (int i = -1; ++i < total_size;) { - ASSERT_TRUE(data_1[i] == data_2[i]); - } - } - - void TransferTestComplete() { - int64_t elapsed = current_time_ms() - start_time; - RAY_LOG(INFO) << "TransferTestComplete: " - << static_cast(async_loop_patterns[async_loop_index]) << " " - << v1.size() << " " << elapsed; - ASSERT_TRUE(v1.size() == v2.size()); - for (size_t i = 0; i < v1.size(); ++i) { - ASSERT_TRUE(std::find(v1.begin(), v1.end(), v2[i]) != v1.end()); - } - - // Compare objects and their hashes. 
- for (size_t i = 0; i < v1.size(); ++i) { - ObjectID object_id_2 = v2[i]; - ObjectID object_id_1 = - v1[std::distance(v1.begin(), std::find(v1.begin(), v1.end(), v2[i]))]; - CompareObjects(object_id_1, object_id_2); - } - - v1.clear(); - v2.clear(); - TransferTestNext(); - } - - void TransferTestExecute(int num_trials, int64_t data_size, - TransferPattern transfer_pattern) { - NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); - NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); - - if (transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL || - transfer_pattern == TransferPattern::BIDIRECTIONAL_PUSH || - transfer_pattern == TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE) { - num_expected_objects = (size_t)2 * num_trials; - } else { - num_expected_objects = (size_t)num_trials; - } - - start_time = current_time_ms(); - - switch (transfer_pattern) { - case TransferPattern::PUSH_A_B: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid1 = WriteDataToClient(client1, data_size); - server1->object_manager_.Push(oid1, node_id_2); - } - } break; - case TransferPattern::PUSH_B_A: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid2 = WriteDataToClient(client2, data_size); - server2->object_manager_.Push(oid2, node_id_1); - } - } break; - case TransferPattern::BIDIRECTIONAL_PUSH: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid1 = WriteDataToClient(client1, data_size); - server1->object_manager_.Push(oid1, node_id_2); - ObjectID oid2 = WriteDataToClient(client2, data_size); - server2->object_manager_.Push(oid2, node_id_1); - } - } break; - case TransferPattern::PULL_A_B: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid1 = WriteDataToClient(client1, data_size); - static_cast( - server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); - } - } break; - case TransferPattern::PULL_B_A: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid2 = WriteDataToClient(client2, data_size); - static_cast( - 
server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); - } - } break; - case TransferPattern::BIDIRECTIONAL_PULL: { - for (int i = -1; ++i < num_trials;) { - ObjectID oid1 = WriteDataToClient(client1, data_size); - static_cast( - server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); - ObjectID oid2 = WriteDataToClient(client2, data_size); - static_cast( - server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); - } - } break; - case TransferPattern::BIDIRECTIONAL_PULL_VARIABLE_DATA_SIZE: { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution<> dis(1, 50); - for (int i = -1; ++i < num_trials;) { - ObjectID oid1 = WriteDataToClient(client1, data_size + dis(gen)); - static_cast( - server2->object_manager_.Pull({ObjectIdToRef(oid1, rpc::Address())})); - ObjectID oid2 = WriteDataToClient(client2, data_size + dis(gen)); - static_cast( - server1->object_manager_.Pull({ObjectIdToRef(oid2, rpc::Address())})); - } - } break; - default: { - RAY_LOG(FATAL) << "No case for transfer_pattern " - << static_cast(transfer_pattern); - } break; - } - } - - void TestConnections() { - RAY_LOG(DEBUG) << "\n" - << "Server node ids:" - << "\n"; - NodeID node_id_1 = gcs_client_1->Nodes().GetSelfId(); - NodeID node_id_2 = gcs_client_2->Nodes().GetSelfId(); - RAY_LOG(DEBUG) << "Server 1: " << node_id_1 << "\n" - << "Server 2: " << node_id_2; - - RAY_LOG(DEBUG) << "\n" - << "All connected nodes:" - << "\n"; - auto data = gcs_client_1->Nodes().Get(node_id_1); - RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data->node_id()) << "\n" - << "NodeIp=" << data->node_manager_address() << "\n" - << "NodePort=" << data->node_manager_port(); - auto data2 = gcs_client_1->Nodes().Get(node_id_2); - RAY_LOG(DEBUG) << "NodeID=" << NodeID::FromBinary(data2->node_id()) << "\n" - << "NodeIp=" << data2->node_manager_address() << "\n" - << "NodePort=" << data2->node_manager_port(); - } -}; - -TEST_F(StressTestObjectManager, 
StartStressTestObjectManager) { - auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); - AsyncStartTests(); - main_service.run(); -} - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); - ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[2]); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/object_manager/test/object_manager_test.cc b/src/ray/object_manager/test/object_manager_test.cc deleted file mode 100644 index 7afe2e42ef03..000000000000 --- a/src/ray/object_manager/test/object_manager_test.cc +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright 2017 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "ray/object_manager/object_manager.h" - -#include -#include - -#include "gtest/gtest.h" -#include "ray/common/status.h" -#include "ray/common/test_util.h" -#include "ray/gcs/gcs_client/service_based_gcs_client.h" -#include "ray/util/filesystem.h" -#include "src/ray/protobuf/common.pb.h" - -extern "C" { -#include "hiredis/hiredis.h" -} - -namespace { -int64_t wait_timeout_ms; -} // namespace - -namespace ray { - -using rpc::GcsNodeInfo; - -static inline void flushall_redis(void) { - redisContext *context = redisConnect("127.0.0.1", 6379); - freeReplyObject(redisCommand(context, "FLUSHALL")); - freeReplyObject(redisCommand(context, "SET NumRedisShards 1")); - freeReplyObject(redisCommand(context, "LPUSH RedisShards 127.0.0.1:6380")); - redisFree(context); -} - -class MockServer { - public: - MockServer(boost::asio::io_service &main_service, - const ObjectManagerConfig &object_manager_config, - std::shared_ptr gcs_client) - : node_id_(NodeID::FromRandom()), - config_(object_manager_config), - gcs_client_(gcs_client), - object_manager_(main_service, node_id_, object_manager_config, - std::make_shared(main_service, gcs_client_), - nullptr) { - RAY_CHECK_OK(RegisterGcs(main_service)); - } - - ~MockServer() { RAY_CHECK_OK(gcs_client_->Nodes().UnregisterSelf()); } - - private: - ray::Status RegisterGcs(boost::asio::io_service &io_service) { - auto object_manager_port = object_manager_.GetServerPort(); - GcsNodeInfo node_info; - node_info.set_node_id(node_id_.Binary()); - node_info.set_node_manager_address("127.0.0.1"); - node_info.set_node_manager_port(object_manager_port); - node_info.set_object_manager_port(object_manager_port); - - ray::Status status = gcs_client_->Nodes().RegisterSelf(node_info, nullptr); - return status; - } - - friend class TestObjectManager; - - NodeID node_id_; - ObjectManagerConfig config_; - std::shared_ptr gcs_client_; - ObjectManager object_manager_; -}; - -class TestObjectManagerBase : public ::testing::Test { - public: - void 
SetUp() { - flushall_redis(); - - // start store - socket_name_1 = TestSetupUtil::StartObjectStore(); - socket_name_2 = TestSetupUtil::StartObjectStore(); - - unsigned int pull_timeout_ms = 1; - push_timeout_ms = 1500; - - // start first server - gcs_server_socket_name_ = TestSetupUtil::StartGcsServer("127.0.0.1"); - gcs::GcsClientOptions client_options("127.0.0.1", 6379, /*password*/ "", - /*is_test_client=*/true); - gcs_client_1 = std::make_shared(client_options); - RAY_CHECK_OK(gcs_client_1->Connect(main_service)); - ObjectManagerConfig om_config_1; - om_config_1.store_socket_name = socket_name_1; - om_config_1.pull_timeout_ms = pull_timeout_ms; - om_config_1.object_chunk_size = object_chunk_size; - om_config_1.push_timeout_ms = push_timeout_ms; - om_config_1.object_manager_port = 0; - om_config_1.rpc_service_threads_number = 3; - server1.reset(new MockServer(main_service, om_config_1, gcs_client_1)); - - // start second server - gcs_client_2 = std::make_shared(client_options); - RAY_CHECK_OK(gcs_client_2->Connect(main_service)); - ObjectManagerConfig om_config_2; - om_config_2.store_socket_name = socket_name_2; - om_config_2.pull_timeout_ms = pull_timeout_ms; - om_config_2.object_chunk_size = object_chunk_size; - om_config_2.push_timeout_ms = push_timeout_ms; - om_config_2.object_manager_port = 0; - om_config_2.rpc_service_threads_number = 3; - server2.reset(new MockServer(main_service, om_config_2, gcs_client_2)); - - // connect to stores. 
- RAY_CHECK_OK(client1.Connect(socket_name_1)); - RAY_CHECK_OK(client2.Connect(socket_name_2)); - } - - void TearDown() { - Status client1_status = client1.Disconnect(); - Status client2_status = client2.Disconnect(); - ASSERT_TRUE(client1_status.ok() && client2_status.ok()); - - gcs_client_1->Disconnect(); - gcs_client_2->Disconnect(); - - this->server1.reset(); - this->server2.reset(); - - TestSetupUtil::StopObjectStore(socket_name_1); - TestSetupUtil::StopObjectStore(socket_name_2); - - if (!gcs_server_socket_name_.empty()) { - TestSetupUtil::StopGcsServer(gcs_server_socket_name_); - } - } - - ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size) { - return WriteDataToClient(client, data_size, ObjectID::FromRandom()); - } - - ObjectID WriteDataToClient(plasma::PlasmaClient &client, int64_t data_size, - ObjectID object_id) { - RAY_LOG(DEBUG) << "ObjectID Created: " << object_id; - uint8_t metadata[] = {5}; - int64_t metadata_size = sizeof(metadata); - uint64_t retry_with_request_id = 0; - std::shared_ptr data; - RAY_CHECK_OK(client.Create(object_id, ray::rpc::Address(), data_size, metadata, - metadata_size, &retry_with_request_id, &data)); - RAY_CHECK(retry_with_request_id == 0); - RAY_CHECK_OK(client.Seal(object_id)); - return object_id; - } - - void object_added_handler_1(ObjectID object_id) { v1.push_back(object_id); }; - - void object_added_handler_2(ObjectID object_id) { v2.push_back(object_id); }; - - protected: - std::thread p; - boost::asio::io_service main_service; - std::shared_ptr gcs_client_1; - std::shared_ptr gcs_client_2; - std::unique_ptr server1; - std::unique_ptr server2; - - plasma::PlasmaClient client1; - plasma::PlasmaClient client2; - std::vector v1; - std::vector v2; - - std::string gcs_server_socket_name_; - std::string socket_name_1; - std::string socket_name_2; - - unsigned int push_timeout_ms; - - uint64_t object_chunk_size = static_cast(std::pow(10, 3)); -}; - -class TestObjectManager : public 
TestObjectManagerBase { - public: - int current_wait_test = -1; - int num_connected_clients_1 = 0; - int num_connected_clients_2 = 0; - std::atomic ready_cnt; - NodeID node_id_1; - NodeID node_id_2; - - ObjectID created_object_id1; - ObjectID created_object_id2; - - std::unique_ptr timer; - - void WaitConnections() { - node_id_1 = gcs_client_1->Nodes().GetSelfId(); - node_id_2 = gcs_client_2->Nodes().GetSelfId(); - RAY_CHECK_OK(gcs_client_1->Nodes().AsyncSubscribeToNodeChange( - [this](const NodeID &node_id, const GcsNodeInfo &data) { - if (node_id == node_id_1 || node_id == node_id_2) { - num_connected_clients_1 += 1; - } - if (num_connected_clients_1 == 2) { - ready_cnt += 1; - if (ready_cnt == 2) { - StartTests(); - } - } - }, - nullptr)); - RAY_CHECK_OK(gcs_client_2->Nodes().AsyncSubscribeToNodeChange( - [this](const NodeID &node_id, const GcsNodeInfo &data) { - if (node_id == node_id_1 || node_id == node_id_2) { - num_connected_clients_2 += 1; - } - if (num_connected_clients_2 == 2) { - ready_cnt += 1; - if (ready_cnt == 2) { - StartTests(); - } - } - }, - nullptr)); - } - - void StartTests() { - TestConnections(); - TestNotifications(); - } - - void TestNotifications() { - ray::Status status = ray::Status::OK(); - status = server1->object_manager_.SubscribeObjAdded( - [this](const object_manager::protocol::ObjectInfoT &object_info) { - object_added_handler_1(ObjectID::FromBinary(object_info.object_id)); - NotificationTestCompleteIfSatisfied(); - }); - RAY_CHECK_OK(status); - status = server2->object_manager_.SubscribeObjAdded( - [this](const object_manager::protocol::ObjectInfoT &object_info) { - object_added_handler_2(ObjectID::FromBinary(object_info.object_id)); - NotificationTestCompleteIfSatisfied(); - }); - RAY_CHECK_OK(status); - - size_t data_size = 1000000; - - // dummy_id is not local. The push function will timeout. 
- ObjectID dummy_id = ObjectID::FromRandom(); - server1->object_manager_.Push(dummy_id, gcs_client_2->Nodes().GetSelfId()); - - created_object_id1 = ObjectID::FromRandom(); - WriteDataToClient(client1, data_size, created_object_id1); - // Server1 holds Object1 so this Push call will success. - server1->object_manager_.Push(created_object_id1, gcs_client_2->Nodes().GetSelfId()); - - // This timer is used to guarantee that the Push function for dummy_id will timeout. - timer.reset(new boost::asio::deadline_timer(main_service)); - auto period = boost::posix_time::milliseconds(push_timeout_ms + 10); - timer->expires_from_now(period); - created_object_id2 = ObjectID::FromRandom(); - timer->async_wait([this, data_size](const boost::system::error_code &error) { - WriteDataToClient(client2, data_size, created_object_id2); - }); - } - - void NotificationTestCompleteIfSatisfied() { - size_t num_expected_objects1 = 1; - size_t num_expected_objects2 = 2; - if (v1.size() == num_expected_objects1 && v2.size() == num_expected_objects2) { - SubscribeObjectThenWait(); - } - } - - void SubscribeObjectThenWait() { - int data_size = 100; - // Test to ensure Wait works properly during an active subscription to the same - // object. 
- ObjectID object_1 = WriteDataToClient(client2, data_size); - ObjectID object_2 = WriteDataToClient(client2, data_size); - server2->object_manager_.Push(object_1, gcs_client_1->Nodes().GetSelfId()); - server2->object_manager_.Push(object_2, gcs_client_1->Nodes().GetSelfId()); - - UniqueID sub_id = ray::UniqueID::FromRandom(); - RAY_CHECK_OK(server1->object_manager_.object_directory_->SubscribeObjectLocations( - sub_id, object_1, rpc::Address(), - [this, sub_id, object_1, object_2](const ray::ObjectID &object_id, - const std::unordered_set &clients, - const std::string &spilled_url) { - if (!clients.empty()) { - TestWaitWhileSubscribed(sub_id, object_1, object_2); - } - })); - } - - void TestWaitWhileSubscribed(UniqueID sub_id, ObjectID object_1, ObjectID object_2) { - int required_objects = 1; - int timeout_ms = 1500; - - std::vector object_ids = {object_1, object_2}; - boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); - - UniqueID wait_id = UniqueID::FromRandom(); - - RAY_CHECK_OK(server1->object_manager_.AddWaitRequest( - wait_id, object_ids, std::unordered_map(), timeout_ms, - required_objects, - [this, sub_id, object_1, object_ids, start_time]( - const std::vector &found, - const std::vector &remaining) { - int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) - .total_milliseconds(); - RAY_LOG(DEBUG) << "elapsed " << elapsed; - RAY_LOG(DEBUG) << "found " << found.size(); - RAY_LOG(DEBUG) << "remaining " << remaining.size(); - RAY_CHECK(found.size() == 1); - // There's nothing more to test. A check will fail if unexpected behavior is - // triggered. - RAY_CHECK_OK( - server1->object_manager_.object_directory_->UnsubscribeObjectLocations( - sub_id, object_1)); - NextWaitTest(); - })); - - // Skip lookups and rely on Subscribe only to test subscribe interaction. 
- server1->object_manager_.SubscribeRemainingWaitObjects(wait_id); - } - - void NextWaitTest() { - int data_size = 600; - current_wait_test += 1; - switch (current_wait_test) { - case 0: { - // Ensure timeout_ms = 0 is handled correctly. - // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. - TestWait(data_size, 5, 3, /*timeout_ms=*/0, false, false); - } break; - case 1: { - // Ensure timeout_ms = 1500 is handled correctly. - // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. - TestWait(data_size, 5, 3, wait_timeout_ms, false, false); - } break; - case 2: { - // Generate objects locally to ensure local object code-path works properly. - // Out of 5 objects, we expect 3 ready objects and 2 remaining objects. - TestWait(data_size, 5, 3, wait_timeout_ms, false, /*test_local=*/true); - } break; - case 3: { - // Wait on an object that's never registered with GCS to ensure timeout works - // properly. - TestWait(data_size, /*num_objects=*/5, /*required_objects=*/6, wait_timeout_ms, - /*include_nonexistent=*/true, false); - } break; - case 4: { - // Ensure infinite time code-path works properly. 
- TestWait(data_size, 5, 5, /*timeout_ms=*/-1, false, false); - } break; - } - } - - void TestWait(int data_size, int num_objects, uint64_t required_objects, int timeout_ms, - bool include_nonexistent, bool test_local) { - std::vector object_ids; - for (int i = -1; ++i < num_objects;) { - ObjectID oid; - if (test_local) { - oid = WriteDataToClient(client1, data_size); - } else { - oid = WriteDataToClient(client2, data_size); - server2->object_manager_.Push(oid, gcs_client_1->Nodes().GetSelfId()); - } - object_ids.push_back(oid); - } - if (include_nonexistent) { - num_objects += 1; - object_ids.push_back(ObjectID::FromRandom()); - } - - boost::posix_time::ptime start_time = boost::posix_time::second_clock::local_time(); - RAY_CHECK_OK(server1->object_manager_.Wait( - object_ids, std::unordered_map(), timeout_ms, - required_objects, - [this, object_ids, num_objects, timeout_ms, required_objects, start_time]( - const std::vector &found, - const std::vector &remaining) { - int64_t elapsed = (boost::posix_time::second_clock::local_time() - start_time) - .total_milliseconds(); - RAY_LOG(DEBUG) << "elapsed " << elapsed; - RAY_LOG(DEBUG) << "found " << found.size(); - RAY_LOG(DEBUG) << "remaining " << remaining.size(); - - // Ensure object order is preserved for all invocations. - size_t j = 0; - size_t k = 0; - for (size_t i = 0; i < object_ids.size(); ++i) { - ObjectID oid = object_ids[i]; - // Make sure the object is in either the found vector or the remaining vector. - if (j < found.size() && found[j] == oid) { - j += 1; - } - if (k < remaining.size() && remaining[k] == oid) { - k += 1; - } - } - if (!found.empty()) { - ASSERT_EQ(j, found.size()); - } - if (!remaining.empty()) { - ASSERT_EQ(k, remaining.size()); - } - - switch (current_wait_test) { - case 0: { - // Ensure timeout_ms = 0 returns expected number of found and remaining - // objects. 
- ASSERT_TRUE(found.size() <= required_objects); - ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); - NextWaitTest(); - } break; - case 1: { - // Ensure lookup succeeds as expected when timeout_ms = 1500. - ASSERT_TRUE(found.size() >= required_objects); - ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); - NextWaitTest(); - } break; - case 2: { - // Ensure lookup succeeds as expected when objects are local. - ASSERT_TRUE(found.size() >= required_objects); - ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); - NextWaitTest(); - } break; - case 3: { - // Ensure lookup returns after timeout_ms elapses when one object doesn't - // exist. - ASSERT_TRUE(elapsed >= timeout_ms); - ASSERT_TRUE(static_cast(found.size() + remaining.size()) == num_objects); - NextWaitTest(); - } break; - case 4: { - // Ensure timeout_ms = -1 works properly. - ASSERT_TRUE(static_cast(found.size()) == num_objects); - ASSERT_TRUE(remaining.size() == 0); - TestWaitComplete(); - } break; - } - })); - } - - void TestWaitComplete() { main_service.stop(); } - - void TestConnections() { - RAY_LOG(DEBUG) << "\n" - << "Server node ids:" - << "\n"; - auto data = gcs_client_1->Nodes().Get(node_id_1); - RAY_LOG(DEBUG) << (NodeID::FromBinary(data->node_id()).IsNil()); - RAY_LOG(DEBUG) << "Server 1 NodeID=" << NodeID::FromBinary(data->node_id()); - RAY_LOG(DEBUG) << "Server 1 NodeIp=" << data->node_manager_address(); - RAY_LOG(DEBUG) << "Server 1 NodePort=" << data->node_manager_port(); - ASSERT_EQ(node_id_1, NodeID::FromBinary(data->node_id())); - auto data2 = gcs_client_1->Nodes().Get(node_id_2); - RAY_LOG(DEBUG) << "Server 2 NodeID=" << NodeID::FromBinary(data2->node_id()); - RAY_LOG(DEBUG) << "Server 2 NodeIp=" << data2->node_manager_address(); - RAY_LOG(DEBUG) << "Server 2 NodePort=" << data2->node_manager_port(); - ASSERT_EQ(node_id_2, NodeID::FromBinary(data2->node_id())); - } -}; - -/* TODO(ekl) this seems to be hanging 
occasionally on Linux -TEST_F(TestObjectManager, StartTestObjectManager) { - // TODO: Break this test suite into unit tests. - auto AsyncStartTests = main_service.wrap([this]() { WaitConnections(); }); - AsyncStartTests(); - main_service.run(); -} -*/ - -} // namespace ray - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - ray::TEST_STORE_EXEC_PATH = std::string(argv[1]); - wait_timeout_ms = std::stoi(std::string(argv[2])); - ray::TEST_GCS_SERVER_EXEC_PATH = std::string(argv[3]); - return RUN_ALL_TESTS(); -} diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/test/pull_manager_test.cc index 9230c87e9db9..345cc6ceadfe 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/test/pull_manager_test.cc @@ -10,13 +10,14 @@ namespace ray { using ::testing::ElementsAre; -class PullManagerTest : public ::testing::Test { +class PullManagerTestWithCapacity { public: - PullManagerTest() + PullManagerTestWithCapacity(size_t num_available_bytes) : self_node_id_(NodeID::FromRandom()), object_is_local_(false), num_send_pull_request_calls_(0), num_restore_spilled_object_calls_(0), + num_object_store_full_calls_(0), fake_time_(0), pull_manager_(self_node_id_, [this](const ObjectID &object_id) { return object_is_local_; }, @@ -28,17 +29,51 @@ class PullManagerTest : public ::testing::Test { num_restore_spilled_object_calls_++; restore_object_callback_ = callback; }, - [this]() { return fake_time_; }, 10000) {} + [this]() { return fake_time_; }, 10000, num_available_bytes, + [this]() { num_object_store_full_calls_++; }) {} + + void AssertNoLeaks() { + ASSERT_TRUE(pull_manager_.pull_request_bundles_.empty()); + ASSERT_TRUE(pull_manager_.object_pull_requests_.empty()); + ASSERT_TRUE(pull_manager_.active_object_pull_requests_.empty()); + // Most tests should not throw OOM. 
+ ASSERT_EQ(num_object_store_full_calls_, 0); + } NodeID self_node_id_; bool object_is_local_; int num_send_pull_request_calls_; int num_restore_spilled_object_calls_; + int num_object_store_full_calls_; std::function restore_object_callback_; double fake_time_; PullManager pull_manager_; }; +class PullManagerTest : public PullManagerTestWithCapacity, public ::testing::Test { + public: + PullManagerTest() : PullManagerTestWithCapacity(1) {} + + void AssertNumActiveRequestsEquals(size_t num_requests) { + ASSERT_EQ(pull_manager_.object_pull_requests_.size(), num_requests); + ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); + } +}; + +class PullManagerWithAdmissionControlTest : public PullManagerTestWithCapacity, + public ::testing::Test { + public: + PullManagerWithAdmissionControlTest() : PullManagerTestWithCapacity(10) {} + + void AssertNumActiveRequestsEquals(size_t num_requests) { + ASSERT_EQ(pull_manager_.active_object_pull_requests_.size(), num_requests); + } + + bool IsUnderCapacity(size_t num_bytes_requested) { + return num_bytes_requested <= pull_manager_.num_bytes_available_; + } +}; + std::vector CreateObjectRefs(int num_objs) { std::vector refs; for (int i = 0; i < num_objs; i++) { @@ -53,14 +88,14 @@ std::vector CreateObjectRefs(int num_objs) { TEST_F(PullManagerTest, TestStaleSubscription) { auto refs = CreateObjectRefs(1); auto oid = ObjectRefsToIds(refs)[0]; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(oid, client_ids, ""); + pull_manager_.OnLocationChange(oid, client_ids, "", 0); + AssertNumActiveRequestsEquals(1); // There are no client ids to pull from. 
ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -71,29 +106,30 @@ TEST_F(PullManagerTest, TestStaleSubscription) { ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(oid, client_ids, ""); + pull_manager_.OnLocationChange(oid, client_ids, "", 0); // Now we're getting a notification about an object that was already cancelled. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + + AssertNoLeaks(); } TEST_F(PullManagerTest, TestRestoreSpilledObject) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + AssertNumActiveRequestsEquals(1); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -101,7 +137,7 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { client_ids.insert(NodeID::FromRandom()); fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); // The behavior is supposed to be to always restore the spilled object if possible (even // if it exists elsewhere in the cluster). @@ -111,26 +147,27 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { // Don't restore an object if it's local. 
object_is_local_ = true; num_restore_spilled_object_calls_ = 0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + + AssertNoLeaks(); } TEST_F(PullManagerTest, TestRestoreObjectFailed) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; - pull_manager_.Pull(refs, &objects_to_locate); + auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + AssertNumActiveRequestsEquals(1); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -143,14 +180,14 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { ASSERT_EQ(num_restore_spilled_object_calls_, 1); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); // We always assume the restore succeeded so there's only 1 restore call still. 
ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 1); fake_time_ += 10.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 2); @@ -161,29 +198,32 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar"); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); // Now that we've successfully sent a pull request, we need to wait for the retry period // before sending another one. ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); + + pull_manager_.CancelPull(req_id); + AssertNoLeaks(); } TEST_F(PullManagerTest, TestManyUpdates) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (int i = 0; i < 100; i++) { - pull_manager_.OnLocationChange(obj1, client_ids, ""); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + AssertNumActiveRequestsEquals(1); } // Since no time has passed, only send a single pull request. 
@@ -192,25 +232,26 @@ TEST_F(PullManagerTest, TestManyUpdates) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + + AssertNoLeaks(); } TEST_F(PullManagerTest, TestRetryTimer) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; rpc::Address addr1; - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); // We need to call OnLocationChange at least once, to population the list of nodes with // the object. - pull_manager_.OnLocationChange(obj1, client_ids, ""); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + AssertNumActiveRequestsEquals(1); ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -220,7 +261,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // Location changes can trigger reset timer. for (; fake_time_ <= 120 * 10; fake_time_ += 1.) 
{ - pull_manager_.OnLocationChange(obj1, client_ids, ""); + pull_manager_.OnLocationChange(obj1, client_ids, "", 0); } // We should make a pull request every tick (even if it's a duplicate to a node we're @@ -238,55 +279,59 @@ TEST_F(PullManagerTest, TestRetryTimer) { auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, ObjectRefsToIds(refs)); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + + AssertNoLeaks(); } TEST_F(PullManagerTest, TestBasic) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); - ASSERT_EQ(num_send_pull_request_calls_, i + 1); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } + ASSERT_EQ(num_send_pull_request_calls_, oids.size()); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); + AssertNumActiveRequestsEquals(oids.size()); // Don't pull an object if it's local. object_is_local_ = true; num_send_pull_request_calls_ = 0; + fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); ASSERT_EQ(objects_to_cancel, oids); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); // Don't pull a remote object if we've canceled. 
object_is_local_ = false; num_send_pull_request_calls_ = 0; + fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); + + AssertNoLeaks(); } TEST_F(PullManagerTest, TestDeduplicateBundles) { auto refs = CreateObjectRefs(3); auto oids = ObjectRefsToIds(refs); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); std::vector objects_to_locate; auto req_id1 = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); - ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); objects_to_locate.clear(); auto req_id2 = pull_manager_.Pull(refs, &objects_to_locate); @@ -295,20 +340,21 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); - ASSERT_EQ(num_send_pull_request_calls_, i + 1); - ASSERT_EQ(num_restore_spilled_object_calls_, 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } + ASSERT_EQ(num_send_pull_request_calls_, oids.size()); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); + AssertNumActiveRequestsEquals(oids.size()); // Cancel one request. auto objects_to_cancel = pull_manager_.CancelPull(req_id1); ASSERT_TRUE(objects_to_cancel.empty()); // Objects should still be pulled because the other request is still open. 
- ASSERT_EQ(pull_manager_.NumActiveRequests(), oids.size()); + AssertNumActiveRequestsEquals(oids.size()); fake_time_ += 10; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); ASSERT_EQ(num_send_pull_request_calls_, i + 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -316,15 +362,191 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { // Cancel the other request. objects_to_cancel = pull_manager_.CancelPull(req_id2); ASSERT_EQ(objects_to_cancel, oids); - ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + AssertNumActiveRequestsEquals(0); // Don't pull a remote object if we've canceled. object_is_local_ = false; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, ""); + pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); + + AssertNoLeaks(); +} + +TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { + /// Test admission control for a single pull bundle request. We should + /// activate the request when we are under the reported capacity and + /// deactivate it when we are over. 
+ auto refs = CreateObjectRefs(3); + auto oids = ObjectRefsToIds(refs); + size_t object_size = 2; + AssertNumActiveRequestsEquals(0); + std::vector objects_to_locate; + auto req_id = pull_manager_.Pull(refs, &objects_to_locate); + ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + + std::unordered_set client_ids; + client_ids.insert(NodeID::FromRandom()); + for (size_t i = 0; i < oids.size(); i++) { + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + } + ASSERT_EQ(num_send_pull_request_calls_, oids.size()); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); + AssertNumActiveRequestsEquals(oids.size()); + ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); + + // Reduce the available memory. + ASSERT_EQ(num_object_store_full_calls_, 0); + pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size - 1); + AssertNumActiveRequestsEquals(0); + ASSERT_EQ(num_object_store_full_calls_, 1); + // No new pull requests after the next tick. + fake_time_ += 10; + auto prev_pull_requests = num_send_pull_request_calls_; + for (size_t i = 0; i < oids.size(); i++) { + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); + } + + // Increase the available memory again. + pull_manager_.UpdatePullsBasedOnAvailableMemory(oids.size() * object_size); + AssertNumActiveRequestsEquals(oids.size()); + ASSERT_TRUE(IsUnderCapacity(oids.size() * object_size)); + ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests + oids.size()); + + // OOM was not triggered a second time. + ASSERT_EQ(num_object_store_full_calls_, 1); + num_object_store_full_calls_ = 0; + + pull_manager_.CancelPull(req_id); + AssertNoLeaks(); +} + +TEST_F(PullManagerWithAdmissionControlTest, TestQueue) { + /// Test admission control for a queue of pull bundle requests. 
We should + /// activate as many requests as we can, subject to the reported capacity. + int object_size = 2; + int num_oids_per_request = 2; + int num_requests = 3; + + std::vector> bundles; + std::vector req_ids; + for (int i = 0; i < num_requests; i++) { + auto refs = CreateObjectRefs(num_oids_per_request); + auto oids = ObjectRefsToIds(refs); + std::vector objects_to_locate; + auto req_id = pull_manager_.Pull(refs, &objects_to_locate); + ASSERT_EQ(ObjectRefsToIds(objects_to_locate), oids); + + bundles.push_back(oids); + req_ids.push_back(req_id); + } + + std::unordered_set client_ids; + client_ids.insert(NodeID::FromRandom()); + for (auto &oids : bundles) { + for (size_t i = 0; i < oids.size(); i++) { + pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + } + } + + for (int capacity = 0; capacity < 20; capacity++) { + int num_requests_expected = + std::min(num_requests, capacity / (object_size * num_oids_per_request)); + pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); + + AssertNumActiveRequestsEquals(num_requests_expected * num_oids_per_request); + // The total requests that are active is under the specified capacity. + ASSERT_TRUE( + IsUnderCapacity(num_requests_expected * num_oids_per_request * object_size)); + // This is the maximum number of requests that can be served at once that + // is under the capacity. + if (num_requests_expected < num_requests) { + ASSERT_FALSE(IsUnderCapacity((num_requests_expected + 1) * num_oids_per_request * + object_size)); + } + // Check that OOM was triggered. + if (num_requests_expected == 0) { + ASSERT_EQ(num_object_store_full_calls_, 1); + } else { + ASSERT_EQ(num_object_store_full_calls_, 0); + } + num_object_store_full_calls_ = 0; + } + + for (auto req_id : req_ids) { + pull_manager_.CancelPull(req_id); + } + AssertNoLeaks(); +} + +TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { + /// Test admission control while requests are cancelled out-of-order. 
When an + /// active request is cancelled, we should activate another request in the + /// queue, if there is one that satisfies the reported capacity. + auto test_cancel = [&](std::vector object_sizes, int capacity, size_t cancel_idx, + int num_active_requests_expected_before, + int num_active_requests_expected_after) { + pull_manager_.UpdatePullsBasedOnAvailableMemory(capacity); + auto refs = CreateObjectRefs(object_sizes.size()); + auto oids = ObjectRefsToIds(refs); + std::vector req_ids; + for (auto &ref : refs) { + std::vector objects_to_locate; + auto req_id = pull_manager_.Pull({ref}, &objects_to_locate); + req_ids.push_back(req_id); + } + for (size_t i = 0; i < object_sizes.size(); i++) { + pull_manager_.OnLocationChange(oids[i], {}, "", object_sizes[i]); + } + AssertNumActiveRequestsEquals(num_active_requests_expected_before); + pull_manager_.CancelPull(req_ids[cancel_idx]); + AssertNumActiveRequestsEquals(num_active_requests_expected_after); + + // Request is really canceled. + pull_manager_.OnLocationChange(oids[cancel_idx], {NodeID::FromRandom()}, "", + object_sizes[cancel_idx]); + ASSERT_EQ(num_send_pull_request_calls_, 0); + + // The expected number of requests at the head of the queue are pulled. + int num_active = 0; + for (size_t i = 0; i < refs.size() && num_active < num_active_requests_expected_after; + i++) { + pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", + object_sizes[i]); + if (i != cancel_idx) { + num_active++; + } + } + ASSERT_EQ(num_send_pull_request_calls_, num_active_requests_expected_after); + + // Reset state. + for (size_t i = 0; i < req_ids.size(); i++) { + if (i != cancel_idx) { + pull_manager_.CancelPull(req_ids[i]); + } + } + num_send_pull_request_calls_ = 0; + }; + + // The next request in the queue is infeasible. If it is canceled, the + // request after that is activated. + test_cancel({1, 1, 2, 1}, 3, 2, 2, 3); + + // If an activated request is canceled, the next request is activated. 
+ test_cancel({1, 1, 2, 1}, 3, 0, 2, 2); + test_cancel({1, 1, 2, 1}, 3, 1, 2, 2); + + // Cancellation of requests at the end of the queue has no effect. + test_cancel({1, 1, 2, 1, 1}, 3, 3, 2, 2); + + // As many new requests as possible are activated when one is canceled. + test_cancel({1, 2, 1, 1, 1}, 3, 1, 2, 3); + + AssertNoLeaks(); } } // namespace ray diff --git a/src/ray/protobuf/core_worker.proto b/src/ray/protobuf/core_worker.proto index 799530d274e9..43a3a667407b 100644 --- a/src/ray/protobuf/core_worker.proto +++ b/src/ray/protobuf/core_worker.proto @@ -186,6 +186,7 @@ message GetObjectLocationsOwnerRequest { message GetObjectLocationsOwnerReply { repeated bytes node_ids = 1; + uint64 object_size = 2; } message KillActorRequest { diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index d0793c35ca13..a332a908159e 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -413,6 +413,8 @@ message ObjectLocationInfo { // For objects that have been spilled to external storage, the URL from which // they can be retrieved. string spilled_url = 3; + // The size of the object in bytes. + uint64 size = 4; } // A notification message about one object's locations being changed. @@ -423,6 +425,8 @@ message ObjectLocationChange { // The object has been spilled to this URL. This should be set xor the above // fields are set. string spilled_url = 3; + // The size of the object in bytes. + uint64 size = 4; } // A notification message about one node's resources being changed. diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index 35c86b3bedbe..eda00b806b26 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -272,6 +272,8 @@ message AddObjectLocationRequest { // The spilled URL that will be added to GCS Service. Either this or the node // ID should be set. string spilled_url = 3; + // The size of the object in bytes. 
+ uint64 size = 4; } message AddObjectLocationReply { diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index 59d4789f08c5..f4fd3d025fda 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -179,7 +179,7 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { created_object_id, it->second.owner_addresses[created_object_id], [this, task_id, reconstruction_attempt]( const ray::ObjectID &object_id, const std::unordered_set &nodes, - const std::string &spilled_url) { + const std::string &spilled_url, size_t object_size) { if (nodes.empty() && spilled_url.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index 199e4d51ee2d..8b5fd9d0e75c 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -58,9 +58,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), ""); + callback.second(object_id, std::unordered_set(), "", 0); } else { - callback.second(object_id, it->second, ""); + callback.second(object_id, it->second, "", 0); } } callbacks_.clear(); diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index 616e7348283b..bbae5bb144b0 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -185,8 +185,9 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { MOCK_METHOD1(AsyncGetAll, Status(const gcs::MultiItemCallback &callback)); - MOCK_METHOD3(AsyncAddLocation, Status(const ObjectID &object_id, const NodeID &node_id, - const gcs::StatusCallback 
&callback)); + MOCK_METHOD4(AsyncAddLocation, + Status(const ObjectID &object_id, const NodeID &node_id, + size_t object_size, const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, const gcs::StatusCallback &callback) { diff --git a/src/ray/test/run_object_manager_tests.sh b/src/ray/test/run_object_manager_tests.sh deleted file mode 100755 index ebb5eba223aa..000000000000 --- a/src/ray/test/run_object_manager_tests.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -# This needs to be run in the root directory. - -# Cause the script to exit if a single command fails. -set -e -set -x - -bazel build "//:object_manager_stress_test" "//:object_manager_test" "//:plasma_store_server" - -# Get the directory in which this script is executing. -SCRIPT_DIR="$(dirname "$0")" -RAY_ROOT="$SCRIPT_DIR/../../.." -# Makes $RAY_ROOT an absolute path. -RAY_ROOT="$(cd "$RAY_ROOT" && pwd)" -if [ -z "$RAY_ROOT" ] ; then - exit 1 -fi -# Ensure we're in the right directory. -if [ ! -d "$RAY_ROOT/python" ]; then - echo "Unable to find root Ray directory. Has this script moved?" - exit 1 -fi - -REDIS_MODULE="./bazel-bin/libray_redis_module.so" -LOAD_MODULE_ARGS=(--loadmodule "${REDIS_MODULE}") -STORE_EXEC="./bazel-bin/plasma_store_server" -GCS_SERVER_EXEC="./bazel-bin/gcs_server" - -# Allow cleanup commands to fail. -bazel run //:redis-cli -- -p 6379 shutdown || true -bazel run //:redis-cli -- -p 6380 shutdown || true -sleep 1s -bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6379 & -bazel run //:redis-server -- --loglevel warning "${LOAD_MODULE_ARGS[@]}" --port 6380 & -sleep 1s -# Run tests. -./bazel-bin/object_manager_stress_test $STORE_EXEC $GCS_SERVER_EXEC -sleep 1s -# Use timeout=1000ms for the Wait tests. 
-./bazel-bin/object_manager_test $STORE_EXEC 1000 $GCS_SERVER_EXEC -bazel run //:redis-cli -- -p 6379 shutdown -bazel run //:redis-cli -- -p 6380 shutdown From 4e01a9ec3831223fd024606d6c8547993fd5935b Mon Sep 17 00:00:00 2001 From: Nikita Vemuri Date: Thu, 21 Jan 2021 17:01:55 -0800 Subject: [PATCH 012/245] [Autoscaler] Ensure ubuntu is owner of docker host mount folder (#13579) * change ownership to ubuntu if root * use ssh user in cluster config * formatting Co-authored-by: Nikita Vemuri --- python/ray/autoscaler/_private/command_runner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/ray/autoscaler/_private/command_runner.py b/python/ray/autoscaler/_private/command_runner.py index f328d4fd6c1a..544e8b1077e4 100644 --- a/python/ray/autoscaler/_private/command_runner.py +++ b/python/ray/autoscaler/_private/command_runner.py @@ -632,8 +632,10 @@ def run_rsync_up(self, source, target, options=None): self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), target.lstrip("/")) + host_mount_location = os.path.dirname(host_destination.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {os.path.dirname(host_destination.rstrip('/'))}", + f"mkdir -p {host_mount_location} && chown -R " + f"{self.ssh_command_runner.ssh_user} {host_mount_location}", silent=is_rsync_silent()) self.ssh_command_runner.run_rsync_up( @@ -655,8 +657,10 @@ def run_rsync_down(self, source, target, options=None): host_source = os.path.join( self._get_docker_host_mount_location( self.ssh_command_runner.cluster_name), source.lstrip("/")) + host_mount_location = os.path.dirname(host_source.rstrip("/")) self.ssh_command_runner.run( - f"mkdir -p {os.path.dirname(host_source.rstrip('/'))}", + f"mkdir -p {host_mount_location} && chown -R " + f"{self.ssh_command_runner.ssh_user} {host_mount_location}", silent=is_rsync_silent()) if source[-1] == "/": source += "." 
From 1fbb752f4299666af1c29a2eb8f1798df369c228 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Fri, 22 Jan 2021 03:04:38 +0200 Subject: [PATCH 013/245] [autoscaler] remove worker_default_node_type that is useless. (#13588) --- doc/source/cluster/autoscaling.rst | 3 --- python/ray/autoscaler/_private/util.py | 8 -------- python/ray/autoscaler/aws/example-multi-node-type.yaml | 3 --- .../kubernetes/operator_configs/cluster_crd.yaml | 4 ---- .../kubernetes/operator_configs/example_cluster.yaml | 2 -- .../kubernetes/operator_configs/example_cluster2.yaml | 2 -- python/ray/autoscaler/ray-schema.json | 4 ---- .../ray/autoscaler/staroid/example-multi-node-type.yaml | 3 --- python/ray/operator/operator_utils.py | 1 - python/ray/tests/test_resource_demand_scheduler.py | 3 +-- 10 files changed, 1 insertion(+), 32 deletions(-) diff --git a/doc/source/cluster/autoscaling.rst b/doc/source/cluster/autoscaling.rst index e8d8f235d4e5..ecb7af15565a 100644 --- a/doc/source/cluster/autoscaling.rst +++ b/doc/source/cluster/autoscaling.rst @@ -111,9 +111,6 @@ An example of configuring multiple node types is as follows `(full example) None: if config["head_node_type"] not in config["available_node_types"]: raise ValueError( "`head_node_type` must be one of `available_node_types`.") - if "worker_default_node_type" not in config: - raise ValueError("You must specify `worker_default_node_type` if " - "`available_node_types is set.") - if (config["worker_default_node_type"] not in config[ - "available_node_types"]): - raise ValueError("`worker_default_node_type` must be one of " - "`available_node_types`.") def prepare_config(config): @@ -123,7 +116,6 @@ def rewrite_legacy_yaml_to_available_node_types( }, } config["head_node_type"] = NODE_TYPE_LEGACY_HEAD - config["worker_default_node_type"] = NODE_TYPE_LEGACY_WORKER return config diff --git a/python/ray/autoscaler/aws/example-multi-node-type.yaml b/python/ray/autoscaler/aws/example-multi-node-type.yaml index 56b5c1b78d2e..1a83b8cc6212 
100644 --- a/python/ray/autoscaler/aws/example-multi-node-type.yaml +++ b/python/ray/autoscaler/aws/example-multi-node-type.yaml @@ -55,9 +55,6 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand -# Specify the default type of the worker node (as configured above). -worker_default_node_type: cpu_16_spot - # The default settings for the head node. This will be merged with the per-node # type configs given above. head_node: diff --git a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml index 9e92d5d4f6bc..75a802b58d87 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/cluster_crd.yaml @@ -25,7 +25,6 @@ spec: required: - podTypes - headPodType - - workerDefaultPodType properties: maxWorkers: description: The maximum number of workers nodes to launch in addition to the @@ -4264,9 +4263,6 @@ spec: headPodType: description: Specifies the head node type. type: string - workerDefaultPodType: - description: Specifies the default worker node type. - type: string headStartRayCommands: description: Commands to start Ray on the head node. type: array diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml index bb4a71fcc203..8d2aa4561936 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster.yaml @@ -14,8 +14,6 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node - # Specify the default pod type for ray the worker nodes (as configured below). - workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. 
podTypes: - name: head-node diff --git a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml index e5e4ecf3197a..0c6eb604e1eb 100644 --- a/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml +++ b/python/ray/autoscaler/kubernetes/operator_configs/example_cluster2.yaml @@ -14,8 +14,6 @@ spec: idleTimeoutMinutes: 5 # Specify the pod type for the ray head node (as configured below). headPodType: head-node - # Specify the default pod type for ray the worker nodes (as configured below). - workerDefaultPodType: worker-nodes # Specify the allowed pod types for this ray cluster and the resources they provide. podTypes: - name: head-node diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 41a4a070832e..22b21b84cb66 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -254,10 +254,6 @@ "type": "string", "description": "If using multiple node types, specifies the head node type." }, - "worker_default_node_type": { - "type": "string", - "description": "If using multiple node types, specifies the default worker node type." - }, "head_node": { "type": "object", "description": "Provider-specific config for the head node, e.g. instance type." diff --git a/python/ray/autoscaler/staroid/example-multi-node-type.yaml b/python/ray/autoscaler/staroid/example-multi-node-type.yaml index 860bb6a87674..563e3a74c6e4 100644 --- a/python/ray/autoscaler/staroid/example-multi-node-type.yaml +++ b/python/ray/autoscaler/staroid/example-multi-node-type.yaml @@ -103,9 +103,6 @@ available_node_types: # Specify the node type of the head node (as configured above). head_node_type: cpu_4_ondemand -# Specify the default type of the worker node (as configured above). -worker_default_node_type: cpu_4_spot - # The default settings for the head node. 
This will be merged with the per-node # type configs given above. #head_node: diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 08926a723857..94d2a00cf34e 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -17,7 +17,6 @@ "upscalingSpeed": "upscaling_speed", "idleTimeoutMinutes": "idle_timeout_minutes", "headPodType": "head_node_type", - "workerDefaultPodType": "worker_default_node_type", "workerStartRayCommands": "worker_start_ray_commands", "headStartRayCommands": "head_start_ray_commands", "podTypes": "available_node_types" diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 3bfe28f7cc83..536cbe18bc5a 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -87,8 +87,7 @@ MULTI_WORKER_CLUSTER = dict( SMALL_CLUSTER, **{ "available_node_types": TYPES_A, - "head_node_type": "empty_node", - "worker_default_node_type": "m4.large", + "head_node_type": "empty_node" }) From 4ecd29ea2b4988dab3fc2395af4b04e0b864537c Mon Sep 17 00:00:00 2001 From: Xianyang Liu Date: Fri, 22 Jan 2021 12:10:01 +0800 Subject: [PATCH 014/245] [dashboard] Fixes dashboard issues when environments have set http_proxy (#12598) * fixes ray start with http_proxy * format * fixes * fixes * increase timeout * address comments --- dashboard/agent.py | 3 +- dashboard/head.py | 4 +- .../modules/logical_view/logical_view_head.py | 4 +- dashboard/modules/reporter/reporter_head.py | 4 +- .../stats_collector/stats_collector_head.py | 3 +- dashboard/tests/conftest.py | 57 +++++++++++++------ dashboard/tests/test_dashboard.py | 33 +++++++++++ dashboard/utils.py | 21 +++---- 8 files changed, 97 insertions(+), 32 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index f34024e545c7..7bf5e1551a2b 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -75,8 +75,9 @@ 
def __init__(self, logger.info("Dashboard agent grpc address: %s:%s", self.ip, self.grpc_port) self.aioredis_client = None + options = (("grpc.enable_http_proxy", 0), ) self.aiogrpc_raylet_channel = aiogrpc.insecure_channel( - f"{self.ip}:{self.node_manager_port}") + f"{self.ip}:{self.node_manager_port}", options=options) self.http_session = None def _load_modules(self): diff --git a/dashboard/head.py b/dashboard/head.py index e8e9119132d2..f1ef75ef478d 100644 --- a/dashboard/head.py +++ b/dashboard/head.py @@ -159,7 +159,9 @@ async def run(self): if not gcs_address: raise Exception("GCS address not found.") logger.info("Connect to GCS at %s", gcs_address) - channel = aiogrpc.insecure_channel(gcs_address) + options = (("grpc.enable_http_proxy", 0), ) + channel = aiogrpc.insecure_channel( + gcs_address, options=options) except Exception as ex: logger.error("Connect to GCS failed: %s, retry...", ex) await asyncio.sleep( diff --git a/dashboard/modules/logical_view/logical_view_head.py b/dashboard/modules/logical_view/logical_view_head.py index cf29db637da1..6b8e0bae1ecb 100644 --- a/dashboard/modules/logical_view/logical_view_head.py +++ b/dashboard/modules/logical_view/logical_view_head.py @@ -46,7 +46,9 @@ async def kill_actor(self, req) -> aiohttp.web.Response: except KeyError: return rest_response(success=False, message="Bad Request") try: - channel = aiogrpc.insecure_channel(f"{ip_address}:{port}") + options = (("grpc.enable_http_proxy", 0), ) + channel = aiogrpc.insecure_channel( + f"{ip_address}:{port}", options=options) stub = core_worker_pb2_grpc.CoreWorkerServiceStub(channel) await stub.KillActor( diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 8faef274d60c..2d84c6b65c21 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -38,7 +38,9 @@ async def _update_stubs(self, change): if change.new: node_id, ports = change.new ip = 
DataSource.node_id_to_ip[node_id] - channel = aiogrpc.insecure_channel(f"{ip}:{ports[1]}") + options = (("grpc.enable_http_proxy", 0), ) + channel = aiogrpc.insecure_channel( + f"{ip}:{ports[1]}", options=options) stub = reporter_pb2_grpc.ReporterServiceStub(channel) self._stubs[ip] = stub diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index ae75864e50ca..aa37e2e6e107 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -71,7 +71,8 @@ async def _update_stubs(self, change): node_id, node_info = change.new address = "{}:{}".format(node_info["nodeManagerAddress"], int(node_info["nodeManagerPort"])) - channel = aiogrpc.insecure_channel(address) + options = (("grpc.enable_http_proxy", 0), ) + channel = aiogrpc.insecure_channel(address, options=options) stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel) self._stubs[node_id] = stub diff --git a/dashboard/tests/conftest.py b/dashboard/tests/conftest.py index cb49e8bfc94a..ec893fbef252 100644 --- a/dashboard/tests/conftest.py +++ b/dashboard/tests/conftest.py @@ -1,17 +1,40 @@ -import os -import pytest -from ray.tests.conftest import * # noqa - - -@pytest.fixture -def enable_test_module(): - os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) - - -@pytest.fixture -def disable_aiohttp_cache(): - os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" - yield - os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) +import os +import pytest +from ray.tests.conftest import * # noqa + + +@pytest.fixture +def enable_test_module(): + os.environ["RAY_DASHBOARD_MODULE_TEST"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_MODULE_TEST", None) + + +@pytest.fixture +def disable_aiohttp_cache(): + os.environ["RAY_DASHBOARD_NO_CACHE"] = "true" + yield + os.environ.pop("RAY_DASHBOARD_NO_CACHE", None) + + +@pytest.fixture 
+def set_http_proxy(): + http_proxy = os.environ.get("http_proxy", None) + https_proxy = os.environ.get("https_proxy", None) + + # set http proxy + os.environ["http_proxy"] = "www.example.com:990" + os.environ["https_proxy"] = "www.example.com:990" + + yield + + # reset http proxy + if http_proxy: + os.environ["http_proxy"] = http_proxy + else: + del os.environ["http_proxy"] + + if https_proxy: + os.environ["https_proxy"] = https_proxy + else: + del os.environ["https_proxy"] diff --git a/dashboard/tests/test_dashboard.py b/dashboard/tests/test_dashboard.py index 1acc94a169fe..529e394613d0 100644 --- a/dashboard/tests/test_dashboard.py +++ b/dashboard/tests/test_dashboard.py @@ -571,5 +571,38 @@ def test_immutable_types(): print(d3[1]) +def test_http_proxy(enable_test_module, set_http_proxy, shutdown_only): + address_info = ray.init(num_cpus=1, include_dashboard=True) + assert (wait_until_server_available(address_info["webui_url"]) is True) + + webui_url = address_info["webui_url"] + webui_url = format_web_url(webui_url) + + timeout_seconds = 10 + start_time = time.time() + while True: + time.sleep(1) + try: + response = requests.get( + webui_url + "/test/dump", + proxies={ + "http": None, + "https": None + }) + response.raise_for_status() + try: + response.json() + assert response.ok + except Exception as ex: + logger.info("failed response: %s", response.text) + raise ex + break + except (AssertionError, requests.exceptions.ConnectionError) as e: + logger.info("Retry because of %s", e) + finally: + if time.time() > start_time + timeout_seconds: + raise Exception("Timed out while testing.") + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/dashboard/utils.py b/dashboard/utils.py index e1379eea8e14..5c347ed32a49 100644 --- a/dashboard/utils.py +++ b/dashboard/utils.py @@ -1,34 +1,35 @@ import abc -import os -import socket -import time import asyncio import collections -import json import datetime import functools import importlib 
import inspect +import json import logging +import os import pkgutil +import socket import traceback -from base64 import b64decode from abc import ABCMeta, abstractmethod -from collections.abc import MutableMapping, Mapping, Sequence +from base64 import b64decode from collections import namedtuple +from collections.abc import MutableMapping, Mapping, Sequence from typing import Any -import aioredis +import aiohttp.signals import aiohttp.web -import ray.new_dashboard.consts as dashboard_consts +import aioredis +import time from aiohttp import hdrs from aiohttp.frozenlist import FrozenList from aiohttp.typedefs import PathLike from aiohttp.web import RouteDef -import aiohttp.signals from google.protobuf.json_format import MessageToDict -from ray.utils import binary_to_hex + +import ray.new_dashboard.consts as dashboard_consts from ray.ray_constants import env_bool +from ray.utils import binary_to_hex try: create_task = asyncio.create_task From aa5d7a5e6c3aa6cb914978bb3bff542732be0fde Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Fri, 22 Jan 2021 12:18:34 +0800 Subject: [PATCH 015/245] [Dashboard]Don't set node actors when node_id of actor is Nil (#13573) * Don't set node actors when node_id of actor is Nil * add test per comment --- .../stats_collector/stats_collector_consts.py | 3 ++ .../stats_collector/stats_collector_head.py | 15 ++++--- .../tests/test_stats_collector.py | 44 +++++++++++++++++++ 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/dashboard/modules/stats_collector/stats_collector_consts.py b/dashboard/modules/stats_collector/stats_collector_consts.py index 55119cd75dfa..cdcbf6bd126d 100644 --- a/dashboard/modules/stats_collector/stats_collector_consts.py +++ b/dashboard/modules/stats_collector/stats_collector_consts.py @@ -1,5 +1,8 @@ +import ray + NODE_STATS_UPDATE_INTERVAL_SECONDS = 1 RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1 ACTOR_CHANNEL = "ACTOR" ERROR_INFO_UPDATE_INTERVAL_SECONDS = 5 LOG_INFO_UPDATE_INTERVAL_SECONDS = 5 
+NIL_NODE_ID = ray.NodeID.nil().hex() diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index aa37e2e6e107..e0b6cffa77b8 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -203,8 +203,10 @@ def _process_actor_table_data(data): node_id = actor_table_data["address"]["rayletId"] job_actors.setdefault(job_id, {})[actor_id] = actor_table_data - node_actors.setdefault(node_id, - {})[actor_id] = actor_table_data + # Update only when node_id is not Nil. + if node_id != stats_collector_consts.NIL_NODE_ID: + node_actors.setdefault( + node_id, {})[actor_id] = actor_table_data DataSource.job_actors.reset(job_actors) DataSource.node_actors.reset(node_actors) logger.info("Received %d actor info from GCS.", @@ -233,10 +235,11 @@ def _process_actor_table_data(data): node_id = actor_table_data["address"]["rayletId"] # Update actors. DataSource.actors[actor_id] = actor_table_data - # Update node actors. - node_actors = dict(DataSource.node_actors.get(node_id, {})) - node_actors[actor_id] = actor_table_data - DataSource.node_actors[node_id] = node_actors + # Update node actors (only when node_id is not Nil). + if node_id != stats_collector_consts.NIL_NODE_ID: + node_actors = dict(DataSource.node_actors.get(node_id, {})) + node_actors[actor_id] = actor_table_data + DataSource.node_actors[node_id] = node_actors # Update job actors. 
job_actors = dict(DataSource.job_actors.get(job_id, {})) job_actors[actor_id] = actor_table_data diff --git a/dashboard/modules/stats_collector/tests/test_stats_collector.py b/dashboard/modules/stats_collector/tests/test_stats_collector.py index bed6d650fc29..fcd1c42e3456 100644 --- a/dashboard/modules/stats_collector/tests/test_stats_collector.py +++ b/dashboard/modules/stats_collector/tests/test_stats_collector.py @@ -8,6 +8,8 @@ import pytest import ray import threading +import ray.new_dashboard.modules.stats_collector.stats_collector_consts \ + as stats_collector_consts from datetime import datetime, timedelta from ray.cluster_utils import Cluster from ray.new_dashboard.tests.conftest import * # noqa @@ -373,5 +375,47 @@ def check_errs(): check_errs, (AssertionError), timeout_ms=1000) +def test_nil_node(enable_test_module, disable_aiohttp_cache, + ray_start_with_dashboard): + assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) + is True) + webui_url = ray_start_with_dashboard["webui_url"] + assert wait_until_server_available(webui_url) + webui_url = format_web_url(webui_url) + + @ray.remote(num_gpus=1) + class InfeasibleActor: + pass + + infeasible_actor = InfeasibleActor.remote() # noqa + + timeout_seconds = 5 + start_time = time.time() + last_ex = None + while True: + time.sleep(1) + try: + resp = requests.get(f"{webui_url}/logical/actors") + resp_json = resp.json() + resp_data = resp_json["data"] + actors = resp_data["actors"] + assert len(actors) == 1 + response = requests.get(webui_url + "/test/dump?key=node_actors") + response.raise_for_status() + result = response.json() + assert stats_collector_consts.NIL_NODE_ID not in result["data"][ + "nodeActors"] + break + except Exception as ex: + last_ex = ex + finally: + if time.time() > start_time + timeout_seconds: + ex_stack = traceback.format_exception( + type(last_ex), last_ex, + last_ex.__traceback__) if last_ex else [] + ex_stack = "".join(ex_stack) + raise Exception(f"Timed out 
while testing, {ex_stack}") + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) From 39755fdb20c294507035b061a3f75d03f18c092c Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Thu, 21 Jan 2021 23:06:15 -0800 Subject: [PATCH 016/245] Revert "[Serve] Refactor BackendState" (#13626) This reverts commit 68038741ac2e1892db2456fed71083996613c884. --- python/ray/serve/backend_state.py | 533 ++++++++++++----------------- python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 - 4 files changed, 217 insertions(+), 327 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 4aad2671ea4e..673c4b2cfbc8 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,8 +1,7 @@ import asyncio +from asyncio.futures import Future from collections import defaultdict -from enum import Enum -import time -from typing import Dict, List, Optional, Tuple +from typing import Dict, Any, List, Optional, Set, Tuple import ray import ray.cloudpickle as pickle @@ -18,6 +17,7 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey +from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,150 +30,6 @@ _RESOURCE_CHECK_ENABLED = True -class ReplicaState(Enum): - SHOULD_START = 1 - STARTING = 2 - RUNNING = 3 - SHOULD_STOP = 4 - STOPPING = 5 - STOPPED = 6 - - -class BackendReplica: - def __init__(self, controller_name: str, detached: bool, - replica_tag: ReplicaTag, backend_tag: BackendTag): - self._actor_name = format_actor_name(replica_tag, controller_name) - self._controller_name = controller_name - self._detached = detached - self._replica_tag = replica_tag - self._backend_tag = backend_tag - self._actor_handle = None - 
self._startup_obj_ref = None - self._drain_obj_ref = None - self._state = ReplicaState.SHOULD_START - - def __get_state__(self): - clean_dict = self.__dict__.copy() - del clean_dict["_actor_handle"] - del clean_dict["_startup_obj_ref"] - del clean_dict["_drain_obj_ref"] - return clean_dict - - def __set_state__(self, d): - self.__dict__ = d - self._actor_handle = None - self._startup_obj_ref = None - self._drain_obj_ref = None - self._recover_from_checkpoint() - - def _recover_from_checkpoint(self): - if self._state == ReplicaState.STARTING: - # We do not need to pass in the class here because the actor - # creation has already been started if this class was checkpointed - # in the STARTING state. - self.start() - elif self._state == ReplicaState.RUNNING: - # Fetch actor handles for all backend replicas in the system. - # The actors must exist if this class was checkpointed in the - # RUNNING state. - self._actor_handle = ray.get_actor(self._actor_name) - elif self._state == ReplicaState.STOPPING: - self.stop() - - def start(self, backend_info: Optional[BackendInfo]): - assert self._state in { - ReplicaState.SHOULD_START, ReplicaState.STARTING - }, (f"State must be {ReplicaState.SHOULD_START} or " - f"{ReplicaState.STARTING}, *not* {self._state}") - try: - self._actor_handle = ray.get_actor(self._actor_name) - except ValueError: - logger.debug("Starting replica '{}' for backend '{}'.".format( - self._replica_tag, self._backend_tag)) - self._actor_handle = ray.remote(backend_info.worker_class).options( - name=self._actor_name, - lifetime="detached" if self._detached else None, - max_restarts=-1, - max_task_retries=-1, - **backend_info.replica_config.ray_actor_options).remote( - self._backend_tag, self._replica_tag, - backend_info.replica_config.actor_init_args, - backend_info.backend_config, self._controller_name) - self._startup_obj_ref = self._actor_handle.ready.remote() - self._state = ReplicaState.STARTING - - def check_started(self): - if self._state == 
ReplicaState.RUNNING: - return True - assert self._state == ReplicaState.STARTING, ( - f"State must be {ReplicaState.STARTING}, *not* {self._state}") - ready, _ = ray.wait([self._startup_obj_ref], timeout=0) - if len(ready) == 1: - self._state = ReplicaState.RUNNING - return True - return False - - def set_should_stop(self, graceful_shutdown_timeout_s: Duration): - self._state = ReplicaState.SHOULD_STOP - self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s - - def stop(self): - # We need to handle transitions from: - # SHOULD_START -> SHOULD_STOP -> STOPPING - # This means that the replica_handle may not have been created. - - assert self._state in { - ReplicaState.SHOULD_STOP, ReplicaState.STOPPING - }, (f"State must be {ReplicaState.SHOULD_STOP} or " - f"{ReplicaState.STOPPING}, *not* {self._state}") - - def drain_actor(actor_name): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(actor_name) - except ValueError: - return None - return replica.drain_pending_queries.remote() - - self._state = ReplicaState.STOPPING - self._drain_obj_ref = drain_actor(self._actor_name) - self._shutdown_deadline = time.time( - ) + self._graceful_shutdown_timeout_s - - def check_stopped(self): - if self._state == ReplicaState.STOPPED: - return True - assert self._state == ReplicaState.STOPPING, ( - f"State must be {ReplicaState.STOPPING}, *not* {self._state}") - - try: - replica = ray.get_actor(self._actor_name) - except ValueError: - self._state = ReplicaState.STOPPED - return True - - ready, _ = ray.wait([self._drain_obj_ref], timeout=0) - timeout_passed = time.time() > self._shutdown_deadline - - if len(ready) == 1 or timeout_passed: - if timeout_passed: - # Graceful period passed, kill it forcefully. 
- logger.debug( - f"{self._actor_name} did not shutdown after " - f"{self._graceful_shutdown_timeout_s}s, force-killing.") - - ray.kill(replica, no_restart=True) - self._state = ReplicaState.STOPPED - return True - return False - - def get_actor_handle(self): - assert self._state == ReplicaState.RUNNING, ( - f"State must be {ReplicaState.RUNNING}, *not* {self._state}") - return self._actor_handle - - class BackendState: """Manages all state for backends in the system. @@ -190,65 +46,79 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ - BackendReplica]]] = defaultdict(lambda: defaultdict(list)) - self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() - self._target_replicas: Dict[BackendTag, int] = defaultdict(int) - self.backend_goals: Dict[BackendTag, GoalId] = dict() + # Non-checkpointed state. + self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag, ActorHandle]] = dict() + self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ + BackendTag, ReplicaTag]] = dict() - # Un-Checkpointed state. - self.pending_goals: Dict[GoalId, asyncio.Event] = dict() + # Checkpointed state. 
+ self.backends: Dict[BackendTag, BackendInfo] = dict() + self.backend_replicas: Dict[BackendTag, Dict[ + ReplicaTag, ActorHandle]] = defaultdict(dict) + self.backend_goals: Dict[BackendTag, GoalId] = dict() + self.backend_replicas_to_start: Dict[BackendTag, List[ + ReplicaTag]] = defaultdict(list) + self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ + ReplicaTag, Duration]]] = defaultdict(list) + self.backends_to_remove: List[BackendTag] = list() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self._replicas, self._backend_metadata, self._target_replicas, - self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backend_to_remove, + pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) + # Fetch actor handles for all backend replicas in the system. + # All of these backend_replicas are guaranteed to already exist + # because they would not be written to a checkpoint in + # self.backend_replicas until they were created. 
+ for backend_tag, replica_dict in self.backend_replicas.items(): + for replica_tag in replica_dict.keys(): + replica_name = format_actor_name(replica_tag, + self._controller_name) + self.backend_replicas[backend_tag][ + replica_tag] = ray.get_actor(replica_name) + self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps((self._replicas, self._backend_metadata, - self._target_replicas, self.backend_goals, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps( + (self.backends, self.backend_replicas, self.backend_goals, + self.backend_replicas_to_start, self.backend_replicas_to_stop, + self.backends_to_remove, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) - def get_running_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return { - backend_tag: { - backend_replica._replica_tag: - backend_replica.get_actor_handle() - for backend_replica in state_to_replica_dict[ - ReplicaState.RUNNING] - } - for backend_tag, state_to_replica_dict in self._replicas.items() - } - def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in - self.get_running_replica_handles().items() + for backend_tag, replica_dict in self.backend_replicas.items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self._backend_metadata.items() + for tag, info in self.backends.items() } + def get_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return self.backend_replicas + def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return 
self._backend_metadata.get(backend_tag) + return self.backends.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -256,11 +126,7 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self._backend_metadata[backend_tag] = backend_info - self._target_replicas[ - backend_tag] = backend_info.backend_config.num_replicas - else: - self._target_replicas[backend_tag] = 0 + self.backends[backend_tag] = backend_info self.backend_goals[backend_tag] = new_goal_id @@ -270,25 +136,31 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self._backend_metadata.get(backend_tag) + backend_info = self.backends.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica_class = create_backend_replica( - replica_config.func_or_class) + backend_replica = create_backend_replica(replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica_class, + worker_class=backend_replica, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) + try: + self.scale_backend_replicas(backend_tag, + backend_config.num_replicas) + except RayServeException as e: + del self.backends[backend_tag] + raise e + # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. 
@@ -303,15 +175,20 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. We should validate that the # specified backend exists on the client. - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: return None + # Scale its replicas down to 0. + self.scale_backend_replicas(backend_tag, 0, force_kill) + + # Remove the backend's metadata. + del self.backends[backend_tag] + + # Add the intention to remove the backend from the routers. + self.backends_to_remove.append(backend_tag) + new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) - if force_kill: - self._backend_metadata[ - backend_tag].backend_config.\ - experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -320,18 +197,20 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self._backend_metadata: + if backend_tag not in self.backends: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self._backend_metadata[ - backend_tag].backend_config + stored_backend_config = self.backends[backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self._backend_metadata[backend_tag].backend_config = updated_config + self.backends[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self._backend_metadata[backend_tag]) + backend_tag, self.backends[backend_tag]) + + # Scale the replicas with the new configuration. 
+ self.scale_backend_replicas(backend_tag, updated_config.num_replicas) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -381,38 +260,31 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - ) -> bool: + num_replicas: int, + force_kill: bool = False, + ) -> None: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. - The caller is responsible for then first writing a checkpoint and then - actually starting/stopping the intended replicas. This avoids - inconsistencies with starting/stopping a replica and then crashing - before writing a checkpoint. + adds the intention to start/stop them to self.backend_replicas_to_start + and self.backend_replicas_to_stop. The caller is responsible for then + first writing a checkpoint and then actually starting/stopping the + intended replicas. This avoids inconsistencies with starting/stopping a + replica and then crashing before writing a checkpoint. 
""" - num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self._backend_metadata + assert (backend_tag in self.backends ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = sum([ - len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), - len(self._replicas[backend_tag][ReplicaState.STARTING]), - len(self._replicas[backend_tag][ReplicaState.RUNNING]), - ]) - + current_num_replicas = len(self.backend_replicas[backend_tag]) delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self._backend_metadata[backend_tag] - if delta_num_replicas == 0: - return False - - elif delta_num_replicas > 0: + backend_info: BackendInfo = self.backends[backend_tag] + if delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -420,11 +292,10 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - logger.error( + raise RayServeException( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. This is not a problem if the cluster is " - "autoscaling. To fix this, consider scaling to replica to " + "to be added. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -434,132 +305,154 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self._replicas[backend_tag][ReplicaState.SHOULD_START].append( - BackendReplica(self._controller_name, self._detached, - replica_tag, backend_tag)) + self.backend_replicas_to_start[backend_tag].append(replica_tag) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert self._target_replicas[backend_tag] >= delta_num_replicas - + assert len( + self.backend_replicas[backend_tag]) >= delta_num_replicas + replicas_copy = self.backend_replicas.copy() for _ in range(-delta_num_replicas): - replica_state_dict = self._replicas[backend_tag] - list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ - or replica_state_dict[ReplicaState.STARTING] \ - or replica_state_dict[ReplicaState.RUNNING] - - assert len(list_to_use), replica_state_dict - replica_to_stop = list_to_use.pop() + replica_tag, _ = replicas_copy[backend_tag].popitem() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - - replica_to_stop.set_should_stop(graceful_timeout_s) - self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( - replica_to_stop) - - return True - - def scale_all_backends(self): - checkpoint_needed = False - for backend_tag, num_replicas in list(self._target_replicas.items()): - checkpoint_needed = (checkpoint_needed - or self.scale_backend_replicas(backend_tag)) - if num_replicas == 0: - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if checkpoint_needed: - self._checkpoint() - - def _pop_replicas_of_state(self, state: ReplicaState - ) -> List[Tuple[ReplicaState, BackendTag]]: - replicas = [] - for backend_tag, state_to_replica_dict in self._replicas.items(): - if state in state_to_replica_dict: - replicas.extend( - (replica, backend_tag) - for replica in state_to_replica_dict.pop(state)) - - return replicas + if force_kill: + graceful_timeout_s = 0 + self.backend_replicas_to_stop[backend_tag].append(( + replica_tag, + graceful_timeout_s, + )) + + def _start_pending_replicas(self): + for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ + items(): + for replica_tag in replicas_to_create: + replica_handle = self._start_backend_replica( + backend_tag, replica_tag) + ready_future = replica_handle.ready.remote().as_future() + self.currently_starting_replicas[ready_future] = ( + backend_tag, replica_tag, replica_handle) + + def _stop_pending_replicas(self): + for backend_tag, replicas_to_stop in ( + self.backend_replicas_to_stop.items()): + for replica_tag, shutdown_timeout in replicas_to_stop: + replica_name = format_actor_name(replica_tag, + self._controller_name) + + async def kill_actor(replica_name_to_use): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. 
+ try: + replica = ray.get_actor(replica_name_to_use) + except ValueError: + return + + try: + await asyncio.wait_for( + replica.drain_pending_queries.remote(), + timeout=shutdown_timeout) + except asyncio.TimeoutError: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{replica_name_to_use} did not shutdown after " + f"{shutdown_timeout}s, killing.") + finally: + ray.kill(replica, no_restart=True) + + self.currently_stopping_replicas[asyncio.ensure_future( + kill_actor(replica_name))] = (backend_tag, replica_tag) + + async def _check_currently_starting_replicas(self) -> int: + """Returns the number of pending replicas waiting to start""" + in_flight: Set[Future[Any]] = set() + + if self.currently_starting_replicas: + done, in_flight = await asyncio.wait( + list(self.currently_starting_replicas.keys()), timeout=0) + for fut in done: + (backend_tag, replica_tag, + replica_handle) = self.currently_starting_replicas.pop(fut) + self.backend_replicas[backend_tag][ + replica_tag] = replica_handle + + backend = self.backend_replicas_to_start.get(backend_tag) + if backend: + try: + backend.remove(replica_tag) + except ValueError: + pass + if len(backend) == 0: + del self.backend_replicas_to_start[backend_tag] + + async def _check_currently_stopping_replicas(self) -> int: + """Returns the number of replicas waiting to stop""" + in_flight: Set[Future[Any]] = set() + + if self.currently_stopping_replicas: + done_stopping, in_flight = await asyncio.wait( + list(self.currently_stopping_replicas.keys()), timeout=0) + for fut in done_stopping: + (backend_tag, + replica_tag) = self.currently_stopping_replicas.pop(fut) + + backend_to_stop = self.backend_replicas_to_stop.get( + backend_tag) + + if backend_to_stop: + try: + backend_to_stop.remove(replica_tag) + except ValueError: + pass + if len(backend_to_stop) == 0: + del self.backend_replicas_to_stop[backend_tag] + + backend = self.backend_replicas.get(backend_tag) + if backend: + try: + del backend[replica_tag] + 
except KeyError: + pass + + if len(self.backend_replicas[backend_tag]) == 0: + del self.backend_replicas[backend_tag] def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self._replicas.keys()).union( - set(self._backend_metadata.keys())) + all_tags = set(self.backend_replicas.keys()).union( + set(self.backends.keys())) for backend_tag in all_tags: - desired_num_replicas = self._target_replicas.get(backend_tag) - state_dict = self._replicas.get(backend_tag, {}) - existing_info = state_dict.get(ReplicaState.RUNNING, []) - - # If we have pending ops, the current goal is *not* ready - if (state_dict.get(ReplicaState.SHOULD_START) - or state_dict.get(ReplicaState.STARTING) - or state_dict.get(ReplicaState.SHOULD_STOP) - or state_dict.get(ReplicaState.STOPPING)): - continue - - # TODO(ilr): FIX + desired_info = self.backends.get(backend_tag) + existing_info = self.backend_replicas.get(backend_tag) # Check for deleting - if (not desired_num_replicas or - desired_num_replicas == 0) and \ + if (not desired_info or + desired_info.backend_config.num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + completed_goals.append(self.backend_goals.get(backend_tag)) # Check for a non-zero number of backends - if (desired_num_replicas and existing_info) \ - and desired_num_replicas == len(existing_info): - completed_goals.append( - self.backend_goals.pop(backend_tag, None)) + if desired_info and existing_info and desired_info.backend_config.\ + num_replicas == len(existing_info): + completed_goals.append(self.backend_goals.get(backend_tag)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: - self.scale_all_backends() - for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_START): - 
replica_state.start(self._backend_metadata[backend_tag]) - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.SHOULD_STOP): - replica_state.stop() - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - transition_triggered = False - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STARTING): - if replica_state.check_started(): - self._replicas[backend_tag][ReplicaState.RUNNING].append( - replica_state) - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STARTING].append( - replica_state) - - for replica_state, backend_tag in self._pop_replicas_of_state( - ReplicaState.STOPPING): - if replica_state.check_stopped(): - transition_triggered = True - else: - self._replicas[backend_tag][ReplicaState.STOPPING].append( - replica_state) - - for backend_tag in list(self._replicas.keys()): - if not any(self._replicas[backend_tag]): - del self._replicas[backend_tag] - del self._backend_metadata[backend_tag] - del self._target_replicas[backend_tag] - - if transition_triggered: + self._start_pending_replicas() + self._stop_pending_replicas() + + num_starting = len(self.currently_starting_replicas) + num_stopping = len(self.currently_stopping_replicas) + + await self._check_currently_starting_replicas() + await self._check_currently_stopping_replicas() + + if (len(self.currently_starting_replicas) != num_starting) or \ + (len(self.currently_stopping_replicas) != num_stopping): self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 41a1eca08ae8..205af81b065a 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator +from pydantic import 
BaseModel, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 + experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index b5c65111a8f9..a3c75c711878 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_running_replica_handles() + return self.backend_state.get_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_running_replica_handles( + for replica_dict in self.backend_state.get_replica_handles( ).values(): for replica in replica_dict.values(): ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index a35f7e54b361..202b01386059 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,9 +683,6 @@ def f(): client.create_endpoint("endpoint", backend="backend") -# This error is only printed because creation is run in the control loop, not -# in the API path. 
-@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance From 00c14ce4a414582987a0cdfd29df67ba38a68058 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 22 Jan 2021 00:31:33 -0800 Subject: [PATCH 017/245] [Object Spilling] Skip flaky tests (#13628) * skip flaky tests * lint * skip one more * fix --- python/ray/tests/test_object_spilling.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 745eb3bafc1d..8319dbfcac54 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -343,7 +343,9 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") + platform.system() in ["Windows", "Darwin"], + reason="Failing on " + "Windows and Mac.") def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" @@ -393,7 +395,9 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") + platform.system() in ["Windows", "Darwin"], + reason="Failing on Windows " + "and Mac.") def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. temp_folder = tmp_path / "spill" @@ -541,6 +545,7 @@ def is_dir_empty(): wait_for_condition(is_dir_empty) +@pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") def test_fusion_objects(tmp_path, shutdown_only): # Limit our object store to 75 MiB of memory. 
temp_folder = tmp_path / "spill" From 90f1e408def3c29a91cf6ecdadc7de93b503fb99 Mon Sep 17 00:00:00 2001 From: Kai Yang Date: Fri, 22 Jan 2021 17:55:00 +0800 Subject: [PATCH 018/245] [Java] Add `fetchLocal` parameter in `Ray.wait()` (#13604) --- java/api/src/main/java/io/ray/api/Ray.java | 31 ++++++++++++---- .../java/io/ray/api/runtime/RayRuntime.java | 11 ++++-- .../io/ray/runtime/AbstractRayRuntime.java | 5 +-- .../runtime/object/LocalModeObjectStore.java | 3 +- .../ray/runtime/object/NativeObjectStore.java | 7 ++-- .../io/ray/runtime/object/ObjectStore.java | 23 ++++++++---- java/test.sh | 9 +++++ java/test/pom.xml | 35 ------------------- .../main/java/io/ray/test/PlasmaFreeTest.java | 2 +- .../io/ray/test/ReferenceCountingTest.java | 2 +- .../java/io_ray_runtime_RayNativeRuntime.h | 2 +- .../io_ray_runtime_object_NativeObjectStore.h | 6 ++-- .../io_ray_runtime_task_NativeTaskExecutor.h | 19 ---------- .../io_ray_runtime_task_NativeTaskSubmitter.h | 10 +++--- 14 files changed, 78 insertions(+), 87 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/Ray.java b/java/api/src/main/java/io/ray/api/Ray.java index da9047a66075..fb71a3bacbdf 100644 --- a/java/api/src/main/java/io/ray/api/Ray.java +++ b/java/api/src/main/java/io/ray/api/Ray.java @@ -87,6 +87,24 @@ public static List get(List> objectList) { return internal().get(objectList); } + /** + * Wait for a list of RayObjects to be available, until specified number of objects are ready, or + * specified timeout has passed. + * + * @param waitList A list of object references to wait for. + * @param numReturns The number of objects that should be returned. + * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @param fetchLocal If true, wait for the object to be downloaded onto the local node before + * returning it as ready. 
If false, ray.wait() will not trigger fetching of objects to the + * local node and will return immediately once the object is available anywhere in the + * cluster. + * @return Two lists, one containing locally available objects, one containing the rest. + */ + public static WaitResult wait( + List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { + return internal().wait(waitList, numReturns, timeoutMs, fetchLocal); + } + /** * Wait for a list of RayObjects to be locally available, until specified number of objects are * ready, or specified timeout has passed. @@ -97,30 +115,29 @@ public static List get(List> objectList) { * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns, int timeoutMs) { - return internal().wait(waitList, numReturns, timeoutMs); + return wait(waitList, numReturns, timeoutMs, true); } /** - * A convenient helper method for Ray.wait. It will wait infinitely until specified number of - * objects are locally available. + * Wait for a list of RayObjects to be locally available, until specified number of objects are + * ready. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. * @return Two lists, one containing locally available objects, one containing the rest. */ public static WaitResult wait(List> waitList, int numReturns) { - return internal().wait(waitList, numReturns, Integer.MAX_VALUE); + return wait(waitList, numReturns, Integer.MAX_VALUE); } /** - * A convenient helper method for Ray.wait. It will wait infinitely until all objects are locally - * available. + * Wait for a list of RayObjects to be locally available. * * @param waitList A list of object references to wait for. * @return Two lists, one containing locally available objects, one containing the rest. 
*/ public static WaitResult wait(List> waitList) { - return internal().wait(waitList, waitList.size(), Integer.MAX_VALUE); + return wait(waitList, waitList.size()); } /** diff --git a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java index 53da3d48dae8..ac5f44f3f139 100644 --- a/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java +++ b/java/api/src/main/java/io/ray/api/runtime/RayRuntime.java @@ -53,15 +53,20 @@ public interface RayRuntime { List get(List> objectRefs); /** - * Wait for a list of RayObjects to be locally available, until specified number of objects are - * ready, or specified timeout has passed. + * Wait for a list of RayObjects to be available, until specified number of objects are ready, or + * specified timeout has passed. * * @param waitList A list of ObjectRef to wait for. * @param numReturns The number of objects that should be returned. * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @param fetchLocal If true, wait for the object to be downloaded onto the local node before + * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the + * local node and will return immediately once the object is available anywhere in the + * cluster. * @return Two lists, one containing locally available objects, one containing the rest. */ - WaitResult wait(List> waitList, int numReturns, int timeoutMs); + WaitResult wait( + List> waitList, int numReturns, int timeoutMs, boolean fetchLocal); /** * Free a list of objects from Plasma Store. 
diff --git a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java index f3478e4c6c68..15d9e9d76a53 100644 --- a/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java +++ b/java/runtime/src/main/java/io/ray/runtime/AbstractRayRuntime.java @@ -105,8 +105,9 @@ public void free(List> objectRefs, boolean localOnly) { } @Override - public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { - return objectStore.wait(waitList, numReturns, timeoutMs); + public WaitResult wait( + List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { + return objectStore.wait(waitList, numReturns, timeoutMs, fetchLocal); } @Override diff --git a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java index e1bfc64faa62..cb5752d00a81 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/LocalModeObjectStore.java @@ -60,7 +60,8 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait(List objectIds, int numObjects, long timeoutMs) { + public List wait( + List objectIds, int numObjects, long timeoutMs, boolean fetchLocal) { waitInternal(objectIds, numObjects, timeoutMs); return objectIds.stream().map(pool::containsKey).collect(Collectors.toList()); } diff --git a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java index 24dd5b8a2699..c68709e10e68 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/NativeObjectStore.java @@ -45,8 +45,9 @@ public List getRaw(List objectIds, long timeoutMs) { } @Override - public List wait(List objectIds, int numObjects, long timeoutMs) { - return 
nativeWait(toBinaryList(objectIds), numObjects, timeoutMs); + public List wait( + List objectIds, int numObjects, long timeoutMs, boolean fetchLocal) { + return nativeWait(toBinaryList(objectIds), numObjects, timeoutMs, fetchLocal); } @Override @@ -113,7 +114,7 @@ private static List toBinaryList(List ids) { private static native List nativeGet(List ids, long timeoutMs); private static native List nativeWait( - List objectIds, int numObjects, long timeoutMs); + List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); private static native void nativeDelete(List objectIds, boolean localOnly); diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java index 8711811b24ad..5e7b626033a2 100644 --- a/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java +++ b/java/runtime/src/main/java/io/ray/runtime/object/ObjectStore.java @@ -117,25 +117,36 @@ public List get(List ids, Class elementType) { } /** - * Wait for a list of objects to appear in the object store. + * Wait for a list of RayObjects to be available, until specified number of objects are ready, or + * specified timeout has passed. * * @param objectIds IDs of the objects to wait for. * @param numObjects Number of objects that should appear. * @param timeoutMs Timeout in milliseconds, wait infinitely if it's negative. + * @param fetchLocal If true, wait for the object to be downloaded onto the local node before + * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the + * local node and will return immediately once the object is available anywhere in the + * cluster. * @return A bitset that indicates each object has appeared or not. 
*/ - public abstract List wait(List objectIds, int numObjects, long timeoutMs); + public abstract List wait( + List objectIds, int numObjects, long timeoutMs, boolean fetchLocal); /** - * Wait for a list of RayObjects to be locally available, until specified number of objects are - * ready, or specified timeout has passed. + * Wait for a list of RayObjects to be available, until specified number of objects are ready, or + * specified timeout has passed. * * @param waitList A list of object references to wait for. * @param numReturns The number of objects that should be returned. * @param timeoutMs The maximum time in milliseconds to wait before returning. + * @param fetchLocal If true, wait for the object to be downloaded onto the local node before + * returning it as ready. If false, ray.wait() will not trigger fetching of objects to the + * local node and will return immediately once the object is available anywhere in the + * cluster. * @return Two lists, one containing locally available objects, one containing the rest. */ - public WaitResult wait(List> waitList, int numReturns, int timeoutMs) { + public WaitResult wait( + List> waitList, int numReturns, int timeoutMs, boolean fetchLocal) { Preconditions.checkNotNull(waitList); if (waitList.isEmpty()) { return new WaitResult<>(Collections.emptyList(), Collections.emptyList()); @@ -144,7 +155,7 @@ public WaitResult wait(List> waitList, int numReturns, int t List ids = waitList.stream().map(ref -> ((ObjectRefImpl) ref).getId()).collect(Collectors.toList()); - List ready = wait(ids, numReturns, timeoutMs); + List ready = wait(ids, numReturns, timeoutMs, fetchLocal); List> readyList = new ArrayList<>(); List> unreadyList = new ArrayList<>(); diff --git a/java/test.sh b/java/test.sh index 8336c1da1c5f..f946fd91ad6f 100755 --- a/java/test.sh +++ b/java/test.sh @@ -41,6 +41,15 @@ bazel build //java:gen_maven_deps echo "Build test jar." bazel build //java:all_tests_deploy.jar +java/generate_jni_header_files.sh + +if ! 
git diff --exit-code -- java src/ray/core_worker/lib/java; then + echo "Files are changed after build. Common cases are:" + echo " * Java native methods doesn't match JNI files. You need to either update Java code or JNI code." + echo " * pom_template.xml and pom.xml doesn't match. You need to either update pom_template.xml or pom.xml." + exit 1 +fi + # Enable multi-worker feature in Java test TEST_ARGS=(-Dray.job.num-java-workers-per-process=10) diff --git a/java/test/pom.xml b/java/test/pom.xml index c9e34821b544..f401f3cff5ab 100644 --- a/java/test/pom.xml +++ b/java/test/pom.xml @@ -117,41 +117,6 @@ - - - com.diffplug.spotless - spotless-maven-plugin - 2.6.1 - - - - - - - - - .java - - - - - - - - true - 4 - - - - - - - 1.7 - - - - - diff --git a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java index 3e49ff798630..b8235b8d84fa 100644 --- a/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java +++ b/java/test/src/main/java/io/ray/test/PlasmaFreeTest.java @@ -25,7 +25,7 @@ public void testDeleteObjects() { () -> !TestUtils.getRuntime() .getObjectStore() - .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0) + .wait(ImmutableList.of(((ObjectRefImpl) helloId).getId()), 1, 0, true) .get(0), 50); if (TestUtils.isSingleProcessMode()) { diff --git a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java index aa56581951e6..a98f9595914b 100644 --- a/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java +++ b/java/test/src/main/java/io/ray/test/ReferenceCountingTest.java @@ -119,7 +119,7 @@ private static void fillObjectStoreAndGet( TestUtils.getRuntime().getObjectStore().getRaw(ImmutableList.of(objectId), Long.MAX_VALUE); } else { List result = - TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 100); + TestUtils.getRuntime().getObjectStore().wait(ImmutableList.of(objectId), 1, 100, 
true); Assert.assertFalse(result.get(0)); } } diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h index 69c05cf9315f..daa4e05a9300 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_RayNativeRuntime.h @@ -25,7 +25,7 @@ extern "C" { * Class: io_ray_runtime_RayNativeRuntime * Method: nativeInitialize * Signature: - * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;)V + * (ILjava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/String;[BLio/ray/runtime/gcs/GcsClientOptions;ILjava/lang/String;Ljava/util/Map;[B)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_RayNativeRuntime_nativeInitialize( JNIEnv *, jclass, jint, jstring, jint, jstring, jstring, jstring, jbyteArray, jobject, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h index b1da06e57068..fd194de55701 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_object_NativeObjectStore.h @@ -52,7 +52,7 @@ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeGet /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeWait - * Signature: (Ljava/util/List;IJ)Ljava/util/List; + * Signature: (Ljava/util/List;IJZ)Ljava/util/List; */ JNIEXPORT jobject JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeWait( JNIEnv *, jclass, jobject, jint, jlong, jboolean); @@ -68,7 +68,7 @@ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeDelete /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeAddLocalReference - * Signature: ([B)V + * Signature: ([B[B)V */ JNIEXPORT void JNICALL 
Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, jclass, @@ -78,7 +78,7 @@ Java_io_ray_runtime_object_NativeObjectStore_nativeAddLocalReference(JNIEnv *, j /* * Class: io_ray_runtime_object_NativeObjectStore * Method: nativeRemoveLocalReference - * Signature: ([B)V + * Signature: ([B[B)V */ JNIEXPORT void JNICALL Java_io_ray_runtime_object_NativeObjectStore_nativeRemoveLocalReference(JNIEnv *, jclass, diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h index bf376aa12e64..ab7ec077d453 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskExecutor.h @@ -21,25 +21,6 @@ #ifdef __cplusplus extern "C" { #endif -#undef io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP -#define io_ray_runtime_task_NativeTaskExecutor_NUM_ACTOR_CHECKPOINTS_TO_KEEP 20L -/* - * Class: io_ray_runtime_task_NativeTaskExecutor - * Method: nativePrepareCheckpoint - * Signature: ()[B - */ -JNIEXPORT jbyteArray JNICALL -Java_io_ray_runtime_task_NativeTaskExecutor_nativePrepareCheckpoint(JNIEnv *, jclass); - -/* - * Class: io_ray_runtime_task_NativeTaskExecutor - * Method: nativeNotifyActorResumedFromCheckpoint - * Signature: ([B)V - */ -JNIEXPORT void JNICALL -Java_io_ray_runtime_task_NativeTaskExecutor_nativeNotifyActorResumedFromCheckpoint( - JNIEnv *, jclass, jbyteArray); - #ifdef __cplusplus } #endif diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h index 8ea517b60cf9..d57e2d573188 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.h @@ -74,13 +74,13 @@ Java_io_ray_runtime_task_NativeTaskSubmitter_nativeRemovePlacementGroup(JNIEnv * /* * 
Class: io_ray_runtime_task_NativeTaskSubmitter * Method: nativeWaitPlacementGroupReady - * Signature: (J)Z + * Signature: ([BI)Z */ JNIEXPORT jboolean JNICALL -Java_io_ray_runtime_task_NativeTaskSubmitter__nativeWaitPlacementGroupReady(JNIEnv *, - jclass, - jbyteArray, - jint); +Java_io_ray_runtime_task_NativeTaskSubmitter_nativeWaitPlacementGroupReady(JNIEnv *, + jclass, + jbyteArray, + jint); #ifdef __cplusplus } From da5928304a22d81b3623302767c4a89da4391e8f Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Fri, 22 Jan 2021 09:59:20 -0800 Subject: [PATCH 019/245] [Metrics] Cache metrics ports in a file at each node (#13501) * cache metric ports in a file at each node * remove old assignment of export port * lint * lint * move e2e test to top of file to avoid shutdown bug --- python/ray/node.py | 64 ++++++++++++-- python/ray/tests/test_metrics_agent.py | 114 ++++++++++++++----------- 2 files changed, 118 insertions(+), 60 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index 425965021240..186ae3dfdbfd 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -13,6 +13,9 @@ import tempfile import time +from typing import Optional, Dict +from collections import defaultdict + import ray import ray.ray_constants as ray_constants import ray._private.services @@ -121,18 +124,10 @@ def __init__(self, self._raylet_ip_address = raylet_ip_address - self.metrics_agent_port = (ray_params.metrics_agent_port - or self._get_unused_port()[0]) - self._metrics_export_port = ray_params.metrics_export_port - if self._metrics_export_port is None: - self._metrics_export_port = self._get_unused_port()[0] - ray_params.update_if_absent( include_log_monitor=True, resources={}, temp_dir=ray.utils.get_ray_temp_dir(), - metrics_agent_port=self.metrics_agent_port, - metrics_export_port=self._metrics_export_port, worker_path=os.path.join( os.path.dirname(os.path.abspath(__file__)), "workers/default_worker.py")) @@ -190,6 +185,15 @@ def __init__(self, 
self._raylet_socket_name = self._prepare_socket_file( self._ray_params.raylet_socket_name, default_prefix="raylet") + self.metrics_agent_port = self._get_cached_port( + "metrics_agent_port", default_port=ray_params.metrics_agent_port) + self._metrics_export_port = self._get_cached_port( + "metrics_export_port", default_port=ray_params.metrics_export_port) + + ray_params.update_if_absent( + metrics_agent_port=self.metrics_agent_port, + metrics_export_port=self._metrics_export_port) + if head: ray_params.update_if_absent(num_redis_shards=1) self._webui_url = None @@ -555,6 +559,50 @@ def _prepare_socket_file(self, socket_path, default_prefix): "{} bytes: {!r}".format(maxlen, result)) return result + def _get_cached_port(self, + port_name: str, + default_port: Optional[int] = None) -> int: + """Get a port number from a cache on this node. + + Different driver processes on a node should use the same ports for + some purposes, e.g. exporting metrics. This method returns a port + number for the given port name and caches it in a file. If the + port isn't already cached, an unused port is generated and cached. + + Args: + port_name (str): the name of the port, e.g. metrics_export_port + default_port (Optional[int]): The port to return and cache if no + port has already been cached for the given port_name. If None, an + unused port is generated and cached. + Returns: + port (int): the port number. + """ + file_path = os.path.join(self.get_session_dir_path(), + "ports_by_node.json") + + # Maps a Node.unique_id to a dict that maps port names to port numbers. + ports_by_node: Dict[str, Dict[str, int]] = defaultdict(dict) + + if not os.path.exists(file_path): + with open(file_path, "w") as f: + json.dump({}, f) + + with open(file_path, "r") as f: + ports_by_node.update(json.load(f)) + + if (self.unique_id in ports_by_node + and port_name in ports_by_node[self.unique_id]): + # The port has already been cached at this node, so use it. 
+ port = int(ports_by_node[self.unique_id][port_name]) + else: + # Pick a new port to use and cache it at this node. + port = (default_port or self._get_unused_port()[0]) + ports_by_node[self.unique_id][port_name] = port + with open(file_path, "w") as f: + json.dump(ports_by_node, f) + + return port + def start_reaper_process(self): """ Start the reaper process. diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index b52f472efc26..86670b8a32cc 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -15,54 +15,6 @@ from ray.test_utils import wait_for_condition, SignalActor, fetch_prometheus -def test_prometheus_file_based_service_discovery(ray_start_cluster): - # Make sure Prometheus service discovery file is correctly written - # when number of nodes are dynamically changed. - NUM_NODES = 5 - cluster = ray_start_cluster - nodes = [cluster.add_node() for _ in range(NUM_NODES)] - cluster.wait_for_nodes() - addr = ray.init(address=cluster.address) - redis_address = addr["redis_address"] - writer = PrometheusServiceDiscoveryWriter( - redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") - - def get_metrics_export_address_from_node(nodes): - return [ - "{}:{}".format(node.node_ip_address, node.metrics_export_port) - for node in nodes - ] - - loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - # Let's update nodes. - for _ in range(3): - nodes.append(cluster.add_node()) - - # Make sure service discovery file content is correctly updated. 
- loaded_json_data = json.loads(writer.get_file_discovery_content())[0] - assert (set(get_metrics_export_address_from_node(nodes)) == set( - loaded_json_data["targets"])) - - -@pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") -def test_prome_file_discovery_run_by_dashboard(shutdown_only): - ray.init(num_cpus=0) - global_node = ray.worker._global_node - temp_dir = global_node.get_temp_dir_path() - - def is_service_discovery_exist(): - for path in pathlib.Path(temp_dir).iterdir(): - if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): - return True - return False - - wait_for_condition(is_service_discovery_exist) - - @pytest.fixture def _setup_cluster_for_test(ray_start_cluster): NUM_NODES = 2 @@ -76,6 +28,10 @@ def _setup_cluster_for_test(ray_start_cluster): worker_should_exit = SignalActor.remote() + # Generate a metric in the driver. + counter = Count("test_driver_counter", description="desc") + counter.record(1) + # Generate some metrics from actor & tasks. @ray.remote def f(): @@ -132,19 +88,25 @@ def test_cases(): for components in components_dict.values()) # Make sure our user defined metrics exist - for metric_name in ["test_counter", "test_histogram"]: + for metric_name in [ + "test_counter", "test_histogram", "test_driver_counter" + ]: assert any(metric_name in full_name for full_name in metric_names) # Make sure GCS server metrics are recorded. 
assert "ray_outbound_heartbeat_size_kb_sum" in metric_names - # Make sure the numeric value is correct + # Make sure the numeric values are correct test_counter_sample = [ m for m in metric_samples if "test_counter" in m.name ][0] assert test_counter_sample.value == 1.0 - # Make sure the numeric value is correct + test_driver_counter_sample = [ + m for m in metric_samples if "test_driver_counter" in m.name + ][0] + assert test_driver_counter_sample.value == 1.0 + test_histogram_samples = [ m for m in metric_samples if "test_histogram" in m.name ] @@ -178,10 +140,58 @@ def wrap_test_case_for_retry(): ) except RuntimeError: print( - f"The compoenents are {pformat(fetch_prometheus(prom_addresses))}") + f"The components are {pformat(fetch_prometheus(prom_addresses))}") test_cases() # Should fail assert +def test_prometheus_file_based_service_discovery(ray_start_cluster): + # Make sure Prometheus service discovery file is correctly written + # when number of nodes are dynamically changed. + NUM_NODES = 5 + cluster = ray_start_cluster + nodes = [cluster.add_node() for _ in range(NUM_NODES)] + cluster.wait_for_nodes() + addr = ray.init(address=cluster.address) + redis_address = addr["redis_address"] + writer = PrometheusServiceDiscoveryWriter( + redis_address, ray.ray_constants.REDIS_DEFAULT_PASSWORD, "/tmp/ray") + + def get_metrics_export_address_from_node(nodes): + return [ + "{}:{}".format(node.node_ip_address, node.metrics_export_port) + for node in nodes + ] + + loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + # Let's update nodes. + for _ in range(3): + nodes.append(cluster.add_node()) + + # Make sure service discovery file content is correctly updated. 
+ loaded_json_data = json.loads(writer.get_file_discovery_content())[0] + assert (set(get_metrics_export_address_from_node(nodes)) == set( + loaded_json_data["targets"])) + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="Failing on Windows.") +def test_prome_file_discovery_run_by_dashboard(shutdown_only): + ray.init(num_cpus=0) + global_node = ray.worker._global_node + temp_dir = global_node.get_temp_dir_path() + + def is_service_discovery_exist(): + for path in pathlib.Path(temp_dir).iterdir(): + if PROMETHEUS_SERVICE_DISCOVERY_FILE in str(path): + return True + return False + + wait_for_condition(is_service_discovery_exist) + + @pytest.fixture def metric_mock(): mock = MagicMock() From d629292d635b30a350cbd16f4a8943efa8145b00 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Fri, 22 Jan 2021 19:36:02 +0100 Subject: [PATCH 020/245] [RLlib] Add grad_clip config option to MARWIL and stabilize grad clipping against inf global_norms. (#13634) --- rllib/agents/marwil/marwil.py | 2 ++ rllib/agents/marwil/marwil_tf_policy.py | 4 +++- rllib/agents/marwil/marwil_torch_policy.py | 3 ++- rllib/agents/ppo/ppo_tf_policy.py | 10 ++++++++-- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/rllib/agents/marwil/marwil.py b/rllib/agents/marwil/marwil.py index c4f88fdb8b30..d123b3ef5f5f 100644 --- a/rllib/agents/marwil/marwil.py +++ b/rllib/agents/marwil/marwil.py @@ -21,6 +21,8 @@ "beta": 1.0, # Balancing value estimation loss and policy optimization loss. "vf_coeff": 1.0, + # If specified, clip the global norm of gradients by this amount. + "grad_clip": None, # Whether to calculate cumulative rewards. "postprocess_inputs": True, # Whether to rollout "complete_episodes" or "truncate_episodes". 
diff --git a/rllib/agents/marwil/marwil_tf_policy.py b/rllib/agents/marwil/marwil_tf_policy.py index 44352be4f883..211f9467e7b0 100644 --- a/rllib/agents/marwil/marwil_tf_policy.py +++ b/rllib/agents/marwil/marwil_tf_policy.py @@ -1,6 +1,7 @@ import logging import ray +from ray.rllib.agents.ppo.ppo_tf_policy import compute_and_clip_gradients from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing @@ -133,7 +134,7 @@ def __init__(self, policy, value_estimates, action_dist, actions, # Exponentially weighted advantages. c = tf.math.sqrt(policy._moving_average_sqd_adv_norm) - exp_advs = tf.math.exp(beta * (adv / c)) + exp_advs = tf.math.exp(beta * (adv / (1e-8 + c))) # Static graph. else: update_adv_norm = tf1.assign_add( @@ -200,4 +201,5 @@ def setup_mixins(policy, obs_space, action_space, config): stats_fn=stats, postprocess_fn=postprocess_advantages, before_loss_init=setup_mixins, + gradients_fn=compute_and_clip_gradients, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/marwil/marwil_torch_policy.py b/rllib/agents/marwil/marwil_torch_policy.py index ef3558378794..14ae943ecaf5 100644 --- a/rllib/agents/marwil/marwil_torch_policy.py +++ b/rllib/agents/marwil/marwil_torch_policy.py @@ -4,7 +4,7 @@ from ray.rllib.policy.policy_template import build_policy_class from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_torch -from ray.rllib.utils.torch_ops import explained_variance +from ray.rllib.utils.torch_ops import apply_grad_clipping, explained_variance torch, _ = try_import_torch() @@ -98,5 +98,6 @@ def setup_mixins(policy, obs_space, action_space, config): get_default_config=lambda: ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, stats_fn=stats, postprocess_fn=postprocess_advantages, + extra_grad_process_fn=apply_grad_clipping, before_loss_init=setup_mixins, mixins=[ValueNetworkMixin]) diff --git a/rllib/agents/ppo/ppo_tf_policy.py 
b/rllib/agents/ppo/ppo_tf_policy.py index 57874ba296b3..5991da84e328 100644 --- a/rllib/agents/ppo/ppo_tf_policy.py +++ b/rllib/agents/ppo/ppo_tf_policy.py @@ -182,9 +182,15 @@ def compute_and_clip_gradients(policy: Policy, optimizer: LocalOptimizer, # Clip by global norm, if necessary. if policy.config["grad_clip"] is not None: + # Defuse inf gradients (due to super large losses). grads = [g for (g, v) in grads_and_vars] - policy.grads, _ = tf.clip_by_global_norm(grads, - policy.config["grad_clip"]) + grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) + # If the global_norm is inf -> All grads will be NaN. Stabilize this + # here by setting them to 0.0. This will simply ignore destructive loss + # calculations. + policy.grads = [ + tf.where(tf.math.is_nan(g), tf.zeros_like(g), g) for g in grads + ] clipped_grads_and_vars = list(zip(policy.grads, variables)) return clipped_grads_and_vars else: From 7fec19dad29ece3adb9094b52ca8a1ebe66f0e29 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Fri, 22 Jan 2021 12:07:25 -0800 Subject: [PATCH 021/245] [kubernetes][operator][minutiae] Backwards compatibility of operator (#13623) --- python/ray/operator/operator_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 94d2a00cf34e..08afda94f1d4 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -95,4 +95,4 @@ def get_cluster_owner_reference( def translate(configuration: Dict[str, Any], dictionary: Dict[str, str]) -> Dict[str, Any]: - return {dictionary[field]: configuration[field] for field in configuration} + return {dictionary[field]: configuration[field] for field in dictionary} From c4a710369b93964e219af83bb197542241750627 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 22 Jan 2021 12:10:24 -0800 Subject: [PATCH 022/245] Revert "[dashboard] Fix 
RAY_RAYLET_PID KeyError on Windows (#12948)" (#13572) This reverts commit ef6d859e9b7e91210683da8fd4b0897ecb0eee69. --- dashboard/agent.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index 7bf5e1551a2b..f1c496b89004 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -62,13 +62,9 @@ def __init__(self, self.object_store_name = object_store_name self.raylet_name = raylet_name self.node_id = os.environ["RAY_NODE_ID"] - # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is - # only used for fate-sharing with the raylet and we need a different - # fate-sharing mechanism for Windows anyways. - if sys.platform not in ["win32", "cygwin"]: - self.ppid = int(os.environ["RAY_RAYLET_PID"]) - assert self.ppid > 0 - logger.info("Parent pid is %s", self.ppid) + self.ppid = int(os.environ["RAY_RAYLET_PID"]) + assert self.ppid > 0 + logger.info("Parent pid is %s", self.ppid) self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), )) self.grpc_port = self.server.add_insecure_port( f"[::]:{self.dashboard_agent_port}") @@ -112,8 +108,7 @@ async def _check_parent(): logger.error("Failed to check parent PID, exiting.") sys.exit(1) - if sys.platform not in ["win32", "cygwin"]: - check_parent_task = create_task(_check_parent()) + check_parent_task = create_task(_check_parent()) # Create an aioredis client for all modules. 
try: From 0c3d9a3eaa7b640ca41479e7e14ffb2a6414463b Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Fri, 22 Jan 2021 12:11:59 -0800 Subject: [PATCH 023/245] [Metrics] Fix serialization for custom metrics (#13571) --- python/ray/tests/test_metrics_agent.py | 5 ++++- python/ray/util/metrics.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 86670b8a32cc..8e02c4ae360b 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -37,6 +37,8 @@ def _setup_cluster_for_test(ray_start_cluster): def f(): counter = Count("test_counter", description="desc") counter.record(1) + counter = ray.get(ray.put(counter)) # Test serialization. + counter.record(1) ray.get(worker_should_exit.wait.remote()) @ray.remote @@ -44,6 +46,7 @@ class A: async def ping(self): histogram = Histogram( "test_histogram", description="desc", boundaries=[0.1, 1.6]) + histogram = ray.get(ray.put(histogram)) # Test serialization. histogram.record(1.5) ray.get(worker_should_exit.wait.remote()) @@ -100,7 +103,7 @@ def test_cases(): test_counter_sample = [ m for m in metric_samples if "test_counter" in m.name ][0] - assert test_counter_sample.value == 1.0 + assert test_counter_sample.value == 2.0 test_driver_counter_sample = [ m for m in metric_samples if "test_driver_counter" in m.name diff --git a/python/ray/util/metrics.py b/python/ray/util/metrics.py index d287a503fa73..57a01cf7aa0b 100644 --- a/python/ray/util/metrics.py +++ b/python/ray/util/metrics.py @@ -147,6 +147,11 @@ def __init__(self, self._metric = CythonCount(self._name, self._description, self._unit, self._tag_keys) + def __reduce__(self): + deserializer = Count + serialized_data = (self._name, self._description, self._tag_keys) + return deserializer, serialized_data + class Histogram(Metric): """Histogram distribution of metric points. 
@@ -177,6 +182,12 @@ def __init__(self, self._unit, self.boundaries, self._tag_keys) + def __reduce__(self): + deserializer = Histogram + serialized_data = (self._name, self._description, self.boundaries, + self._tag_keys) + return deserializer, serialized_data + @property def info(self): """Return information about histogram metric.""" @@ -204,6 +215,11 @@ def __init__(self, self._metric = CythonGauge(self._name, self._description, self._unit, self._tag_keys) + def __reduce__(self): + deserializer = Gauge + serialized_data = (self._name, self._description, self._tag_keys) + return deserializer, serialized_data + __all__ = [ "Count", From 25e1b78eedd76033bc86e98e535e0e72d59ad290 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 22 Jan 2021 16:29:05 -0800 Subject: [PATCH 024/245] [Dependencies] Move requirements.txt to requirements directory. (#13636) --- .github/dependabot.yml | 12 ++++++++++++ ci/travis/install-dependencies.sh | 2 +- python/{ => requirements}/requirements.txt | 0 python/requirements/requirements_tune.in | 2 +- python/setup.py | 4 ++-- 5 files changed, 16 insertions(+), 4 deletions(-) rename python/{ => requirements}/requirements.txt (100%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9f8b6b7a730a..3074b6042bc9 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,3 +21,15 @@ updates: open-pull-requests-limit: 3 reviewers: - "ray-project/ray-tune" + ignore: + # Ignore pinned dependencies in requirements.txt. 
+ - dependency-name: aiohttp + - dependency-name: msgpack + - dependency-name: opencv-python-headless + - dependency-name: pandas + - dependency-name: scipy + - dependency-name: pydantic + - dependency-name: cython + - dependency-name: llmvlite + - dependency-name: pytest + - dependency-name: scikit-learn diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 8c42f694ce57..96f4fa95a8f2 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -274,7 +274,7 @@ install_dependencies() { local status="0"; local errmsg=""; for _ in {1..3}; do - errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements.txt 2>&1) && break; + errmsg=$(CC=gcc pip install -r "${WORKSPACE_DIR}"/python/requirements/requirements.txt 2>&1) && break; status=$errmsg && echo "'pip install ...' failed, will retry after n seconds!" && sleep 30; done if [ "$status" != "0" ]; then diff --git a/python/requirements.txt b/python/requirements/requirements.txt similarity index 100% rename from python/requirements.txt rename to python/requirements/requirements.txt diff --git a/python/requirements/requirements_tune.in b/python/requirements/requirements_tune.in index 40ccf4be43d1..9bb83cbeec73 100644 --- a/python/requirements/requirements_tune.in +++ b/python/requirements/requirements_tune.in @@ -1,5 +1,5 @@ # Use base requirements to constrain these requirements. --c ../requirements.txt +-c ./requirements.txt ax-platform==0.1.9; python_version < '3.7' ax-platform==0.1.19; python_version >= '3.7' diff --git a/python/setup.py b/python/setup.py index 18d012b99e52..a1542a7a292c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -92,7 +92,7 @@ ] # If you're adding dependencies for ray extras, please -# also update the matching section of requirements.txt +# also update the matching section of requirements/requirements.txt # in this directory extras = { "serve": [ @@ -120,7 +120,7 @@ # These are the main dependencies for users of ray. 
This list # should be carefully curated. If you change it, please reflect -# the change in the matching section of requirements.txt +# the change in the matching section of requirements/requirements.txt install_requires = [ # TODO(alex) Pin the version once this PR is # included in the stable release. From 01d74af89d0c6b20277393e60e68044ead1e1615 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Fri, 22 Jan 2021 16:30:10 -0800 Subject: [PATCH 025/245] [horovod] Horovod+Ray Pytorch Lightning Accelerator (#13458) --- .travis.yml | 1 + python/ray/tune/examples/mnist_ptl_mini.py | 3 +- python/ray/util/lightning_accelerators/BUILD | 33 +++ .../util/lightning_accelerators/__init__.py | 4 + .../examples/ptl_horovod_ray_example.py | 195 ++++++++++++++++++ .../horovod_ray_accelerator.py | 121 +++++++++++ .../tests/test_horovod_ray_accelerator.py | 191 +++++++++++++++++ 7 files changed, 547 insertions(+), 1 deletion(-) create mode 100644 python/ray/util/lightning_accelerators/BUILD create mode 100644 python/ray/util/lightning_accelerators/__init__.py create mode 100644 python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py create mode 100644 python/ray/util/lightning_accelerators/horovod_ray_accelerator.py create mode 100644 python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py diff --git a/.travis.yml b/.travis.yml index 5170ed0864b8..4d8f8ddd1255 100644 --- a/.travis.yml +++ b/.travis.yml @@ -420,6 +420,7 @@ matrix: script: - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... + - ./ci/keep_alive bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/lightning_accelerators/... 
# There are no python 3.7 tests for RaySGD at the moment # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 python/ray/util/sgd/... # - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=py37 doc/... diff --git a/python/ray/tune/examples/mnist_ptl_mini.py b/python/ray/tune/examples/mnist_ptl_mini.py index b1c2e2aa9a09..e3b226d44566 100644 --- a/python/ray/tune/examples/mnist_ptl_mini.py +++ b/python/ray/tune/examples/mnist_ptl_mini.py @@ -1,7 +1,7 @@ import torch from torch.nn import functional as F import pytorch_lightning as pl -from pl_bolts.datamodules import MNISTDataModule +from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule import os from ray.tune.integration.pytorch_lightning import TuneReportCallback @@ -16,6 +16,7 @@ def __init__(self, config, data_dir=None): self.data_dir = data_dir or os.getcwd() self.lr = config["lr"] layer_1, layer_2 = config["layer_1"], config["layer_2"] + self.batch_size = config["batch_size"] # mnist images are (1, 28, 28) (channels, width, height) self.layer_1 = torch.nn.Linear(28 * 28, layer_1) diff --git a/python/ray/util/lightning_accelerators/BUILD b/python/ray/util/lightning_accelerators/BUILD new file mode 100644 index 000000000000..4355c6d33bb4 --- /dev/null +++ b/python/ray/util/lightning_accelerators/BUILD @@ -0,0 +1,33 @@ +# -------------------------------------------------------------------- +# Tests from the python/ray/util/lightning_accelerators/tests directory. +# Please keep these sorted alphabetically. +# -------------------------------------------------------------------- + +py_test( + name = "test_horovod_ray_accelerator", + size = "medium", + srcs = ["tests/test_horovod_ray_accelerator.py"], + tags = ["exclusive", "pytorch-lightning", "pytorch", "horovod"], + deps = [":accelerator_lib"], +) + +# -------------------------------------------------------------------- +# Tests from the python/ray/util/lightning_accelerators/examples directory. 
+# Please keep these sorted alphabetically. +# -------------------------------------------------------------------- + +py_test( + name = "ptl_horovod_ray_example", + size = "medium", + srcs = ["examples/ptl_horovod_ray_example.py"], + tags = ["exclusive", "example", "pytorch-lightning", "pytorch", "horovod"], + deps = [":accelerator_lib"], + args = ["--smoke-test"] +) + +# # This is a dummy test dependency that causes the above tests to be +# # re-run if any of these files changes. +py_library( + name = "accelerator_lib", + srcs = glob(["**/*.py"], exclude=["tests/*.py"]), +) diff --git a/python/ray/util/lightning_accelerators/__init__.py b/python/ray/util/lightning_accelerators/__init__.py new file mode 100644 index 000000000000..038180e016ef --- /dev/null +++ b/python/ray/util/lightning_accelerators/__init__.py @@ -0,0 +1,4 @@ +from ray.util.lightning_accelerators.horovod_ray_accelerator import \ + HorovodRayAccelerator + +__all__ = ["HorovodRayAccelerator"] diff --git a/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py b/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py new file mode 100644 index 000000000000..fffcfb01f54b --- /dev/null +++ b/python/ray/util/lightning_accelerators/examples/ptl_horovod_ray_example.py @@ -0,0 +1,195 @@ +"""Example using Pytorch Lightning with a Horovod on Ray Accelerator.""" +import os +import tempfile + +import pytorch_lightning as pl +import torch +from torch.utils.data import random_split, DataLoader +from torchvision.datasets import MNIST +from torchvision import transforms + +import ray +from ray import tune +from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier +from ray.tune.integration.pytorch_lightning import TuneReportCallback +from ray.util.lightning_accelerators import HorovodRayAccelerator + + +class MNISTClassifier(LightningMNISTClassifier): + def prepare_data(self): + self.dataset = MNIST( + self.data_dir, + train=True, + download=True, + 
transform=transforms.ToTensor()) + + def train_dataloader(self): + dataset = self.dataset + train_length = len(dataset) + dataset_train, _ = random_split( + dataset, [train_length - 5000, 5000], + generator=torch.Generator().manual_seed(0)) + loader = DataLoader( + dataset_train, + batch_size=self.batch_size, + shuffle=True, + num_workers=1, + drop_last=True, + pin_memory=True, + ) + return loader + + def val_dataloader(self): + dataset = self.dataset + train_length = len(dataset) + _, dataset_val = random_split( + dataset, [train_length - 5000, 5000], + generator=torch.Generator().manual_seed(0)) + loader = DataLoader( + dataset_val, + batch_size=self.batch_size, + shuffle=False, + num_workers=1, + drop_last=True, + pin_memory=True, + ) + return loader + + +def train_mnist(config, + data_dir=None, + num_epochs=10, + num_hosts=1, + num_slots=4, + use_gpu=False, + callbacks=None): + model = MNISTClassifier(config, data_dir) + + callbacks = callbacks or [] + + trainer = pl.Trainer( + max_epochs=num_epochs, + gpus=int(use_gpu), + callbacks=callbacks, + accelerator=HorovodRayAccelerator( + num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu)) + trainer.fit(model) + + +def tune_mnist(data_dir, + num_samples=10, + num_epochs=10, + num_hosts=1, + num_slots=4, + use_gpu=False): + config = { + "layer_1": tune.choice([32, 64, 128]), + "layer_2": tune.choice([64, 128, 256]), + "lr": tune.loguniform(1e-4, 1e-1), + "batch_size": tune.choice([32, 64, 128]), + } + + # Add Tune callback. + metrics = {"loss": "ptl/val_loss", "acc": "ptl/val_accuracy"} + callbacks = [TuneReportCallback(metrics, on="validation_end")] + trainable = tune.with_parameters( + train_mnist, + data_dir=data_dir, + num_epochs=num_epochs, + num_hosts=num_hosts, + num_slots=num_slots, + use_gpu=use_gpu, + callbacks=callbacks) + analysis = tune.run( + trainable, + metric="loss", + mode="min", + config=config, + num_samples=num_samples, + resources_per_trial={ + "cpu": 1, + # Assume 1 cpu per slot. 
+ "extra_cpu": num_hosts * num_slots, + # Assume 1 gpu per slot. + "extra_gpu": num_hosts * num_slots * int(use_gpu) + }, + name="tune_mnist") + + print("Best hyperparameters found were: ", analysis.best_config) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--num-hosts", + type=int, + help="Number of machines to train on. If using Tune, then each " + "trial will use this many machines.", + default=1) + parser.add_argument( + "--num-slots", + type=int, + help="Number of workers to " + "place on each " + "machine. If using " + "Tune, then each trial will use this many slots per machine.", + default=1) + parser.add_argument( + "--use-gpu", action="store_true", help="Use GPU for " + "training.") + parser.add_argument( + "--tune", + action="store_true", + help="Use Ray Tune " + "for " + "hyperparameter " + "tuning.") + parser.add_argument( + "--num-samples", + type=int, + default=10, + help="Number " + "of " + "samples to tune.") + parser.add_argument( + "--num-epochs", + type=int, + default=10, + help="Number " + "of " + "epochs " + "to train for.") + parser.add_argument( + "--smoke-test", action="store_true", help="Finish quickly for testing") + parser.add_argument( + "--address", + required=False, + type=str, + help="the address to use for Ray") + args, _ = parser.parse_known_args() + + num_epochs = 1 if args.smoke_test else args.num_epochs + num_hosts = 1 if args.smoke_test else args.num_hosts + num_slots = 1 if args.smoke_test else args.num_slots + use_gpu = False if args.smoke_test else args.use_gpu + num_samples = 1 if args.smoke_test else args.num_samples + + if args.smoke_test: + ray.init(num_cpus=2) + else: + ray.init(address=args.address) + + data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_") + + if args.tune: + raise NotImplementedError("Using Tune + Pytorch Lightning with " + "distributed training is currently not " + "supported.") + tune_mnist(data_dir, num_samples, num_epochs, 
num_hosts, num_slots, + use_gpu) + else: + config = {"layer_1": 32, "layer_2": 64, "lr": 1e-1, "batch_size": 32} + train_mnist(config, data_dir, num_epochs, num_hosts, num_slots, + use_gpu) diff --git a/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py new file mode 100644 index 000000000000..04f73317a923 --- /dev/null +++ b/python/ray/util/lightning_accelerators/horovod_ray_accelerator.py @@ -0,0 +1,121 @@ +import ray +from pytorch_lightning.accelerators.horovod_accelerator import \ + HorovodAccelerator + +try: + import horovod.torch as hvd + from horovod.ray import RayExecutor +except (ModuleNotFoundError, ImportError): + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + + +def get_executable_cls(): + # Only used for testing purposes, currently. + # We need to override this in tests to ensure test path is set correctly. + return None + + +class HorovodRayAccelerator(HorovodAccelerator): + """Pytorch Lightning Accelerator for Horovod training on a Ray cluster. + + This accelerator is used to manage distributed training on a Ray cluster + via the Horovod training framework. Internally, the specified number of + Ray actors are launched in the cluster and are configured as part of the + Horovod ring. The Pytorch Lightning trainer is instantiated on the + driver and sent to each of these training workers where training is + executed. The distributed training protocol is handled by Horovod. + + Each training worker is configured to reserve 1 CPU and if 1 GPU if + ``use_gpu`` is set to ``True``. + + If using this accelerator, you should run your code like a normal Python + script: ``python train.py``, and not with ``horovodrun``. + + Args: + num_hosts (int): The number of nodes/machines to execute the job on. + num_slots (int): Number of workers to be placed on each machine. + use_gpu (bool): Whether to use GPU for allocation. 
For GPU to be + used, you must also set the ``gpus`` arg in your Pytorch Lightning + Trainer to a value > 0. + + Example: + + .. code_block:: python + + import pytorch_lightning as ptl + from ray.util.lightning_accelerators import HorovodRayAccelerator + + ptl_model = MNISTClassifier(...) + # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU. + accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4, + use_gpu=True). + + # If using GPUs, set the ``gpus`` arg to a value > 0. + # The actual number of GPUs is determined by ``num_slots``. + trainer = pl.Trainer(..., gpus=1, accelerator=accelerator). + trainer.fit(ptl_model). + + """ + + def __init__(self, + *args, + num_hosts=1, + num_slots=1, + use_gpu=False, + **kwargs): + super().__init__(*args, trainer=None, **kwargs) + self.nickname = "horovod_ray" + self.num_hosts = num_hosts + self.num_slots = num_slots + self.use_gpu = use_gpu + + def setup(self, model): + self.trainer.use_horovod = True + settings = RayExecutor.create_settings(timeout_s=30) + self.executor = RayExecutor( + settings, + num_hosts=self.num_hosts, + num_slots=self.num_slots, + use_gpu=self.use_gpu) + self.trainer.model = model + self.executor.start(executable_cls=get_executable_cls()) + + def train(self): + trainer = self.trainer + trainer_ref = ray.put(self.trainer) + self.trainer = None + results = self.executor.run(self.train_remote, args=[trainer_ref]) + results, state_dict, best_path = results[0] + + self.trainer = trainer + self.trainer.model.load_state_dict(state_dict) + if self.trainer.checkpoint_callback: + self.trainer.checkpoint_callback.best_model_path = best_path + + return results + + def train_remote(self, trainer_ref): + self.trainer = ray.get(trainer_ref) + hvd.init() + if self.trainer.on_gpu: + # Horovod assigns one local GPU per process. + self.trainer.root_gpu = hvd.local_rank() + + # TODO: Make changes in PTL to clean this up. 
+ super(HorovodRayAccelerator, self).setup(self.trainer.model) + results = super(HorovodRayAccelerator, self).train() + if hvd.rank() != 0: + # Only want results from the first worker. + return None + + best_model_path = None + if self.trainer.checkpoint_callback is not None: + best_model_path = self.trainer.checkpoint_callback.best_model_path + + model = self.trainer.model + return results, model.state_dict(), best_model_path + + def teardown(self): + self.executor.shutdown() diff --git a/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py b/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py new file mode 100644 index 000000000000..1d8bb9d5e71c --- /dev/null +++ b/python/ray/util/lightning_accelerators/tests/test_horovod_ray_accelerator.py @@ -0,0 +1,191 @@ +import os + +import torch +import pytest +import ray +from pl_bolts.datamodules.mnist_datamodule import MNISTDataModule +from ray.util.sgd.tests.test_ptl import PTL_Module +from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier +from ray.util.lightning_accelerators import HorovodRayAccelerator +import pytorch_lightning as pl + +try: + import horovod # noqa: F401 + from horovod.common.util import nccl_built +except ImportError: + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + + +def _nccl_available(): + if not HOROVOD_AVAILABLE: + return False + try: + return nccl_built() + except AttributeError: + return False + + +@pytest.fixture +def ray_start_2_cpus(): + address_info = ray.init(num_cpus=2) + yield address_info + ray.shutdown() + + +@pytest.fixture +def ray_start_2_gpus(): + address_info = ray.init(num_cpus=2, num_gpus=2) + yield address_info + ray.shutdown() + # This env var is set by Pytorch Lightning. + # Make sure to reset it after each test. + # TODO: Upstream to PTL to not set this env var if using Ray. 
+ del os.environ["CUDA_VISIBLE_DEVICES"] + + +@pytest.fixture +def seed(): + pl.seed_everything(0) + + +def get_model(lr=1e-2, hidden_size=1, data_size=10, val_size=10, batch_size=2): + config = { + "lr": lr, + "hidden_size": hidden_size, + "data_size": data_size, + "val_size": val_size, + "batch_size": batch_size + } + return PTL_Module(config) + + +def get_trainer(dir, + num_slots=2, + use_gpu=False, + max_epochs=1, + limit_train_batches=10, + limit_val_batches=10, + progress_bar_refresh_rate=0): + accelerator = HorovodRayAccelerator(num_slots=num_slots, use_gpu=use_gpu) + trainer = pl.Trainer( + default_root_dir=dir, + gpus=1 if use_gpu else 0, + max_epochs=max_epochs, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + progress_bar_refresh_rate=progress_bar_refresh_rate, + checkpoint_callback=True, + accelerator=accelerator) + return trainer + + +def train_test(trainer, model): + initial_values = torch.tensor( + [torch.sum(torch.abs(x)) for x in model.parameters()]) + result = trainer.fit(model) + post_train_values = torch.tensor( + [torch.sum(torch.abs(x)) for x in model.parameters()]) + assert result == 1, "trainer failed" + # Check that the model is actually changed post-training. 
+ assert torch.norm(initial_values - post_train_values) > 0.1 + + +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_train(tmpdir, ray_start_2_cpus, seed, num_slots): + model = get_model() + + trainer = get_trainer(tmpdir, num_slots=num_slots) + train_test(trainer, model) + + +def load_test(trainer, model): + trainer.fit(model) + trained_model = PTL_Module.load_from_checkpoint( + trainer.checkpoint_callback.best_model_path, config=model.config) + assert trained_model is not None, "loading model failed" + + +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_load(tmpdir, ray_start_2_cpus, seed, num_slots): + model = get_model() + trainer = get_trainer(tmpdir, num_slots=num_slots) + load_test(trainer, model) + + +def predict_test(trainer, model, dm): + trainer.fit(model, dm) + test_loader = dm.test_dataloader() + acc = pl.metrics.Accuracy() + for batch in test_loader: + x, y = batch + with torch.no_grad(): + y_hat = model(x) + y_hat = y_hat.cpu() + acc.update(y_hat, y) + average_acc = acc.compute() + assert average_acc >= 0.5, f"This model is expected to get > {0.5} in " \ + f"test set (it got {average_acc})" + + +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_predict(tmpdir, ray_start_2_cpus, seed, num_slots): + config = { + "layer_1": 32, + "layer_2": 32, + "lr": 1e-2, + "batch_size": 32, + } + model = LightningMNISTClassifier(config, tmpdir) + dm = MNISTDataModule( + data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) + trainer = get_trainer( + tmpdir, limit_train_batches=10, max_epochs=1, num_slots=num_slots) + predict_test(trainer, model, dm) + + +@pytest.mark.skipif( + not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_train_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): + model = get_model() + trainer = get_trainer(tmpdir, num_slots=num_slots, 
use_gpu=True) + train_test(trainer, model) + + +@pytest.mark.skipif( + not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_load_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): + model = get_model() + trainer = get_trainer(tmpdir, num_slots=num_slots, use_gpu=True) + load_test(trainer, model) + + +@pytest.mark.skipif( + not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.parametrize("num_slots", [1, 2]) +def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_slots): + config = { + "layer_1": 32, + "layer_2": 32, + "lr": 1e-2, + "batch_size": 32, + } + model = LightningMNISTClassifier(config, tmpdir) + dm = MNISTDataModule( + data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"]) + trainer = get_trainer( + tmpdir, + limit_train_batches=10, + max_epochs=1, + num_slots=num_slots, + use_gpu=True) + predict_test(trainer, model, dm) From 8ef835ff03eab0e1beab1d08eb2333295846bfe1 Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Sat, 23 Jan 2021 13:57:30 +0800 Subject: [PATCH 026/245] Remove idle actor from worker pool. (#13523) --- src/ray/raylet/worker_pool.cc | 32 +++++++++---------------- src/ray/raylet/worker_pool.h | 2 -- src/ray/raylet/worker_pool_test.cc | 38 ++++-------------------------- 3 files changed, 16 insertions(+), 56 deletions(-) diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 93a568748e80..4ed257f4602e 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -159,9 +159,8 @@ Process WorkerPool::StartWorkerProcess( return Process(); } // Either there are no workers pending registration or the worker start is being forced. 
- RAY_LOG(DEBUG) << "Starting new worker process, current pool has " - << state.idle_actor.size() << " actor workers, and " << state.idle.size() - << " non-actor workers"; + RAY_LOG(DEBUG) << "Starting new worker process, current pool has " << state.idle.size() + << " workers"; int workers_to_start = 1; if (dynamic_options.empty()) { @@ -625,15 +624,11 @@ void WorkerPool::PushWorker(const std::shared_ptr &worker) { state.idle_dedicated_workers[task_id] = worker; } else { // The worker is not used for the actor creation task with dynamic options. - // Put the worker to the corresponding idle pool. - if (worker->GetActorId().IsNil()) { - state.idle.insert(worker); - int64_t now = current_time_ms(); - idle_of_all_languages_.emplace_back(worker, now); - idle_of_all_languages_map_[worker] = now; - } else { - state.idle_actor[worker->GetActorId()] = worker; - } + // Put the worker to the idle pool. + state.idle.insert(worker); + int64_t now = current_time_ms(); + idle_of_all_languages_.emplace_back(worker, now); + idle_of_all_languages_map_[worker] = now; } } @@ -787,7 +782,10 @@ std::shared_ptr WorkerPool::PopWorker( state.tasks_to_dedicated_workers[task_spec.TaskId()] = proc; } } - } else if (!task_spec.IsActorTask()) { + } else if (task_spec.IsActorTask()) { + // Code path of actor task. + RAY_CHECK(false) << "Direct call shouldn't reach here."; + } else { // Code path of normal task or actor creation task without dynamic worker options. // Find an available worker which is already assigned to this job. // Try to pop the most recently pushed worker. @@ -812,14 +810,6 @@ std::shared_ptr WorkerPool::PopWorker( proc = StartWorkerProcess(task_spec.GetLanguage(), rpc::WorkerType::WORKER, task_spec.JobId()); } - } else { - // Code path of actor task. 
- const auto &actor_id = task_spec.ActorId(); - auto actor_entry = state.idle_actor.find(actor_id); - if (actor_entry != state.idle_actor.end()) { - worker = std::move(actor_entry->second); - state.idle_actor.erase(actor_entry); - } } if (worker == nullptr && proc.IsValid()) { diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index 66d4b94c7700..703fbf77b781 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -358,8 +358,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_map> idle_dedicated_workers; /// The pool of idle non-actor workers. std::unordered_set> idle; - /// The pool of idle actor workers. - std::unordered_map> idle_actor; // States for io workers used for spilling objects. IOWorkerState spill_io_worker_state; // States for io workers used for restoring objects. diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index ee8f3356bb77..0d2c0e314f34 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -343,28 +343,6 @@ TEST_F(WorkerPoolTest, HandleWorkerPushPop) { ASSERT_EQ(popped_worker, nullptr); } -TEST_F(WorkerPoolTest, PopActorWorker) { - // Create a worker. - auto worker = CreateWorker(Process::CreateNewDummy()); - // Add the worker to the pool. - worker_pool_->PushWorker(worker); - - // Assign an actor ID to the worker. - const auto task_spec = ExampleTaskSpec(); - auto actor = worker_pool_->PopWorker(task_spec); - auto actor_id = ActorID::Of(JOB_ID, TaskID::ForDriverTask(JOB_ID), 1); - actor->AssignActorId(actor_id); - worker_pool_->PushWorker(actor); - - // Check that there are no more non-actor workers. - ASSERT_EQ(worker_pool_->PopWorker(task_spec), nullptr); - // Check that we can pop the actor worker. 
- const auto actor_task_spec = ExampleTaskSpec(actor_id); - actor = worker_pool_->PopWorker(actor_task_spec); - ASSERT_EQ(actor, worker); - ASSERT_EQ(actor->GetActorId(), actor_id); -} - TEST_F(WorkerPoolTest, PopWorkersOfMultipleLanguages) { // Create a Python Worker, and add it to the pool auto py_worker = CreateWorker(Process::CreateNewDummy(), Language::PYTHON); @@ -428,25 +406,19 @@ TEST_F(WorkerPoolTest, PopWorkerMultiTenancy) { worker_pool_->PushWorker(worker); } } - std::unordered_set worker_ids; for (int round = 0; round < 2; round++) { std::vector> workers; - // Pop workers for actor (creation) tasks. + // Pop workers for actor. for (auto job_id : job_ids) { - auto actor_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); - // For the first round, we pop for actor creation tasks. - // For the second round, we pop for actor tasks. - auto task_spec = - ExampleTaskSpec(round == 0 ? ActorID::Nil() : actor_id, Language::PYTHON, - job_id, round == 0 ? actor_id : ActorID::Nil()); + auto actor_creation_id = ActorID::Of(job_id, TaskID::ForDriverTask(job_id), 1); + // Pop workers for actor creation tasks. 
+ auto task_spec = ExampleTaskSpec(/*actor_id=*/ActorID::Nil(), Language::PYTHON, + job_id, actor_creation_id); auto worker = worker_pool_->PopWorker(task_spec); ASSERT_TRUE(worker); ASSERT_EQ(worker->GetAssignedJobId(), job_id); - if (round == 0) { - worker->AssignActorId(actor_id); - } workers.push_back(worker); } From 17760e1510ef097f18cd511d5033b4426c317ab3 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Sat, 23 Jan 2021 09:32:37 +0100 Subject: [PATCH 027/245] [tune] update Optuna integration to 2.4.0 API (#13631) Co-authored-by: Amog Kamsetty --- python/ray/tune/suggest/optuna.py | 10 ++++- .../linux-py3.6-requirements_tune.txt | 45 +++++++++---------- .../linux-py3.7-requirements_tune.txt | 43 +++++++++--------- .../linux-py3.8-requirements_tune.txt | 14 +++--- python/requirements/requirements_tune.in | 2 +- 5 files changed, 59 insertions(+), 55 deletions(-) diff --git a/python/ray/tune/suggest/optuna.py b/python/ray/tune/suggest/optuna.py index a6468b8617dd..61dd13d62646 100644 --- a/python/ray/tune/suggest/optuna.py +++ b/python/ray/tune/suggest/optuna.py @@ -218,8 +218,14 @@ def on_trial_complete(self, error: bool = False): ot_trial = self._ot_trials[trial_id] ot_trial_id = ot_trial._trial_id - self._storage.set_trial_value(ot_trial_id, result.get( - self.metric, None)) + + val = result.get(self.metric, None) + if hasattr(self._storage, "set_trial_value"): + # Backwards compatibility with optuna < 2.4.0 + self._storage.set_trial_value(ot_trial_id, val) + else: + self._storage.set_trial_values(ot_trial_id, [val]) + self._storage.set_trial_state(ot_trial_id, ot.trial.TrialState.COMPLETE) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index 8d75554d451b..4351d0b6386f 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv 
-autogluon.core==0.0.16b20210113 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core @@ -35,7 +35,7 @@ ax-platform==0.1.9 ; python_version < "3.7" # via -r requirements_tune.in azure-core==1.10.0 # via azure-storage-blob -azure-storage-blob==12.6.0 +azure-storage-blob==12.7.1 # via mlflow backcall==0.2.0 # via ipython @@ -45,16 +45,16 @@ bayesian-optimization==1.2.0 # nevergrad bcrypt==3.2.0 # via paramiko -bleach==3.2.1 +bleach==3.2.2 # via nbconvert bokeh==2.2.3 # via dask -boto3==1.16.53 +boto3==1.16.58 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.53 +botocore==1.19.58 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ click==7.1.2 # mlflow # sacremoses # wandb -cliff==3.5.0 +cliff==3.6.0 # via optuna cloudpickle==1.6.0 # via @@ -107,7 +107,7 @@ colorama==0.4.4 # via # -c ../requirements.txt # cmd2 -colorlog==4.6.2 +colorlog==4.7.2 # via optuna configparser==5.0.1 # via wandb @@ -129,7 +129,7 @@ cython==0.29.0 # -c ../requirements.txt # autogluon.core # configspace -dask[complete]==2020.12.0 +dask[complete]==2021.1.0 # via # -c ../requirements.txt # autogluon.core @@ -155,7 +155,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2020.12.0 +distributed==2021.1.0 # via # autogluon.core # dask @@ -213,13 +213,13 @@ google-auth==1.24.0 # tensorboard gpy==1.9.9 # via -r requirements_tune.in -gpytorch==1.3.0 +gpytorch==1.3.1 # via botorch graphviz==0.8.4 # via # autogluon.core # mxnet -grpcio==1.34.1 +grpcio==1.35.0 # via # -c ../requirements.txt # tensorboard @@ -330,9 +330,9 @@ kubernetes==12.0.1 # -r requirements_tune.in lightgbm==3.1.1 # via -r requirements_tune.in -locket==0.2.0 +locket==0.2.1 # via partd -mako==1.1.3 +mako==1.1.4 # via alembic markdown==3.3.3 # via tensorboard @@ -366,7 +366,7 @@ nbconvert==6.0.7 # via # jupyter # notebook -nbformat==5.0.8 +nbformat==5.1.2 # via # ipywidgets # nbclient @@ -436,7 +436,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym 
-optuna==2.3.0 +optuna==2.4.0 # via -r requirements_tune.in packaging==20.8 # via @@ -501,7 +501,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.10 +prompt-toolkit==3.0.13 # via # ipython # jupyter-console @@ -584,7 +584,7 @@ pytorch-lightning==1.0.3 # pytorch-lightning-bolts pytz==2020.5 # via pandas -pyyaml==5.3.1 +pyyaml==5.4.1 # via # -c ../requirements.txt # autocfg @@ -600,12 +600,12 @@ pyyaml==5.3.1 # pytorch-lightning # wandb # yacs -pyzmq==20.0.0 +pyzmq==21.0.1 # via # jupyter-client # notebook # qtconsole -qtconsole==5.0.1 +qtconsole==5.0.2 # via jupyter qtpy==1.9.0 # via qtconsole @@ -703,7 +703,6 @@ six==1.15.0 # azure-core # bcrypt # bleach - # cliff # cryptography # cycler # databricks-cli @@ -736,7 +735,7 @@ six==1.15.0 # traitlets # wandb # websocket-client -smart_open==4.0.1 +smart_open[s3]==4.0.1 # via # -c ../requirements.txt # -r requirements_tune.in @@ -763,9 +762,9 @@ tabulate==0.8.7 # databricks-cli tblib==1.7.0 # via distributed -tensorboard-plugin-wit==1.7.0 +tensorboard-plugin-wit==1.8.0 # via tensorboard -tensorboard==2.4.0 +tensorboard==2.4.1 # via pytorch-lightning tensorboardx==2.1 # via diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index 1ac1824330c0..c7a7b9204649 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210113 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core @@ -35,7 +35,7 @@ ax-platform==0.1.19 ; python_version >= "3.7" # via -r requirements_tune.in azure-core==1.10.0 # via azure-storage-blob -azure-storage-blob==12.6.0 +azure-storage-blob==12.7.1 # via mlflow backcall==0.2.0 # via ipython @@ -45,16 +45,16 @@ bayesian-optimization==1.2.0 # nevergrad bcrypt==3.2.0 # via paramiko -bleach==3.2.1 
+bleach==3.2.2 # via nbconvert bokeh==2.2.3 # via dask -boto3==1.16.53 +boto3==1.16.58 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.53 +botocore==1.19.58 # via # boto3 # s3transfer @@ -87,7 +87,7 @@ click==7.1.2 # mlflow # sacremoses # wandb -cliff==3.5.0 +cliff==3.6.0 # via optuna cloudpickle==1.6.0 # via @@ -107,7 +107,7 @@ colorama==0.4.4 # via # -c ../requirements.txt # cmd2 -colorlog==4.6.2 +colorlog==4.7.2 # via optuna configparser==5.0.1 # via wandb @@ -127,7 +127,7 @@ cython==0.29.0 # -c ../requirements.txt # autogluon.core # configspace -dask[complete]==2020.12.0 +dask[complete]==2021.1.0 # via # -c ../requirements.txt # autogluon.core @@ -148,7 +148,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2020.12.0 +distributed==2021.1.0 # via # autogluon.core # dask @@ -206,13 +206,13 @@ google-auth==1.24.0 # tensorboard gpy==1.9.9 # via -r requirements_tune.in -gpytorch==1.3.0 +gpytorch==1.3.1 # via botorch graphviz==0.8.4 # via # autogluon.core # mxnet -grpcio==1.34.0 +grpcio==1.35.0 # via # -c ../requirements.txt # tensorboard @@ -321,9 +321,9 @@ kubernetes==12.0.1 # -r requirements_tune.in lightgbm==3.1.1 # via -r requirements_tune.in -locket==0.2.0 +locket==0.2.1 # via partd -mako==1.1.3 +mako==1.1.4 # via alembic markdown==3.3.3 # via tensorboard @@ -357,7 +357,7 @@ nbconvert==6.0.7 # via # jupyter # notebook -nbformat==5.0.8 +nbformat==5.1.2 # via # ipywidgets # nbclient @@ -427,7 +427,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym -optuna==2.3.0 +optuna==2.4.0 # via -r requirements_tune.in packaging==20.8 # via @@ -492,7 +492,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.10 +prompt-toolkit==3.0.13 # via # ipython # jupyter-console @@ -575,7 +575,7 @@ pytorch-lightning==1.0.3 # pytorch-lightning-bolts pytz==2020.5 # via pandas -pyyaml==5.3.1 +pyyaml==5.4.1 # via # -c ../requirements.txt # autocfg @@ -591,12 +591,12 @@ pyyaml==5.3.1 
# pytorch-lightning # wandb # yacs -pyzmq==20.0.0 +pyzmq==21.0.1 # via # jupyter-client # notebook # qtconsole -qtconsole==5.0.1 +qtconsole==5.0.2 # via jupyter qtpy==1.9.0 # via qtconsole @@ -694,7 +694,6 @@ six==1.15.0 # azure-core # bcrypt # bleach - # cliff # cryptography # cycler # databricks-cli @@ -753,9 +752,9 @@ tabulate==0.8.7 # databricks-cli tblib==1.7.0 # via distributed -tensorboard-plugin-wit==1.7.0 +tensorboard-plugin-wit==1.8.0 # via tensorboard -tensorboard==2.4.0 +tensorboard==2.4.1 # via pytorch-lightning tensorboardx==2.1 # via diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 36dbb1dce9ad..195951424490 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210121 +autogluon.core==0.0.16b20210122 # via gluoncv autograd==1.3 # via autogluon.core @@ -49,12 +49,12 @@ bleach==3.2.2 # via nbconvert bokeh==2.2.3 # via dask -boto3==1.16.57 +boto3==1.16.58 # via # -c ../requirements.txt # autogluon.core # smart-open -botocore==1.19.57 +botocore==1.19.58 # via # boto3 # s3transfer @@ -216,7 +216,7 @@ grpcio==1.35.0 # tensorboard gunicorn==20.0.4 # via mlflow -gym[atari]==0.18.0 +gym==0.18.0 # via # -c ../requirements.txt # -r requirements_tune.in @@ -417,7 +417,7 @@ opencv-python==4.5.1.48 # via # gluoncv # gym -optuna==2.3.0 +optuna==2.4.0 # via -r requirements_tune.in packaging==20.8 # via @@ -482,7 +482,7 @@ prometheus-flask-exporter==0.18.1 # via mlflow promise==2.3 # via wandb -prompt-toolkit==3.0.11 +prompt-toolkit==3.0.13 # via # ipython # jupyter-console @@ -586,7 +586,7 @@ pyzmq==21.0.1 # jupyter-client # notebook # qtconsole -qtconsole==5.0.1 +qtconsole==5.0.2 # via jupyter qtpy==1.9.0 # via qtconsole diff --git a/python/requirements/requirements_tune.in b/python/requirements/requirements_tune.in 
index 9bb83cbeec73..96a263204e97 100644 --- a/python/requirements/requirements_tune.in +++ b/python/requirements/requirements_tune.in @@ -20,7 +20,7 @@ matplotlib==3.3.3 mlflow==1.13.1 mxnet==1.7.0.post1 nevergrad==0.4.2.post5 -optuna==2.3.0 +optuna==2.4.0 pytest-remotedata==0.3.2 pytorch-lightning-bolts==0.2.5 pytorch-lightning==1.0.3 From b7dd7ddb5231bc4bc83ae1e385edc761d5476627 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Sat, 23 Jan 2021 22:06:51 +0200 Subject: [PATCH 028/245] deprecate useless fields in the cluster yaml. (#13637) * prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. 
* reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first ocmmit * lint * exmaple * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields Co-authored-by: Ameer Haj Ali Co-authored-by: Alex Wu Co-authored-by: Alex Wu Co-authored-by: Eric Liang Co-authored-by: Ameer Haj Ali Co-authored-by: root Co-authored-by: Dmitri Gekhtman --- dashboard/modules/reporter/reporter_head.py | 5 +---- doc/examples/lm/lm-cluster.yaml | 17 ----------------- python/ray/autoscaler/ray-schema.json | 12 ++++++++---- python/ray/serve/benchmarks/cluster.yaml | 3 --- .../test_cli_patterns/test_ray_up_config.yaml | 2 -- .../test_ray_up_docker_config.yaml | 2 -- python/ray/tests/test_coordinator_server.py | 2 -- .../util/sgd/tf/examples/tf-example-sgd.yaml | 3 --- .../sgd/torch/examples/benchmarks/README.rst | 1 
- .../examples/benchmarks/horovod-benchmark.yaml | 3 --- .../util/sgd/torch/examples/example-sgd.yaml | 3 --- .../torch/examples/image_models/cluster.yaml | 3 --- .../torch/examples/segmentation/example.yaml | 2 -- .../sgd/torch/examples/sgd-development.yaml | 3 --- .../torch/examples/transformers/cluster.yaml | 2 -- release/horovod_tests/cluster.yaml | 2 -- .../long_running_distributed_tests/cluster.yaml | 1 - release/rllib_tests/stress_tests/cluster.yaml | 1 - release/stress_tests/autoscaler-cluster.yaml | 7 ------- release/stress_tests/cluster.yaml | 7 ------- .../tune_tests/scalability_tests/cluster.yaml | 2 -- release/xgboost_tests/cluster_cpu_moderate.yaml | 2 -- release/xgboost_tests/cluster_cpu_small.yaml | 2 -- release/xgboost_tests/cluster_gpu_small.yaml | 2 -- 24 files changed, 9 insertions(+), 80 deletions(-) diff --git a/dashboard/modules/reporter/reporter_head.py b/dashboard/modules/reporter/reporter_head.py index 2d84c6b65c21..7d375c8d66c4 100644 --- a/dashboard/modules/reporter/reporter_head.py +++ b/dashboard/modules/reporter/reporter_head.py @@ -78,10 +78,7 @@ async def get_ray_config(self, req) -> aiohttp.web.Response: payload = { "min_workers": cfg["min_workers"], - "max_workers": cfg["max_workers"], - "initial_workers": cfg["initial_workers"], - "autoscaling_mode": cfg["autoscaling_mode"], - "idle_timeout_minutes": cfg["idle_timeout_minutes"], + "max_workers": cfg["max_workers"] } try: diff --git a/doc/examples/lm/lm-cluster.yaml b/doc/examples/lm/lm-cluster.yaml index 3590d482aa64..7ea6641f588d 100644 --- a/doc/examples/lm/lm-cluster.yaml +++ b/doc/examples/lm/lm-cluster.yaml @@ -9,23 +9,6 @@ min_workers: 1 # node. This takes precedence over min_workers. max_workers: 2 -# The initial number of worker nodes to launch in addition to the head -# node. When the cluster is first brought up (or when it is refreshed with a -# subsequent `ray up`) this number of nodes will be started. -initial_workers: 1 - -# Whether or not to autoscale aggressively. 
If this is enabled, if at any point -# we would start more workers, we start at least enough to bring us to -# initial_workers. -autoscaling_mode: default - - -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.48 # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/python/ray/autoscaler/ray-schema.json b/python/ray/autoscaler/ray-schema.json index 22b21b84cb66..7c7b2a1ed4ba 100644 --- a/python/ray/autoscaler/ray-schema.json +++ b/python/ray/autoscaler/ray-schema.json @@ -24,7 +24,7 @@ "type": "string" }, "min_workers": { - "description": "The minimum number of workers nodes to launch in addition to the head node. This number should be >= 0", + "description": "DEPRECATED. Use the per node_type min_workers field instead.", "type": "integer", "minimum": 0 }, @@ -34,17 +34,17 @@ "minimum": 0 }, "initial_workers": { - "description": "The number of workers to launch initially, in addition to the head node.", + "description": "DEPRECATED.", "type": "integer", "minimum": 0 }, "autoscaling_mode": { - "description": "The mode of the autoscaler e.g. default, aggressive", + "description": "DEPRECATED. Use upscaling_speed instead.", "type": "string", "enum": [ "default", "aggressive" ] }, "target_utilization_fraction": { - "description": "The autoscaler will scale up the cluster to this target fraction of resources usage. For example, if a cluster of 8 nodes is 100% busy # and target_utilization was 0.8, it would resize the cluster to 10.", + "description": "DEPRECATED. 
Use upscaling_speed instead.", "type": "number", "minimum": 0, "maximum": 1 @@ -254,6 +254,10 @@ "type": "string", "description": "If using multiple node types, specifies the head node type." }, + "worker_default_node_type": { + "type": "string", + "description": "DEPRECATED." + }, "head_node": { "type": "object", "description": "Provider-specific config for the head node, e.g. instance type." diff --git a/python/ray/serve/benchmarks/cluster.yaml b/python/ray/serve/benchmarks/cluster.yaml index d588dc06a207..aad50bf97d3e 100644 --- a/python/ray/serve/benchmarks/cluster.yaml +++ b/python/ray/serve/benchmarks/cluster.yaml @@ -1,13 +1,10 @@ cluster_name: default min_workers: 5 max_workers: 5 -initial_workers: 5 -autoscaling_mode: default docker: image: 'anyscale/ray-ml:latest' container_name: ray_container pull_before_run: true -target_utilization_fraction: 0.8 idle_timeout_minutes: 5 provider: type: aws diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml index 4d63420092e5..f3d6a03ce1b1 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_config.yaml @@ -12,7 +12,6 @@ head_start_ray_commands: - ray stop - ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 -initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -27,7 +26,6 @@ setup_commands: - echo a - echo b - echo ${echo hi} -target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t1.micro diff --git a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml index 8d898f749646..bffd0f53f2ae 100644 --- a/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml +++ b/python/ray/tests/test_cli_patterns/test_ray_up_docker_config.yaml @@ -17,7 +17,6 @@ head_start_ray_commands: - ray stop - ray start 
--head --autoscaling-config=~/ray_bootstrap_config.yaml idle_timeout_minutes: 5 -initial_workers: 1 initialization_commands: - echo init max_workers: 2 @@ -32,7 +31,6 @@ setup_commands: - echo a - echo b - echo ${echo hi} -target_utilization_fraction: 0.9 worker_nodes: ImageId: latest_dlami InstanceType: t3a.small diff --git a/python/ray/tests/test_coordinator_server.py b/python/ray/tests/test_coordinator_server.py index 6fb654e3e550..0c59b909e94c 100644 --- a/python/ray/tests/test_coordinator_server.py +++ b/python/ray/tests/test_coordinator_server.py @@ -52,7 +52,6 @@ def testClusterStateInit(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, - "initial_workers": 0, "provider": { "type": "local", "head_ip": "0.0.0.0:2", @@ -154,7 +153,6 @@ def testCoordinatorSenderNodeProvider(self): "cluster_name": "random_name", "min_workers": 0, "max_workers": 0, - "initial_workers": 0, "provider": { "type": "local", "coordinator_address": self.coordinator_address, diff --git a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml index 846f5f10ce3c..fcf31354b70e 100644 --- a/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml +++ b/python/ray/util/sgd/tf/examples/tf-example-sgd.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-tf # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst index 78dd71a15f51..54b3ce192b68 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/README.rst +++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst @@ -104,7 +104,6 @@ You can specify the number of nodes you want to use with the following configura # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: # Change this to a custom quantity - initial_workers: # same as above max_workers: # same as above You may want to install FP16 support for PyTorch with the following configuration in the YAML file: diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml index 04cbd520e135..7e3db50510ff 100644 --- a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml @@ -4,11 +4,8 @@ cluster_name: horovod-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 50 # docker: diff --git a/python/ray/util/sgd/torch/examples/example-sgd.yaml b/python/ray/util/sgd/torch/examples/example-sgd.yaml index fe9b18d191b0..6bbc64423aab 100644 --- a/python/ray/util/sgd/torch/examples/example-sgd.yaml +++ b/python/ray/util/sgd/torch/examples/example-sgd.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. 
min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 20 # docker: diff --git a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml index fccd5f8625bd..7d9ff9be89e0 100644 --- a/python/ray/util/sgd/torch/examples/image_models/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/image_models/cluster.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch-imagenet # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/segmentation/example.yaml b/python/ray/util/sgd/torch/examples/segmentation/example.yaml index 78cd9bcb09ba..33db0f445537 100644 --- a/python/ray/util/sgd/torch/examples/segmentation/example.yaml +++ b/python/ray/util/sgd/torch/examples/segmentation/example.yaml @@ -4,10 +4,8 @@ cluster_name: sgd-coco-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 1 -initial_workers: 1 max_workers: 1 -target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml index 590cb63b0708..bc79803eeadd 100644 --- a/python/ray/util/sgd/torch/examples/sgd-development.yaml +++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml @@ -4,11 +4,8 @@ cluster_name: sgd-pytorch # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. 
min_workers default to 0. min_workers: 2 -initial_workers: 2 max_workers: 2 -target_utilization_fraction: 0.9 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 10 # docker: diff --git a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml index 4cecd3bf86a1..434b48d3044f 100644 --- a/python/ray/util/sgd/torch/examples/transformers/cluster.yaml +++ b/python/ray/util/sgd/torch/examples/transformers/cluster.yaml @@ -4,10 +4,8 @@ cluster_name: transformer-cluster # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. min_workers default to 0. min_workers: 3 -initial_workers: 3 max_workers: 3 -target_utilization_fraction: 0.9 # Cloud-provider specific configuration. provider: type: aws diff --git a/release/horovod_tests/cluster.yaml b/release/horovod_tests/cluster.yaml index 880ebdba2423..5dbc457a78c7 100644 --- a/release/horovod_tests/cluster.yaml +++ b/release/horovod_tests/cluster.yaml @@ -10,8 +10,6 @@ min_workers: 3 # node. This takes precedence over min_workers. min_workers defaults to 0. max_workers: 3 -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. 
idle_timeout_minutes: 5 diff --git a/release/long_running_distributed_tests/cluster.yaml b/release/long_running_distributed_tests/cluster.yaml index f8d10549a24c..4710a47fcc4a 100644 --- a/release/long_running_distributed_tests/cluster.yaml +++ b/release/long_running_distributed_tests/cluster.yaml @@ -3,7 +3,6 @@ cluster_name: long-running-distributed-tests min_workers: 3 max_workers: 3 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/rllib_tests/stress_tests/cluster.yaml b/release/rllib_tests/stress_tests/cluster.yaml index 8f20a46afb85..4c83e27c33aa 100644 --- a/release/rllib_tests/stress_tests/cluster.yaml +++ b/release/rllib_tests/stress_tests/cluster.yaml @@ -3,7 +3,6 @@ cluster_name: ray-rllib-stress-tests min_workers: 9 max_workers: 9 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/stress_tests/autoscaler-cluster.yaml b/release/stress_tests/autoscaler-cluster.yaml index ed5ee2bd58f1..9c17d303e4db 100644 --- a/release/stress_tests/autoscaler-cluster.yaml +++ b/release/stress_tests/autoscaler-cluster.yaml @@ -13,13 +13,6 @@ min_workers: 100 # node. This takes precedence over min_workers. max_workers: 100 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/stress_tests/cluster.yaml b/release/stress_tests/cluster.yaml index a513d9764c11..155ae1329c0b 100644 --- a/release/stress_tests/cluster.yaml +++ b/release/stress_tests/cluster.yaml @@ -13,13 +13,6 @@ min_workers: 100 # node. This takes precedence over min_workers. 
max_workers: 100 -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.8 - # If a node is idle for this many minutes, it will be removed. idle_timeout_minutes: 5 diff --git a/release/tune_tests/scalability_tests/cluster.yaml b/release/tune_tests/scalability_tests/cluster.yaml index e279efb37dab..fd966898b8a7 100644 --- a/release/tune_tests/scalability_tests/cluster.yaml +++ b/release/tune_tests/scalability_tests/cluster.yaml @@ -2,9 +2,7 @@ cluster_name: ray-tune-scalability-tests min_workers: 15 max_workers: 15 -initial_workers: 15 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_moderate.yaml b/release/xgboost_tests/cluster_cpu_moderate.yaml index 18a18dceb56e..a65c49336a1c 100644 --- a/release/xgboost_tests/cluster_cpu_moderate.yaml +++ b/release/xgboost_tests/cluster_cpu_moderate.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-moderate min_workers: 31 max_workers: 31 -initial_workers: 31 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_cpu_small.yaml b/release/xgboost_tests/cluster_cpu_small.yaml index fe9e997f85aa..4b97439b9d59 100644 --- a/release/xgboost_tests/cluster_cpu_small.yaml +++ b/release/xgboost_tests/cluster_cpu_small.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-cpu-small min_workers: 3 max_workers: 3 -initial_workers: 3 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: diff --git a/release/xgboost_tests/cluster_gpu_small.yaml b/release/xgboost_tests/cluster_gpu_small.yaml index 5bea4f19acf2..535d28490f71 100644 --- a/release/xgboost_tests/cluster_gpu_small.yaml +++ 
b/release/xgboost_tests/cluster_gpu_small.yaml @@ -2,9 +2,7 @@ cluster_name: ray-xgboost-release-gpu-small min_workers: 4 max_workers: 4 -initial_workers: 4 -target_utilization_fraction: 0.8 idle_timeout_minutes: 15 docker: From e675e5b75a4470c01b4df577d4028b00e01d3d53 Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Sat, 23 Jan 2021 23:11:39 -0800 Subject: [PATCH 029/245] [ray_client]: Add more retry logic (#13478) --- python/ray/tests/BUILD | 1 + python/ray/tests/test_client.py | 51 +++++++++------------ python/ray/tests/test_client_init.py | 37 ++++++++++++++++ python/ray/util/client/worker.py | 66 +++++++++++++++++++++++----- 4 files changed, 114 insertions(+), 41 deletions(-) create mode 100644 python/ray/tests/test_client_init.py diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 7f4c61bb1cfb..8fe8b21c3369 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -79,6 +79,7 @@ py_test_module_list( "test_asyncio.py", "test_autoscaler.py", "test_autoscaler_yaml.py", + "test_client_init.py", "test_client_metadata.py", "test_client.py", "test_client_references.py", diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 21bb807fda55..dc5de2470e6e 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -2,42 +2,13 @@ import time import sys import logging +import threading import ray.util.client.server.server as ray_client_server -from ray.util.client import RayAPIStub from ray.util.client.common import ClientObjectRef from ray.util.client.ray_client_helpers import ray_start_client_server -def test_num_clients(shutdown_only): - # Tests num clients reporting; useful if you want to build an app that - # load balances clients between Ray client servers. 
- server = ray_client_server.serve("localhost:50051") - try: - api1 = RayAPIStub() - info1 = api1.connect("localhost:50051") - assert info1["num_clients"] == 1, info1 - api2 = RayAPIStub() - info2 = api2.connect("localhost:50051") - assert info2["num_clients"] == 2, info2 - - # Disconnect the first two clients. - api1.disconnect() - api2.disconnect() - time.sleep(1) - - api3 = RayAPIStub() - info3 = api3.connect("localhost:50051") - assert info3["num_clients"] == 1, info3 - - # Check info contains ray and python version. - assert isinstance(info3["ray_version"], str), info3 - assert isinstance(info3["ray_commit"], str), info3 - assert isinstance(info3["python_version"], str), info3 - finally: - server.stop(0) - - @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_real_ray_fallback(ray_start_regular_shared): with ray_start_client_server() as ray: @@ -373,5 +344,25 @@ def test_internal_kv(ray_start_regular_shared): assert ray._internal_kv_get("apple") == b"" +def test_startup_retry(ray_start_regular_shared): + from ray.util.client import ray as ray_client + ray_client._inside_client_test = True + + with pytest.raises(ConnectionError): + ray_client.connect("localhost:50051", connection_retries=1) + + def run_client(): + ray_client.connect("localhost:50051") + ray_client.disconnect() + + thread = threading.Thread(target=run_client, daemon=True) + thread.start() + time.sleep(3) + server = ray_client_server.serve("localhost:50051") + thread.join() + server.stop(0) + ray_client._inside_client_test = False + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_client_init.py b/python/ray/tests/test_client_init.py new file mode 100644 index 000000000000..1949fe3fdc8f --- /dev/null +++ b/python/ray/tests/test_client_init.py @@ -0,0 +1,37 @@ +"""Client tests that run their own init (as with init_and_serve) live here""" +import time + +import ray.util.client.server.server as ray_client_server + 
+from ray.util.client import RayAPIStub + + +def test_num_clients(): + # Tests num clients reporting; useful if you want to build an app that + # load balances clients between Ray client servers. + server, _ = ray_client_server.init_and_serve("localhost:50051") + try: + api1 = RayAPIStub() + info1 = api1.connect("localhost:50051") + assert info1["num_clients"] == 1, info1 + api2 = RayAPIStub() + info2 = api2.connect("localhost:50051") + assert info2["num_clients"] == 2, info2 + + # Disconnect the first two clients. + api1.disconnect() + api2.disconnect() + time.sleep(1) + + api3 = RayAPIStub() + info3 = api3.connect("localhost:50051") + assert info3["num_clients"] == 1, info3 + + # Check info contains ray and python version. + assert isinstance(info3["ray_version"], str), info3 + assert isinstance(info3["ray_commit"], str), info3 + assert isinstance(info3["python_version"], str), info3 + api3.disconnect() + finally: + ray_client_server.shutdown_with_server(server) + time.sleep(2) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index 3c6401fdafd6..d62173be745f 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -5,6 +5,7 @@ import base64 import json import logging +import time import uuid from collections import defaultdict from typing import Any @@ -33,6 +34,13 @@ MAX_TIMEOUT_SEC = 30 +def backoff(timeout: int) -> int: + timeout = timeout + 5 + if timeout > MAX_TIMEOUT_SEC: + timeout = MAX_TIMEOUT_SEC + return timeout + + class Worker: def __init__(self, conn_str: str = "", @@ -59,23 +67,59 @@ def __init__(self, else: self.channel = grpc.insecure_channel(conn_str) + # Retry the connection until the channel responds to something + # looking like a gRPC connection, though it may be a proxy. 
conn_attempts = 0 timeout = INITIAL_TIMEOUT_SEC - while conn_attempts < connection_retries + 1: + ray_ready = False + while conn_attempts < max(connection_retries, 1): conn_attempts += 1 try: + # Let gRPC wait for us to see if the channel becomes ready. + # If it throws, we couldn't connect. grpc.channel_ready_future(self.channel).result(timeout=timeout) - break + # The HTTP2 channel is ready. Wrap the channel with the + # RayletDriverStub, allowing for unary requests. + self.server = ray_client_pb2_grpc.RayletDriverStub( + self.channel) + # Now the HTTP2 channel is ready, or proxied, but the + # servicer may not be ready. Call is_initialized() and if + # it throws, the servicer is not ready. On success, the + # `ray_ready` result is checked. + ray_ready = self.is_initialized() + if ray_ready: + # Ray is ready! Break out of the retry loop + break + # Ray is not ready yet, wait a timeout + time.sleep(timeout) except grpc.FutureTimeoutError: - if conn_attempts >= connection_retries: - raise ConnectionError("ray client connection timeout") - logger.info(f"Couldn't connect in {timeout} seconds, retrying") - timeout = timeout + 5 - if timeout > MAX_TIMEOUT_SEC: - timeout = MAX_TIMEOUT_SEC - - self.server = ray_client_pb2_grpc.RayletDriverStub(self.channel) - + logger.info( + f"Couldn't connect channel in {timeout} seconds, retrying") + # Note that channel_ready_future constitutes its own timeout, + # which is why we do not sleep here. + except grpc.RpcError as e: + if e.code() == grpc.StatusCode.UNAVAILABLE: + # UNAVAILABLE is gRPC's retryable error, + # so we do that here. 
+ logger.info("Ray client server unavailable, " + f"retrying in {timeout}s...") + logger.debug(f"Received when checking init: {e.details()}") + # Ray is not ready yet, wait a timeout + time.sleep(timeout) + else: + # Any other gRPC error gets a reraise + raise e + # Fallthrough, backoff, and retry at the top of the loop + logger.info("Waiting for Ray to become ready on the server, " + f"retry in {timeout}s...") + timeout = backoff(timeout) + + # If we made it through the loop without ray_ready it means we've used + # up our retries and should error back to the user. + if not ray_ready: + raise ConnectionError("ray client connection timeout") + + # Initialize the streams to finish protocol negotiation. self.data_client = DataClient(self.channel, self._client_id, self.metadata) self.reference_count: Dict[bytes, int] = defaultdict(int) From edbb2937d393f9cd95a5016bc2df5250bbd59152 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Sat, 23 Jan 2021 23:15:32 -0800 Subject: [PATCH 030/245] [Object Spilling] Multi node file spilling V2. (#13542) * done. * done. * Fix a mistake. * Ready. * Fix issues. * fix. * Finished the first round of code review. * formatting. * In progress. * Formatting. * Addressed code review. * Formatting * Fix tests. * fix bugs. * Skip flaky tests for now. 
--- python/ray/external_storage.py | 4 + python/ray/parameter.py | 3 + python/ray/tests/BUILD | 2 +- python/ray/tests/test_object_spilling.py | 183 ++++++++---------- src/ray/common/ray_config_def.h | 4 + src/ray/gcs/accessor.h | 2 + .../gcs/gcs_client/service_based_accessor.cc | 4 +- .../gcs/gcs_client/service_based_accessor.h | 1 + src/ray/gcs/gcs_server/gcs_object_manager.cc | 10 +- src/ray/gcs/gcs_server/gcs_object_manager.h | 1 + .../gcs_server/gcs_placement_group_manager.h | 2 +- src/ray/object_manager/common.h | 5 +- src/ray/object_manager/object_buffer_pool.cc | 5 +- src/ray/object_manager/object_directory.cc | 43 ++-- src/ray/object_manager/object_directory.h | 9 +- src/ray/object_manager/object_manager.cc | 12 +- src/ray/object_manager/object_manager.h | 5 +- .../ownership_based_object_directory.cc | 6 +- src/ray/object_manager/pull_manager.cc | 60 ++++-- src/ray/object_manager/pull_manager.h | 6 +- .../object_manager/test/pull_manager_test.cc | 130 ++++++++----- src/ray/protobuf/gcs.proto | 10 +- src/ray/protobuf/gcs_service.proto | 5 +- src/ray/protobuf/node_manager.proto | 15 ++ src/ray/raylet/local_object_manager.cc | 33 +++- src/ray/raylet/local_object_manager.h | 41 +++- src/ray/raylet/node_manager.cc | 78 ++++++-- src/ray/raylet/node_manager.h | 11 ++ src/ray/raylet/raylet.cc | 5 +- src/ray/raylet/reconstruction_policy.cc | 3 +- src/ray/raylet/reconstruction_policy_test.cc | 5 +- .../raylet/test/local_object_manager_test.cc | 86 +++++++- src/ray/raylet_client/raylet_client.cc | 12 ++ src/ray/raylet_client/raylet_client.h | 9 + .../rpc/node_manager/node_manager_client.h | 3 + .../rpc/node_manager/node_manager_server.h | 5 + 36 files changed, 571 insertions(+), 247 deletions(-) diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 1b4f6fec81f1..6e16351482cd 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -345,6 +345,10 @@ def setup_external_storage(config): elif storage_type == 
"smart_open": _external_storage = ExternalStorageSmartOpenImpl( **config["params"]) + elif storage_type == "mock_distributed_fs": + # This storage is used to unit test distributed external storages. + # TODO(sang): Delete it after introducing the mock S3 test. + _external_storage = FileSystemStorage(**config["params"]) else: raise ValueError(f"Unknown external storage type: {storage_type}") else: diff --git a/python/ray/parameter.py b/python/ray/parameter.py index a9b20769d1e2..666b82905b1e 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -330,3 +330,6 @@ def _check_usage(self): # Validate external storage usage. external_storage.setup_external_storage(object_spilling_config) external_storage.reset_external_storage() + # Configure the proper system config. + self._system_config["is_external_storage_type_fs"] = ( + object_spilling_config["type"] == "filesystem") diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 8fe8b21c3369..2ccdb4be2644 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -53,7 +53,6 @@ py_test_module_list( "test_multinode_failures_2.py", "test_multiprocessing.py", "test_object_manager.py", - "test_object_spilling.py", "test_output.py", "test_reconstruction.py", "test_reference_counting.py", @@ -134,6 +133,7 @@ py_test_module_list( py_test_module_list( files = [ "test_placement_group.py", + "test_object_spilling.py", ], size = "large", extra_srcs = SRCS, diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 8319dbfcac54..68824b7bb09a 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -21,6 +21,15 @@ "directory_path": spill_local_path } } +# Since we have differet protocol for a local external storage (e.g., fs) +# and distributed external storage (e.g., S3), we need to test both cases. +# This mocks the distributed fs with cluster utils. 
+mock_distributed_fs_object_spilling_config = { + "type": "mock_distributed_fs", + "params": { + "directory_path": spill_local_path + } +} smart_open_object_spilling_config = { "type": "smart_open", "params": { @@ -29,6 +38,15 @@ } +def create_object_spilling_config(request, tmp_path): + if (request.param["type"] == "filesystem" + or request.param["type"] == "mock_distributed_fs"): + temp_folder = tmp_path / "spill" + temp_folder.mkdir() + request.param["params"]["directory_path"] = str(temp_folder) + return json.dumps(request.param), temp_folder + + @pytest.fixture( scope="function", params=[ @@ -36,10 +54,18 @@ # TODO(sang): Add a mock dependency to test S3. # smart_open_object_spilling_config, ]) -def object_spilling_config(request, tmpdir): - if request.param["type"] == "filesystem": - request.param["params"]["directory_path"] = str(tmpdir) - yield json.dumps(request.param) +def object_spilling_config(request, tmp_path): + yield create_object_spilling_config(request, tmp_path) + + +@pytest.fixture( + scope="function", + params=[ + file_system_object_spilling_config, + mock_distributed_fs_object_spilling_config + ]) +def multi_node_object_spilling_config(request, tmp_path): + yield create_object_spilling_config(request, tmp_path) def test_invalid_config_raises_exception(shutdown_only): @@ -75,22 +101,17 @@ def test_url_generation_and_parse(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spilling_not_done_for_pinned_object(tmp_path, shutdown_only): +def test_spilling_not_done_for_pinned_object(object_spilling_config, + shutdown_only): # Limit our object store to 75 MiB of memory. 
- temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = object_spilling_config ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, "min_spilling_size": 0, }) arr = np.random.rand(5 * 1024 * 1024) # 40 MB @@ -110,27 +131,23 @@ def is_dir_empty(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -@pytest.mark.parametrize( - "ray_start_cluster_head", [{ - "num_cpus": 0, - "object_store_memory": 75 * 1024 * 1024, - "_system_config": { +def test_spill_remote_object(ray_start_cluster, + multi_node_object_spilling_config): + cluster = ray_start_cluster + object_spilling_config, _ = multi_node_object_spilling_config + cluster.add_node( + num_cpus=0, + object_store_memory=75 * 1024 * 1024, + _system_config={ "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, "max_io_workers": 4, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": "/tmp" - } - }), + "object_spilling_config": object_spilling_config, "min_spilling_size": 0, - }, - }], - indirect=True) -def test_spill_remote_object(ray_start_cluster_head): - cluster = ray_start_cluster_head + }) + ray.init(address=cluster.address) cluster.add_node(object_store_memory=75 * 1024 * 1024) + cluster.wait_for_nodes() @ray.remote def put(): @@ -162,6 +179,7 @@ def depends(arg): platform.system() == "Windows", reason="Failing on Windows.") def test_spill_objects_automatically(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
+ object_spilling_config, _ = object_spilling_config ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, @@ -197,10 +215,9 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_spill_stats(tmp_path, shutdown_only): +def test_spill_stats(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. - temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, _ = object_spilling_config ray.init( num_cpus=1, object_store_memory=100 * 1024 * 1024, @@ -208,14 +225,7 @@ def test_spill_stats(tmp_path, shutdown_only): "automatic_object_spilling_enabled": True, "max_io_workers": 100, "min_spilling_size": 1, - "object_spilling_config": json.dumps( - { - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }, - separators=(",", ":")) + "object_spilling_config": object_spilling_config }, ) @@ -242,6 +252,7 @@ def f(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_during_get(object_spilling_config, shutdown_only): + object_spilling_config, _ = object_spilling_config ray.init( num_cpus=4, object_store_memory=100 * 1024 * 1024, @@ -273,6 +284,7 @@ def f(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") def test_spill_deadlock(object_spilling_config, shutdown_only): + object_spilling_config, _ = object_spilling_config # Limit our object store to 75 MiB of memory. ray.init( object_store_memory=75 * 1024 * 1024, @@ -302,10 +314,9 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_delete_objects(tmp_path, shutdown_only): +def test_delete_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
- temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = object_spilling_config ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -313,12 +324,7 @@ def test_delete_objects(tmp_path, shutdown_only): "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -343,13 +349,11 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on " - "Windows and Mac.") -def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): + platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") +def test_delete_objects_delete_while_creating(object_spilling_config, + shutdown_only): # Limit our object store to 75 MiB of memory. 
- temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = object_spilling_config ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -357,12 +361,7 @@ def test_delete_objects_delete_while_creating(tmp_path, shutdown_only): "min_spilling_size": 0, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, }) arr = np.random.rand(1024 * 1024) # 8 MB data replay_buffer = [] @@ -395,25 +394,18 @@ def is_dir_empty(): @pytest.mark.skipif( - platform.system() in ["Windows", "Darwin"], - reason="Failing on Windows " - "and Mac.") -def test_delete_objects_on_worker_failure(tmp_path, shutdown_only): + platform.system() in ["Windows", "Darwin"], reason="Failing on Windows.") +def test_delete_objects_on_worker_failure(object_spilling_config, + shutdown_only): # Limit our object store to 75 MiB of memory. - temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = object_spilling_config ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ "max_io_workers": 4, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, "min_spilling_size": 0, }) @@ -469,10 +461,10 @@ def is_dir_empty(): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_delete_objects_multi_node(tmp_path, ray_start_cluster): +def test_delete_objects_multi_node(multi_node_object_spilling_config, + ray_start_cluster): # Limit our object store to 75 MiB of memory. 
- temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = multi_node_object_spilling_config cluster = ray_start_cluster # Head node. cluster.add_node( @@ -483,12 +475,7 @@ def test_delete_objects_multi_node(tmp_path, ray_start_cluster): "min_spilling_size": 20 * 1024 * 1024, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, }) # Add 2 worker nodes. for _ in range(2): @@ -546,10 +533,9 @@ def is_dir_empty(): @pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") -def test_fusion_objects(tmp_path, shutdown_only): +def test_fusion_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. - temp_folder = tmp_path / "spill" - temp_folder.mkdir() + object_spilling_config, temp_folder = object_spilling_config min_spilling_size = 10 * 1024 * 1024 ray.init( object_store_memory=75 * 1024 * 1024, @@ -557,12 +543,7 @@ def test_fusion_objects(tmp_path, shutdown_only): "max_io_workers": 3, "automatic_object_spilling_enabled": True, "object_store_full_delay_ms": 100, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, "min_spilling_size": min_spilling_size, }) replay_buffer = [] @@ -600,8 +581,8 @@ def test_fusion_objects(tmp_path, shutdown_only): # https://github.com/ray-project/ray/issues/12912 -def do_test_release_resource(tmp_path, expect_released): - temp_folder = tmp_path / "spill" +def do_test_release_resource(object_spilling_config, expect_released): + object_spilling_config, temp_folder = object_spilling_config ray.init( num_cpus=1, object_store_memory=75 * 1024 * 1024, @@ -609,12 +590,7 @@ def do_test_release_resource(tmp_path, expect_released): 
"max_io_workers": 1, "release_resources_during_plasma_fetch": expect_released, "automatic_object_spilling_enabled": True, - "object_spilling_config": json.dumps({ - "type": "filesystem", - "params": { - "directory_path": str(temp_folder) - } - }), + "object_spilling_config": object_spilling_config, }) plasma_obj = ray.put(np.ones(50 * 1024 * 1024, dtype=np.uint8)) for _ in range(5): @@ -643,14 +619,14 @@ def f(dep): @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_no_release_during_plasma_fetch(tmp_path, shutdown_only): - do_test_release_resource(tmp_path, expect_released=False) +def test_no_release_during_plasma_fetch(object_spilling_config, shutdown_only): + do_test_release_resource(object_spilling_config, expect_released=False) @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") -def test_release_during_plasma_fetch(tmp_path, shutdown_only): - do_test_release_resource(tmp_path, expect_released=True) +def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): + do_test_release_resource(object_spilling_config, expect_released=True) @pytest.mark.skip( @@ -661,6 +637,7 @@ def test_release_during_plasma_fetch(tmp_path, shutdown_only): @pytest.mark.timeout(30) def test_spill_objects_on_object_transfer(object_spilling_config, ray_start_cluster): + object_spilling_config, _ = object_spilling_config # This test checks that objects get spilled to make room for transferred # objects. cluster = ray_start_cluster diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index cfbc62517d5e..d06a1c358196 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -361,6 +361,10 @@ RAY_CONFIG(bool, automatic_object_deletion_enabled, true) /// Grace period until we throw the OOM error to the application in seconds. RAY_CONFIG(int64_t, oom_grace_period_s, 10) +/// Whether or not the external storage is file system. 
+/// This is configured based on object_spilling_config. +RAY_CONFIG(bool, is_external_storage_type_fs, true) + /* Configuration parameters for locality-aware scheduling. */ /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task /// dependency locality when choosing a worker for leasing. diff --git a/src/ray/gcs/accessor.h b/src/ray/gcs/accessor.h index ab0704bcadd7..3bc7002021b3 100644 --- a/src/ray/gcs/accessor.h +++ b/src/ray/gcs/accessor.h @@ -303,10 +303,12 @@ class ObjectInfoAccessor { /// /// \param object_id The ID of object which location will be added to GCS. /// \param spilled_url The URL where the object has been spilled. + /// \param spilled_node_id The NodeID where the object has been spilled. /// \param callback Callback that will be called after object has been added to GCS. /// \return Status virtual Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, + const NodeID &spilled_node_id, const StatusCallback &callback) = 0; /// Remove location of object from GCS asynchronously. 
diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index dfa192320976..821e0f7d930a 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -1102,13 +1102,14 @@ Status ServiceBasedObjectInfoAccessor::AsyncAddLocation(const ObjectID &object_i Status ServiceBasedObjectInfoAccessor::AsyncAddSpilledUrl( const ObjectID &object_id, const std::string &spilled_url, - const StatusCallback &callback) { + const NodeID &spilled_node_id, const StatusCallback &callback) { RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id << ", spilled_url = " << spilled_url << ", job id = " << object_id.TaskId().JobId(); rpc::AddObjectLocationRequest request; request.set_object_id(object_id.Binary()); request.set_spilled_url(spilled_url); + request.set_spilled_node_id(spilled_node_id.Binary()); auto operation = [this, request, callback](const SequencerDoneCallback &done_callback) { client_impl_->GetGcsRpcClient().AddObjectLocation( @@ -1179,6 +1180,7 @@ Status ServiceBasedObjectInfoAccessor::AsyncSubscribeToLocations( if (!result->spilled_url().empty()) { rpc::ObjectLocationChange update; update.set_spilled_url(result->spilled_url()); + update.set_spilled_node_id(result->spilled_node_id()); update.set_size(result->size()); notification.push_back(update); } diff --git a/src/ray/gcs/gcs_client/service_based_accessor.h b/src/ray/gcs/gcs_client/service_based_accessor.h index 2d362976dd22..149fa6d2e8d4 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.h +++ b/src/ray/gcs/gcs_client/service_based_accessor.h @@ -326,6 +326,7 @@ class ServiceBasedObjectInfoAccessor : public ObjectInfoAccessor { size_t object_size, const StatusCallback &callback) override; Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, + const NodeID &node_id, const StatusCallback &callback) override; Status AsyncRemoveLocation(const 
ObjectID &object_id, const NodeID &node_id, diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.cc b/src/ray/gcs/gcs_server/gcs_object_manager.cc index 73971ed7f18f..818904d65b61 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_object_manager.cc @@ -66,6 +66,7 @@ void GcsObjectManager::HandleAddObjectLocation( NodeID node_id; std::string spilled_url; + NodeID spilled_node_id; if (!request.node_id().empty()) { node_id = NodeID::FromBinary(request.node_id()); RAY_LOG(DEBUG) << "Adding object location, job id = " << object_id.TaskId().JobId() @@ -75,12 +76,14 @@ void GcsObjectManager::HandleAddObjectLocation( absl::MutexLock lock(&mutex_); RAY_CHECK(!request.spilled_url().empty()); spilled_url = request.spilled_url(); + spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); object_to_locations_[object_id].spilled_url = spilled_url; + object_to_locations_[object_id].spilled_node_id = spilled_node_id; RAY_LOG(DEBUG) << "Adding object spilled location, object id = " << object_id; } size_t size = request.size(); - auto on_done = [this, object_id, node_id, spilled_url, size, reply, + auto on_done = [this, object_id, node_id, spilled_url, size, spilled_node_id, reply, send_reply_callback](const Status &status) { if (status.ok()) { rpc::ObjectLocationChange notification; @@ -90,6 +93,7 @@ void GcsObjectManager::HandleAddObjectLocation( } if (!spilled_url.empty()) { notification.set_spilled_url(spilled_url); + notification.set_spilled_node_id(spilled_node_id.Binary()); } notification.set_size(size); RAY_CHECK_OK(gcs_pub_sub_->Publish(OBJECT_CHANNEL, object_id.Hex(), @@ -97,7 +101,8 @@ void GcsObjectManager::HandleAddObjectLocation( RAY_LOG(DEBUG) << "Finished adding object location, job id = " << object_id.TaskId().JobId() << ", object id = " << object_id << ", node id = " << node_id << ", task id = " << object_id.TaskId() - << ", spilled_url = " << spilled_url; + << ", spilled_url = " << spilled_url + << ", 
spilled_node_id = " << spilled_node_id; } else { RAY_LOG(ERROR) << "Failed to add object location: " << status.ToString() << ", job id = " << object_id.TaskId().JobId() @@ -291,6 +296,7 @@ const ObjectLocationInfo GcsObjectManager::GenObjectLocationInfo( object_data.add_locations()->set_manager(node_id.Binary()); } object_data.set_spilled_url(it->second.spilled_url); + object_data.set_spilled_node_id(it->second.spilled_node_id.Binary()); object_data.set_size(it->second.object_size); } return object_data; diff --git a/src/ray/gcs/gcs_server/gcs_object_manager.h b/src/ray/gcs/gcs_server/gcs_object_manager.h index 2afff0816850..6d4d39598cb6 100644 --- a/src/ray/gcs/gcs_server/gcs_object_manager.h +++ b/src/ray/gcs/gcs_server/gcs_object_manager.h @@ -65,6 +65,7 @@ class GcsObjectManager : public rpc::ObjectInfoHandler { struct LocationSet { absl::flat_hash_set locations; std::string spilled_url = ""; + NodeID spilled_node_id = NodeID::Nil(); size_t object_size = 0; }; diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index 8bd36941745f..c76849108990 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -193,7 +193,7 @@ class GcsPlacementGroupManager : public rpc::PlacementGroupInfoHandler { void OnPlacementGroupCreationSuccess( const std::shared_ptr &placement_group); - /// TODO-SANG Fill it up. + /// Remove the placement group of a given id. 
void RemovePlacementGroup(const PlacementGroupID &placement_group_id, StatusCallback on_placement_group_removed); diff --git a/src/ray/object_manager/common.h b/src/ray/object_manager/common.h index 9c71e2c2b5e8..3cda75266ad0 100644 --- a/src/ray/object_manager/common.h +++ b/src/ray/object_manager/common.h @@ -17,7 +17,8 @@ using SpillObjectsCallback = std::function; using SpaceReleasedCallback = std::function; /// A callback to call when a spilled object needs to be returned to the object store. -using RestoreSpilledObjectCallback = std::function)>; +using RestoreSpilledObjectCallback = + std::function)>; } // namespace ray diff --git a/src/ray/object_manager/object_buffer_pool.cc b/src/ray/object_manager/object_buffer_pool.cc index 4b6a44e6b5fd..726a6fefca35 100644 --- a/src/ray/object_manager/object_buffer_pool.cc +++ b/src/ray/object_manager/object_buffer_pool.cc @@ -59,7 +59,10 @@ std::pair ObjectBufferPool::Ge plasma::ObjectBuffer object_buffer; RAY_CHECK_OK(store_client_.Get(&object_id, 1, 0, &object_buffer)); if (object_buffer.data == nullptr) { - RAY_LOG(ERROR) << "Failed to get object"; + RAY_LOG(INFO) + << "Failed to get a chunk of the object: " << object_id + << ". It is mostly because the object is already evicted or spilled when the " + "pull request is received. 
The caller will retry the pull request again."; return std::pair( errored_chunk_, ray::Status::IOError("Unable to obtain object chunk, object not local.")); diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index ccfda7f5a37c..27e6f42b0bd6 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -32,7 +32,7 @@ using ray::rpc::ObjectTableData; bool UpdateObjectLocations(const std::vector &location_updates, std::shared_ptr gcs_client, std::unordered_set *node_ids, std::string *spilled_url, - size_t *object_size) { + NodeID *spilled_node_id, size_t *object_size) { // location_updates contains the updates of locations of the object. // with GcsChangeMode, we can determine whether the update mode is // addition or deletion. @@ -57,9 +57,12 @@ bool UpdateObjectLocations(const std::vector &locatio } } else { RAY_CHECK(!update.spilled_url().empty()); - RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url(); + const auto received_spilled_node_id = NodeID::FromBinary(update.spilled_node_id()); + RAY_LOG(DEBUG) << "Received object spilled at " << update.spilled_url() + << " spilled at " << NodeID::FromBinary(update.spilled_node_id()); if (update.spilled_url() != *spilled_url) { *spilled_url = update.spilled_url(); + *spilled_node_id = received_spilled_node_id; isUpdated = true; } } @@ -128,14 +131,17 @@ void ObjectDirectory::HandleNodeRemoved(const NodeID &node_id) { // If the subscribed object has the removed node as a location, update // its locations with an empty update so that the location will be removed. UpdateObjectLocations({}, gcs_client_, &listener.second.current_object_locations, - &listener.second.spilled_url, &listener.second.object_size); + &listener.second.spilled_url, + &listener.second.spilled_node_id, + &listener.second.object_size); // Re-call all the subscribed callbacks for the object, since its // locations have changed. 
for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.spilled_url, listener.second.object_size); + listener.second.spilled_url, listener.second.spilled_node_id, + listener.second.object_size); } } } @@ -162,11 +168,11 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // Once this flag is set to true, it should never go back to false. it->second.subscribed = true; - // Update entries for this object. if (!UpdateObjectLocations(object_notifications, gcs_client_, &it->second.current_object_locations, - &it->second.spilled_url, &it->second.object_size)) { + &it->second.spilled_url, &it->second.spilled_node_id, + &it->second.object_size)) { return; } // Copy the callbacks so that the callbacks can unsubscribe without interrupting @@ -180,7 +186,8 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
callback_pair.second(object_id, it->second.current_object_locations, - it->second.spilled_url, it->second.object_size); + it->second.spilled_url, it->second.spilled_node_id, + it->second.object_size); } }; status = gcs_client_->Objects().AsyncSubscribeToLocations( @@ -198,10 +205,12 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; auto &spilled_url = listener_state.spilled_url; + auto &spilled_node_id = listener_state.spilled_node_id; auto object_size = it->second.object_size; - io_service_.post([callback, locations, spilled_url, object_size, object_id]() { - callback(object_id, locations, spilled_url, object_size); - }); + io_service_.post( + [callback, locations, spilled_url, object_size, object_id, spilled_node_id]() { + callback(object_id, locations, spilled_url, spilled_node_id, object_size); + }); } return status; } @@ -233,10 +242,12 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, // cached locations. 
auto &locations = it->second.current_object_locations; auto &spilled_url = it->second.spilled_url; + auto &spilled_node_id = it->second.spilled_node_id; auto object_size = it->second.object_size; - io_service_.post([callback, object_id, spilled_url, locations, object_size]() { - callback(object_id, locations, spilled_url, object_size); - }); + io_service_.post( + [callback, object_id, spilled_url, locations, object_size, spilled_node_id]() { + callback(object_id, locations, spilled_url, spilled_node_id, object_size); + }); } else { // We do not have any locations cached due to a concurrent // SubscribeObjectLocations call, so look up the object's locations @@ -258,17 +269,19 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, if (!update->spilled_url().empty()) { rpc::ObjectLocationChange change; change.set_spilled_url(update->spilled_url()); + change.set_spilled_node_id(update->spilled_node_id()); notification.push_back(change); } std::unordered_set node_ids; std::string spilled_url; + NodeID spilled_node_id; size_t object_size = 0; UpdateObjectLocations(notification, gcs_client_, &node_ids, &spilled_url, - &object_size); + &spilled_node_id, &object_size); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, node_ids, spilled_url, object_size); + callback(object_id, node_ids, spilled_url, spilled_node_id, object_size); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 8f06888aee23..0a4c6300a81a 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -41,9 +41,9 @@ struct RemoteConnectionInfo { }; /// Callback for object location notifications. 
-using OnLocationsFound = std::function &, - const std::string &, size_t object_size)>; +using OnLocationsFound = std::function &, + const std::string &, const NodeID &, size_t object_size)>; class ObjectDirectoryInterface { public: @@ -185,6 +185,9 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_set current_object_locations; /// The location where this object has been spilled, if any. std::string spilled_url = ""; + // The node id that spills the object to the disk. + // It will be Nil if it uses a distributed external storage. + NodeID spilled_node_id = NodeID::Nil(); /// The size of the object. size_t object_size = 0; /// This flag will get set to true if received any notification of the object. diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index 467ea25675e9..ddd71c7665ab 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -220,8 +220,10 @@ uint64_t ObjectManager::Pull(const std::vector &object_ref const auto &callback = [this](const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size) { - pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, object_size); + const std::string &spilled_url, + const NodeID &spilled_node_id, size_t object_size) { + pull_manager_->OnLocationChange(object_id, client_ids, spilled_url, spilled_node_id, + object_size); }; for (const auto &ref : objects_to_locate) { @@ -513,7 +515,8 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &lookup_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url, const NodeID &spilled_node_id, + size_t object_size) { auto &wait_state = active_wait_requests_.find(wait_id)->second; // Note that the object is 
guaranteed to be added to local_objects_ before // the notification is triggered. @@ -554,7 +557,8 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { wait_id, object_id, wait_state.owner_addresses[object_id], [this, wait_id](const ObjectID &subscribe_object_id, const std::unordered_set &node_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url, const NodeID &spilled_node_id, + size_t object_size) { auto object_id_wait_state = active_wait_requests_.find(wait_id); if (object_id_wait_state == active_wait_requests_.end()) { // Depending on the timing of calls to the object directory, we diff --git a/src/ray/object_manager/object_manager.h b/src/ray/object_manager/object_manager.h index a114f16bc446..00073012213a 100644 --- a/src/ray/object_manager/object_manager.h +++ b/src/ray/object_manager/object_manager.h @@ -106,8 +106,9 @@ class ObjectManagerInterface { class ObjectManager : public ObjectManagerInterface, public rpc::ObjectManagerServiceHandler { public: - using RestoreSpilledObjectCallback = std::function)>; + using RestoreSpilledObjectCallback = + std::function)>; /// Implementation of object manager service diff --git a/src/ray/object_manager/ownership_based_object_directory.cc b/src/ray/object_manager/ownership_based_object_directory.cc index efc37b3e8d8c..a17d3dfc66c0 100644 --- a/src/ray/object_manager/ownership_based_object_directory.cc +++ b/src/ray/object_manager/ownership_based_object_directory.cc @@ -146,7 +146,7 @@ void OwnershipBasedObjectDirectory::SubscriptionCallback( // It is safe to call the callback directly since this is already running // in the subscription callback stack. callback_pair.second(object_id, it->second.current_object_locations, "", - it->second.object_size); + NodeID::Nil(), it->second.object_size); } } @@ -213,7 +213,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( RAY_LOG(WARNING) << "Object " << object_id << " does not have owner. 
" << "LookupLocations returns an empty list of locations."; io_service_.post([callback, object_id]() { - callback(object_id, std::unordered_set(), "", 0); + callback(object_id, std::unordered_set(), "", NodeID::Nil(), 0); }); return Status::OK(); } @@ -234,7 +234,7 @@ ray::Status OwnershipBasedObjectDirectory::LookupLocations( node_ids.emplace(NodeID::FromBinary(node_id)); } FilterRemovedNodes(gcs_client_, &node_ids); - callback(object_id, node_ids, "", reply.object_size()); + callback(object_id, node_ids, "", NodeID::Nil(), reply.object_size()); }); return Status::OK(); } diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 1ebf9214a707..302f2f4354ef 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -259,7 +259,8 @@ std::vector PullManager::CancelPull(uint64_t request_id) { void PullManager::OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url, + const NodeID &spilled_node_id, size_t object_size) { // Exit if the Pull request has already been fulfilled or canceled. auto it = object_pull_requests_.find(object_id); if (it == object_pull_requests_.end()) { @@ -271,7 +272,7 @@ void PullManager::OnLocationChange(const ObjectID &object_id, // before. it->second.client_locations = std::vector(client_ids.begin(), client_ids.end()); it->second.spilled_url = spilled_url; - + it->second.spilled_node_id = spilled_node_id; if (!it->second.object_size_set) { RAY_LOG(DEBUG) << "Updated size of object " << object_id << " to " << object_size << ", num bytes being pulled is now " << num_bytes_being_pulled_; @@ -299,30 +300,47 @@ void PullManager::TryToMakeObjectLocal(const ObjectID &object_id) { return; } + // We always pull objects from a remote node before + // restoring it because of two reasons. + // 1. 
This will help reducing the load of external storages + // or remote node that spilled the object. + // 2. Also, if we use multi-node file spilling, the restoration will be + // confirmed by a object location subscription, so we should pull first + // before requesting for object restoration. + bool did_pull = PullFromRandomLocation(object_id); + if (did_pull) { + // New object locations were found, so begin trying to pull from a + // client. + UpdateRetryTimer(request); + return; + } + + // If we cannot pull, it means all objects have been evicted, so try restoring objects + // from the external storage. If the object was spilled on the current node, the + // callback will restore the object from the local the disk. + // Otherwise, it will send a request to a remote node that spilled the object. + // If external storage is a distributed storage, we always try restoring from it without + // sending RPCs. if (!request.spilled_url.empty()) { - // Try to restore the spilled object. + const auto spilled_node_id = request.spilled_node_id; restore_spilled_object_( - object_id, request.spilled_url, [this, object_id](const ray::Status &status) { - bool did_pull = true; - // Fall back to fetching from another object manager. + object_id, request.spilled_url, spilled_node_id, + [this, object_id, spilled_node_id](const ray::Status &status) { if (!status.ok()) { - did_pull = PullFromRandomLocation(object_id); - } - if (!did_pull) { - RAY_LOG(WARNING) << "Object restoration failed and the object could not be " - "found on any other nodes. Object id: " - << object_id; + const auto node_id_with_issue = + spilled_node_id.IsNil() ? self_node_id_ : spilled_node_id; + RAY_LOG(WARNING) + << "Object restoration failed and the object could " + "not be " + "found on any other nodes. This can happen if the location where the " + "object was spilled is unreachable. This job may hang if the object " + "is permanently unreachable. 
" + "Please check the log of node of id: " + << node_id_with_issue << " Object id: " << object_id; } }); - UpdateRetryTimer(request); - } else { - // New object locations were found, so begin trying to pull from a - // client. This will be called every time a new client location - // appears. - bool did_pull = PullFromRandomLocation(object_id); - if (did_pull) { - UpdateRetryTimer(request); - } + // We shouldn't update the timer here because restoration takes some time, and since + // we retry pull requests with exponential backoff, the delay could be large. } } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index e4a662eb6306..26eba1a35264 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -72,9 +72,12 @@ class PullManager { /// necessarily a super or subset of the previously available nodes. /// \param spilled_url The location of the object if it was spilled. If /// non-empty, the object may no longer be on any node. + /// \param spilled_node_id The node id of the object if it was spilled. If Nil, the + /// object may no longer be on any node. void OnLocationChange(const ObjectID &object_id, const std::unordered_set &client_ids, - const std::string &spilled_url, size_t object_size); + const std::string &spilled_url, const NodeID &spilled_node_id, + size_t object_size); /// Cancel an existing pull request. 
/// @@ -108,6 +111,7 @@ class PullManager { bundle_request_ids() {} std::vector client_locations; std::string spilled_url; + NodeID spilled_node_id; double next_pull_time; uint8_t num_retries; bool object_size_set = false; diff --git a/src/ray/object_manager/test/pull_manager_test.cc b/src/ray/object_manager/test/pull_manager_test.cc index 345cc6ceadfe..ecdaa06198fb 100644 --- a/src/ray/object_manager/test/pull_manager_test.cc +++ b/src/ray/object_manager/test/pull_manager_test.cc @@ -24,7 +24,7 @@ class PullManagerTestWithCapacity { [this](const ObjectID &object_id, const NodeID &node_id) { num_send_pull_request_calls_++; }, - [this](const ObjectID &, const std::string &, + [this](const ObjectID &, const std::string &, const NodeID &, std::function callback) { num_restore_spilled_object_calls_++; restore_object_callback_ = callback; @@ -94,7 +94,7 @@ TEST_F(PullManagerTest, TestStaleSubscription) { ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); std::unordered_set client_ids; - pull_manager_.OnLocationChange(oid, client_ids, "", 0); + pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); AssertNumActiveRequestsEquals(1); // There are no client ids to pull from. @@ -109,7 +109,7 @@ TEST_F(PullManagerTest, TestStaleSubscription) { AssertNumActiveRequestsEquals(0); client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(oid, client_ids, "", 0); + pull_manager_.OnLocationChange(oid, client_ids, "", NodeID::Nil(), 0); // Now we're getting a notification about an object that was already cancelled. 
ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -128,26 +128,38 @@ TEST_F(PullManagerTest, TestRestoreSpilledObject) { ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); - AssertNumActiveRequestsEquals(1); + pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); - client_ids.insert(NodeID::FromRandom()); + NodeID node_that_object_spilled = NodeID::FromRandom(); fake_time_ += 10.; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + node_that_object_spilled, 0); // The behavior is supposed to be to always restore the spilled object if possible (even // if it exists elsewhere in the cluster). ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 2); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); + + // The restore object call will ask the remote node to restore the object, and the + // client location is updated accordingly. + client_ids.insert(node_that_object_spilled); + fake_time_ += 10.; + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + node_that_object_spilled, 0); + + // Now the pull requests are sent. + ASSERT_EQ(num_send_pull_request_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 1); // Don't restore an object if it's local. 
object_is_local_ = true; num_restore_spilled_object_calls_ = 0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + NodeID::FromRandom(), 0); ASSERT_EQ(num_restore_spilled_object_calls_, 0); auto objects_to_cancel = pull_manager_.CancelPull(req_id); @@ -164,51 +176,78 @@ TEST_F(PullManagerTest, TestRestoreObjectFailed) { std::vector objects_to_locate; auto req_id = pull_manager_.Pull(refs, &objects_to_locate); ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); - std::unordered_set client_ids; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); AssertNumActiveRequestsEquals(1); // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); + ASSERT_EQ(num_restore_spilled_object_calls_, 0); - restore_object_callback_(ray::Status::IOError(":(")); + // Object is now spilled to a remote node, but the client_ids are still empty. + const NodeID remote_node_object_spilled = NodeID::FromRandom(); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + remote_node_object_spilled, 0); - // client_ids is empty here, so there's nowhere to pull from. ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 1); - client_ids.insert(NodeID::FromRandom()); - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); - - // We always assume the restore succeeded so there's only 1 restore call still. - ASSERT_EQ(num_send_pull_request_calls_, 0); - ASSERT_EQ(num_restore_spilled_object_calls_, 1); + restore_object_callback_(ray::Status::IOError(":(")); + // Now the restore request has failed, the remote object shouldn't have been properly + // restored. 
fake_time_ += 10.0; - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + remote_node_object_spilled, 0); ASSERT_EQ(num_send_pull_request_calls_, 0); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - restore_object_callback_(ray::Status::IOError(":(")); - - // Since restore failed, we can fallback to pulling from another node immediately. - ASSERT_EQ(num_send_pull_request_calls_, 1); - ASSERT_EQ(num_restore_spilled_object_calls_, 2); - - pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", 0); + restore_object_callback_(ray::Status::OK()); + // Now the remote restoration request succeeds, so we sholud be able to pull the object. + client_ids.insert(remote_node_object_spilled); + // Since it is the second retry, the interval gets doubled. + fake_time_ += 20.0; + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + remote_node_object_spilled, 0); // Now that we've successfully sent a pull request, we need to wait for the retry period // before sending another one. ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 2); - pull_manager_.CancelPull(req_id); + auto objects_to_cancel = pull_manager_.CancelPull(req_id); AssertNoLeaks(); } +TEST_F(PullManagerTest, TestLoadBalancingRestorationRequest) { + /* Make sure when the object copy is in other raylet, we pull object from there instead + * of requesting the owner node to restore the object. 
*/ + + auto refs = CreateObjectRefs(1); + auto obj1 = ObjectRefsToIds(refs)[0]; + rpc::Address addr1; + ASSERT_EQ(pull_manager_.NumActiveRequests(), 0); + std::vector objects_to_locate; + pull_manager_.Pull(refs, &objects_to_locate); + ASSERT_EQ(ObjectRefsToIds(objects_to_locate), ObjectRefsToIds(refs)); + ASSERT_EQ(pull_manager_.NumActiveRequests(), 1); + + std::unordered_set client_ids; + const auto copy_node1 = NodeID::FromRandom(); + const auto copy_node2 = NodeID::FromRandom(); + const auto remote_node_that_spilled_object = NodeID::FromRandom(); + client_ids.insert(copy_node1); + client_ids.insert(copy_node2); + pull_manager_.OnLocationChange(obj1, client_ids, "remote_url/foo/bar", + remote_node_that_spilled_object, 0); + + ASSERT_EQ(num_send_pull_request_calls_, 1); + // Make sure the restore request wasn't sent since there are nodes that have a copied + // object. + ASSERT_EQ(num_restore_spilled_object_calls_, 0); +} + TEST_F(PullManagerTest, TestManyUpdates) { auto refs = CreateObjectRefs(1); auto obj1 = ObjectRefsToIds(refs)[0]; @@ -222,7 +261,7 @@ TEST_F(PullManagerTest, TestManyUpdates) { client_ids.insert(NodeID::FromRandom()); for (int i = 0; i < 100; i++) { - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); AssertNumActiveRequestsEquals(1); } @@ -250,7 +289,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // We need to call OnLocationChange at least once, to population the list of nodes with // the object. - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); AssertNumActiveRequestsEquals(1); ASSERT_EQ(num_send_pull_request_calls_, 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -261,7 +300,7 @@ TEST_F(PullManagerTest, TestRetryTimer) { // Location changes can trigger reset timer. for (; fake_time_ <= 120 * 10; fake_time_ += 1.) 
{ - pull_manager_.OnLocationChange(obj1, client_ids, "", 0); + pull_manager_.OnLocationChange(obj1, client_ids, "", NodeID::Nil(), 0); } // We should make a pull request every tick (even if it's a duplicate to a node we're @@ -294,7 +333,7 @@ TEST_F(PullManagerTest, TestBasic) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -305,7 +344,7 @@ TEST_F(PullManagerTest, TestBasic) { num_send_pull_request_calls_ = 0; fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -318,7 +357,7 @@ TEST_F(PullManagerTest, TestBasic) { num_send_pull_request_calls_ = 0; fake_time_ += 10; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -340,7 +379,7 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -354,7 +393,8 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { fake_time_ += 10; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + 
pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); ASSERT_EQ(num_send_pull_request_calls_, i + 1); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -368,7 +408,7 @@ TEST_F(PullManagerTest, TestDeduplicateBundles) { object_is_local_ = false; num_send_pull_request_calls_ = 0; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", 0); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), 0); } ASSERT_EQ(num_send_pull_request_calls_, 0); @@ -390,7 +430,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { std::unordered_set client_ids; client_ids.insert(NodeID::FromRandom()); for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); } ASSERT_EQ(num_send_pull_request_calls_, oids.size()); ASSERT_EQ(num_restore_spilled_object_calls_, 0); @@ -406,7 +446,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestBasic) { fake_time_ += 10; auto prev_pull_requests = num_send_pull_request_calls_; for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); ASSERT_EQ(num_send_pull_request_calls_, prev_pull_requests); ASSERT_EQ(num_restore_spilled_object_calls_, 0); } @@ -449,7 +489,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestQueue) { client_ids.insert(NodeID::FromRandom()); for (auto &oids : bundles) { for (size_t i = 0; i < oids.size(); i++) { - pull_manager_.OnLocationChange(oids[i], client_ids, "", object_size); + pull_manager_.OnLocationChange(oids[i], client_ids, "", NodeID::Nil(), object_size); } } @@ -500,7 +540,7 @@ TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { req_ids.push_back(req_id); } for 
(size_t i = 0; i < object_sizes.size(); i++) { - pull_manager_.OnLocationChange(oids[i], {}, "", object_sizes[i]); + pull_manager_.OnLocationChange(oids[i], {}, "", NodeID::Nil(), object_sizes[i]); } AssertNumActiveRequestsEquals(num_active_requests_expected_before); pull_manager_.CancelPull(req_ids[cancel_idx]); @@ -508,14 +548,14 @@ TEST_F(PullManagerWithAdmissionControlTest, TestCancel) { // Request is really canceled. pull_manager_.OnLocationChange(oids[cancel_idx], {NodeID::FromRandom()}, "", - object_sizes[cancel_idx]); + NodeID::Nil(), object_sizes[cancel_idx]); ASSERT_EQ(num_send_pull_request_calls_, 0); // The expected number of requests at the head of the queue are pulled. int num_active = 0; for (size_t i = 0; i < refs.size() && num_active < num_active_requests_expected_after; i++) { - pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", + pull_manager_.OnLocationChange(oids[i], {NodeID::FromRandom()}, "", NodeID::Nil(), object_sizes[i]); if (i != cancel_idx) { num_active++; diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index a332a908159e..1e59ae8123ca 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -413,8 +413,11 @@ message ObjectLocationInfo { // For objects that have been spilled to external storage, the URL from which // they can be retrieved. string spilled_url = 3; + // The node id that spills the object to the disk. + // It will be Nil if it uses a distributed external storage. + bytes spilled_node_id = 4; // The size of the object in bytes. - uint64 size = 4; + uint64 size = 5; } // A notification message about one object's locations being changed. @@ -425,8 +428,11 @@ message ObjectLocationChange { // The object has been spilled to this URL. This should be set xor the above // fields are set. string spilled_url = 3; + // The node id that spills the object to the disk. + // It will be Nil if it uses a distributed external storage. 
+ bytes spilled_node_id = 4; // The size of the object in bytes. - uint64 size = 4; + uint64 size = 5; } // A notification message about one node's resources being changed. diff --git a/src/ray/protobuf/gcs_service.proto b/src/ray/protobuf/gcs_service.proto index eda00b806b26..8922ce6f466b 100644 --- a/src/ray/protobuf/gcs_service.proto +++ b/src/ray/protobuf/gcs_service.proto @@ -272,8 +272,11 @@ message AddObjectLocationRequest { // The spilled URL that will be added to GCS Service. Either this or the node // ID should be set. string spilled_url = 3; + // The node id that spills the object to the disk. + // It will be Nil if it uses a distributed external storage. + bytes spilled_node_id = 4; // The size of the object in bytes. - uint64 size = 4; + uint64 size = 5; } message AddObjectLocationReply { diff --git a/src/ray/protobuf/node_manager.proto b/src/ray/protobuf/node_manager.proto index bae2a9715100..386ed988ade3 100644 --- a/src/ray/protobuf/node_manager.proto +++ b/src/ray/protobuf/node_manager.proto @@ -179,6 +179,18 @@ message RequestObjectSpillageReply { bool success = 1; } +message RestoreSpilledObjectRequest { + // ObjectID to restore. + bytes object_id = 1; + // Object URL where the object is spilled. + string object_url = 2; + // The node id of a node where the object is spilled. + bytes spilled_node_id = 3; +} + +message RestoreSpilledObjectReply { +} + message ReleaseUnusedBundlesRequest { repeated Bundle bundles_in_use = 1; } @@ -224,6 +236,9 @@ service NodeManagerService { // Ask the raylet to spill an object to external storage. rpc RequestObjectSpillage(RequestObjectSpillageRequest) returns (RequestObjectSpillageReply); + // Ask the raylet to restore the object from the external storage. + rpc RestoreSpilledObject(RestoreSpilledObjectRequest) + returns (RestoreSpilledObjectReply); // This method is only used by GCS, and the purpose is to release bundles // that may be leaked. 
When GCS restarts, it doesn't know which bundles it has leased // in the previous lifecycle. In this case, GCS will send a list of bundles that diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 721adb6bd3eb..9909beb76e55 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -261,11 +261,15 @@ void LocalObjectManager::AddSpilledUrls( const ObjectID &object_id = object_ids[i]; const std::string &object_url = worker_reply.spilled_objects_url(i); RAY_LOG(DEBUG) << "Object " << object_id << " spilled at " << object_url; + // Choose a node id to report. If an external storage type is not a filesystem, we + // don't need to report where this object is spilled. + const auto node_id_object_spilled = + is_external_storage_type_fs_ ? self_node_id_ : NodeID::Nil(); // Write to object directory. Wait for the write to finish before // releasing the object to make sure that the spilled object can // be retrieved by other raylets. RAY_CHECK_OK(object_info_accessor_.AsyncAddSpilledUrl( - object_id, object_url, + object_id, object_url, node_id_object_spilled, [this, object_id, object_url, callback, num_remaining](Status status) { RAY_CHECK_OK(status); // Unpin the object. @@ -298,14 +302,35 @@ void LocalObjectManager::AddSpilledUrls( } void LocalObjectManager::AsyncRestoreSpilledObject( - const ObjectID &object_id, const std::string &object_url, + const ObjectID &object_id, const std::string &object_url, const NodeID &node_id, std::function callback) { - RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " - << object_url; if (objects_pending_restore_.count(object_id) > 0) { // If the same object is restoring, we dedup here. return; } + + if (!node_id.IsNil() && node_id != self_node_id_) { + // If we know where this object was spilled, and the current node is not that one, + // send a RPC to a remote node that spilled the object to restore it. 
+ RAY_LOG(DEBUG) << "Send a object restoration request of id: " << object_id + << " to a remote node: " << node_id; + // TODO(sang): We need to deduplicate this remote RPC. Since restore request + // is retried every 10ms without exponential backoff, this can add huge overhead to a + // remote node that spilled the object. + restore_object_from_remote_node_(object_id, object_url, node_id); + if (callback) { + callback(Status::OK()); + } + return; + } + + // Restore the object. + RAY_LOG(DEBUG) << "Restoring spilled object " << object_id << " from URL " + << object_url; + if (!node_id.IsNil()) { + RAY_CHECK(spilled_objects_url_.count(object_id) > 0); + } + RAY_CHECK(objects_pending_restore_.emplace(object_id).second) << "Object dedupe wasn't done properly. Please report if you see this issue."; io_worker_pool_.PopRestoreWorker([this, object_id, object_url, callback]( diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index 14142f5f913d..c4f157d58019 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -16,6 +16,8 @@ #include +#include +#include #include #include "ray/common/id.h" @@ -24,6 +26,7 @@ #include "ray/object_manager/common.h" #include "ray/raylet/worker_pool.h" #include "ray/rpc/worker/core_worker_client_pool.h" +#include "ray/util/util.h" #include "src/ray/protobuf/node_manager.pb.h" namespace ray { @@ -35,15 +38,18 @@ namespace raylet { class LocalObjectManager { public: LocalObjectManager( - boost::asio::io_service &io_context, size_t free_objects_batch_size, + const NodeID &node_id, size_t free_objects_batch_size, int64_t free_objects_period_ms, IOWorkerPoolInterface &io_worker_pool, gcs::ObjectInfoAccessor &object_info_accessor, rpc::CoreWorkerClientPool &owner_client_pool, bool object_pinning_enabled, bool automatic_object_deletion_enabled, int max_io_workers, - int64_t min_spilling_size, + int64_t min_spilling_size, bool is_external_storage_type_fs, std::function 
&)> on_objects_freed, - std::function is_plasma_object_spillable) - : free_objects_period_ms_(free_objects_period_ms), + std::function is_plasma_object_spillable, + std::function + restore_object_from_remote_node) + : self_node_id_(node_id), + free_objects_period_ms_(free_objects_period_ms), free_objects_batch_size_(free_objects_batch_size), io_worker_pool_(io_worker_pool), object_info_accessor_(object_info_accessor), @@ -55,7 +61,9 @@ class LocalObjectManager { min_spilling_size_(min_spilling_size), num_active_workers_(0), max_active_workers_(max_io_workers), - is_plasma_object_spillable_(is_plasma_object_spillable) {} + is_plasma_object_spillable_(is_plasma_object_spillable), + restore_object_from_remote_node_(restore_object_from_remote_node), + is_external_storage_type_fs_(is_external_storage_type_fs) {} /// Pin objects. /// @@ -90,10 +98,15 @@ class LocalObjectManager { /// Restore a spilled object from external storage back into local memory. /// /// \param object_id The ID of the object to restore. - /// \param object_url The URL in external storage from which the object can be restored. - /// \param callback A callback to call when the restoration is done. Status - /// will contain the error during restoration, if any. + /// \param object_url The URL where the object is spilled. + /// \param node_id Node id that we try restoring the object. If Nil is provided, the + /// object is restored directly from the external storage. If a node id is provided, it + /// sends a RPC request to a corresponding node if the given node_id is not equivalent + /// to a self node id. + /// \param callback A callback to call when the restoration is done. + /// Status will contain the error during restoration, if any. void AsyncRestoreSpilledObject(const ObjectID &object_id, const std::string &object_url, + const NodeID &node_id, std::function callback); /// Try to clear any objects that have been freed. 
@@ -160,6 +173,8 @@ class LocalObjectManager { /// \param urls_to_delete List of urls to delete from external storages. void DeleteSpilledObjects(std::vector &urls_to_delete); + const NodeID self_node_id_; + /// The period between attempts to eagerly evict objects from plasma. const int64_t free_objects_period_ms_; @@ -247,6 +262,16 @@ class LocalObjectManager { /// Return true if unpinned, meaning we can safely spill the object. False otherwise. std::function is_plasma_object_spillable_; + /// Callback to restore object of object id from a remote node of node id. + std::function + restore_object_from_remote_node_; + + /// Used to decide spilling protocol. + /// If it is "filesystem", it restores spilled objects only from an owner node. + /// If it is not (meaning it is distributed backend), it always restores objects + /// directly from the external storage. + bool is_external_storage_type_fs_; + /// /// Stats /// diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 1b8c50c5870e..072064f4695a 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -158,19 +158,29 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self agent_manager_service_(io_service, *agent_manager_service_handler_), client_call_manager_(io_service), worker_rpc_pool_(client_call_manager_), - local_object_manager_(io_service_, RayConfig::instance().free_objects_batch_size(), - RayConfig::instance().free_objects_period_milliseconds(), - worker_pool_, gcs_client_->Objects(), worker_rpc_pool_, - /* object_pinning_enabled */ config.object_pinning_enabled, - /* automatic_object_deletion_enabled */ - config.automatic_object_deletion_enabled, - /*max_io_workers*/ config.max_io_workers, - /*min_spilling_size*/ config.min_spilling_size, - [this](const std::vector &object_ids) { - object_manager_.FreeObjects(object_ids, - /*local_only=*/false); - }, - is_plasma_object_spillable), + local_object_manager_( + self_node_id_, 
RayConfig::instance().free_objects_batch_size(), + RayConfig::instance().free_objects_period_milliseconds(), worker_pool_, + gcs_client_->Objects(), worker_rpc_pool_, + /* object_pinning_enabled */ config.object_pinning_enabled, + /* automatic_object_deletion_enabled */ + config.automatic_object_deletion_enabled, + /*max_io_workers*/ config.max_io_workers, + /*min_spilling_size*/ config.min_spilling_size, + /*is_external_storage_type_fs*/ + RayConfig::instance().is_external_storage_type_fs(), + /*on_objects_freed*/ + [this](const std::vector &object_ids) { + object_manager_.FreeObjects(object_ids, + /*local_only=*/false); + }, + is_plasma_object_spillable, + /*restore_object_from_remote_node*/ + [this](const ObjectID &object_id, const std::string &spilled_url, + const NodeID &node_id) { + SendSpilledObjectRestorationRequestToRemoteNode(object_id, spilled_url, + node_id); + }), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), last_local_gc_ns_(absl::GetCurrentTimeNanos()), local_gc_interval_ns_(RayConfig::instance().local_gc_interval_s() * 1e9), @@ -511,6 +521,24 @@ void NodeManager::HandleRequestObjectSpillage( }); } +void NodeManager::HandleRestoreSpilledObject( + const rpc::RestoreSpilledObjectRequest &request, + rpc::RestoreSpilledObjectReply *reply, rpc::SendReplyCallback send_reply_callback) { + const auto object_id = ObjectID::FromBinary(request.object_id()); + const auto spilled_node_id = NodeID::FromBinary(request.spilled_node_id()); + const auto object_url = request.object_url(); + RAY_CHECK(spilled_node_id == self_node_id_); + RAY_LOG(DEBUG) << "Restore spilled object request received. Object id: " << object_id + << " spilled_node_id: " << self_node_id_ + << " object url: " << object_url; + local_object_manager_.AsyncRestoreSpilledObject(object_id, object_url, spilled_node_id, + nullptr); + // Just reply right away. 
The caller will keep hitting this RPC endpoint until + // restoration succeeds, so we can safely reply here without waiting for the + // restoreSpilledObject to be done. + send_reply_callback(Status::OK(), nullptr, nullptr); +} + void NodeManager::HandleReleaseUnusedBundles( const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -2714,6 +2742,30 @@ void NodeManager::PublishInfeasibleTaskError(const Task &task) const { } } +void NodeManager::SendSpilledObjectRestorationRequestToRemoteNode( + const ObjectID &object_id, const std::string &spilled_url, const NodeID &node_id) { + // Fetch from a remote node. + if (!remote_node_manager_addresses_.contains(node_id)) { + // It is possible the new node information is not received at this point. + // In this case, the PullManager will handle retry, so we just return. + return; + } + const auto &entry = remote_node_manager_addresses_.find(node_id); + // TODO(sang): Use a node manager pool instead. + auto raylet_client = + std::make_shared(rpc::NodeManagerWorkerClient::make( + entry->second.first, entry->second.second, client_call_manager_)); + raylet_client->RestoreSpilledObject( + object_id, spilled_url, node_id, + [](const ray::Status &status, const rpc::RestoreSpilledObjectReply &r) { + if (!status.ok()) { + RAY_LOG(WARNING) << "Failed to send a spilled object restoration request to a " + "remote node. This request will be retried. 
Error message: " + << status.ToString(); + } + }); +} + } // namespace raylet } // namespace ray diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index d626e5246297..3a68fcbae992 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -28,6 +28,7 @@ #include "ray/common/task/scheduling_resources.h" #include "ray/object_manager/object_manager.h" #include "ray/raylet/agent_manager.h" +#include "ray/raylet_client/raylet_client.h" #include "ray/raylet/local_object_manager.h" #include "ray/raylet/scheduling/scheduling_ids.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" @@ -603,6 +604,11 @@ class NodeManager : public rpc::NodeManagerServiceHandler, rpc::RequestObjectSpillageReply *reply, rpc::SendReplyCallback send_reply_callback) override; + /// Handle a `RestoreSpilledObject` request. + void HandleRestoreSpilledObject(const rpc::RestoreSpilledObjectRequest &request, + rpc::RestoreSpilledObjectReply *reply, + rpc::SendReplyCallback send_reply_callback) override; + /// Handle a `ReleaseUnusedBundles` request. void HandleReleaseUnusedBundles(const rpc::ReleaseUnusedBundlesRequest &request, rpc::ReleaseUnusedBundlesReply *reply, @@ -633,6 +639,11 @@ class NodeManager : public rpc::NodeManagerServiceHandler, /// \param task Task that is infeasible void PublishInfeasibleTaskError(const Task &task) const; + /// Send an object restoration request to a remote node of a given node id.
+ void SendSpilledObjectRestorationRequestToRemoteNode(const ObjectID &object_id, + const std::string &spilled_url, + const NodeID &node_id); + std::unordered_map> MakeTasksByClass( const std::vector &tasks) const; diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index 6aeec576e1e4..4d9514e626da 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -72,10 +72,11 @@ Raylet::Raylet(boost::asio::io_service &main_service, const std::string &socket_ std::make_shared(main_service, gcs_client_))), object_manager_( main_service, self_node_id_, object_manager_config, object_directory_, - [this](const ObjectID &object_id, const std::string &spilled_url, + [this](const ObjectID &object_id, const std::string &object_url, + const NodeID &node_id, std::function callback) { node_manager_.GetLocalObjectManager().AsyncRestoreSpilledObject( - object_id, spilled_url, callback); + object_id, object_url, node_id, callback); }, [this]() { // This callback is called from the plasma store thread. diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index f4fd3d025fda..1da422529cda 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -179,7 +179,8 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { created_object_id, it->second.owner_addresses[created_object_id], [this, task_id, reconstruction_attempt]( const ray::ObjectID &object_id, const std::unordered_set &nodes, - const std::string &spilled_url, size_t object_size) { + const std::string &spilled_url, const ray::NodeID &spilled_node_id, + size_t object_size) { if (nodes.empty() && spilled_url.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. 
diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index 8b5fd9d0e75c..d4eb387a3ac0 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -58,9 +58,10 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), "", 0); + callback.second(object_id, std::unordered_set(), "", NodeID::Nil(), + 0); } else { - callback.second(object_id, it->second, "", 0); + callback.second(object_id, it->second, "", NodeID::Nil(), 0); } } callbacks_.clear(); diff --git a/src/ray/raylet/test/local_object_manager_test.cc b/src/ray/raylet/test/local_object_manager_test.cc index bbae5bb144b0..8ff77250f78f 100644 --- a/src/ray/raylet/test/local_object_manager_test.cc +++ b/src/ray/raylet/test/local_object_manager_test.cc @@ -84,12 +84,16 @@ class MockIOWorkerClient : public rpc::CoreWorkerClientInterface { restore_callbacks.push_back(callback); } - void ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { + bool ReplyRestoreObjects(int64_t bytes_restored, Status status = Status::OK()) { rpc::RestoreSpilledObjectsReply reply; reply.set_bytes_restored_total(bytes_restored); + if (restore_callbacks.size() == 0) { + return false; + }; auto callback = restore_callbacks.front(); callback(status, reply); restore_callbacks.pop_front(); + return true; } void DeleteSpilledObjects( @@ -190,6 +194,7 @@ class MockObjectInfoAccessor : public gcs::ObjectInfoAccessor { size_t object_size, const gcs::StatusCallback &callback)); Status AsyncAddSpilledUrl(const ObjectID &object_id, const std::string &spilled_url, + const NodeID &spilled_node_id, const gcs::StatusCallback &callback) { object_urls[object_id] = spilled_url; callbacks.push_back(callback); @@ -252,12 +257,15 @@ class LocalObjectManagerTest : public 
::testing::Test { LocalObjectManagerTest() : owner_client(std::make_shared()), client_pool([&](const rpc::Address &addr) { return owner_client; }), - manager(io_service_, free_objects_batch_size, + manager_node_id_(NodeID::FromRandom()), + manager(manager_node_id_, free_objects_batch_size, /*free_objects_period_ms=*/1000, worker_pool, object_table, client_pool, /*object_pinning_enabled=*/true, /*automatic_object_delete_enabled=*/true, /*max_io_workers=*/2, /*min_spilling_size=*/0, + /*is_external_storage_type_fs=*/true, + /*on_objects_freed=*/ [&](const std::vector &object_ids) { for (const auto &object_id : object_ids) { freed.insert(object_id); @@ -266,12 +274,24 @@ class LocalObjectManagerTest : public ::testing::Test { /*is_plasma_object_spillable=*/ [&](const ray::ObjectID &object_id) { return unevictable_objects_.count(object_id) == 0; + }, + /*restore_object_from_remote_node=*/ + [&](const ObjectID &object_id, const std::string spilled_url, + const NodeID &node_id) { + if (remote_node_set_restore_requested_.count(node_id) == 0) { + remote_node_set_restore_requested_.emplace( + node_id, std::unordered_set()); + } + remote_node_set_restore_requested_[node_id].emplace(object_id); }), unpins(std::make_shared>()) { RayConfig::instance().initialize({{"object_spilling_config", "mock_config"}}); } - void TearDown() { unevictable_objects_.clear(); } + void TearDown() { + unevictable_objects_.clear(); + remote_node_set_restore_requested_.clear(); + } std::string BuildURL(const std::string url, int offset = 0, int num_objects = 1) { return url + "?" 
+ "num_objects=" + std::to_string(num_objects) + @@ -284,7 +304,10 @@ class LocalObjectManagerTest : public ::testing::Test { rpc::CoreWorkerClientPool client_pool; MockIOWorkerPool worker_pool; MockObjectInfoAccessor object_table; + NodeID manager_node_id_; LocalObjectManager manager; + std::unordered_map> + remote_node_set_restore_requested_; std::unordered_set freed; // This hashmap is incremented when objects are unpinned by destroying their @@ -323,16 +346,43 @@ TEST_F(LocalObjectManagerTest, TestPin) { } TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { - ObjectID object_id = ObjectID::FromRandom(); - std::string object_url("url"); + // First, spill objects. + std::vector object_ids; + std::vector> objects; + + for (size_t i = 0; i < free_objects_batch_size; i++) { + ObjectID object_id = ObjectID::FromRandom(); + object_ids.push_back(object_id); + auto data_buffer = std::make_shared(0, object_id, unpins); + std::unique_ptr object( + new RayObject(data_buffer, nullptr, std::vector())); + objects.push_back(std::move(object)); + } + manager.PinObjects(object_ids, std::move(objects)); + + manager.SpillObjects(object_ids, + [&](const Status &status) mutable { ASSERT_TRUE(status.ok()); }); + std::vector urls; + for (size_t i = 0; i < object_ids.size(); i++) { + urls.push_back(BuildURL("url" + std::to_string(i))); + } + ASSERT_TRUE(worker_pool.io_worker_client->ReplySpillObjects(urls)); + for (size_t i = 0; i < object_ids.size(); i++) { + ASSERT_TRUE(object_table.ReplyAsyncAddSpilledUrl()); + } + + // Then try restoring objects from local. + ObjectID object_id = object_ids[0]; + const auto url = urls[0]; int num_times_fired = 0; EXPECT_CALL(worker_pool, PushRestoreWorker(_)); // Subsequent calls should be deduped, so that only one callback should be fired. 
for (int i = 0; i < 10; i++) { - manager.AsyncRestoreSpilledObject(object_id, object_url, [&](const Status &status) { - ASSERT_TRUE(status.ok()); - num_times_fired++; - }); + manager.AsyncRestoreSpilledObject(object_id, url, manager_node_id_, + [&](const Status &status) { + ASSERT_TRUE(status.ok()); + num_times_fired++; + }); } ASSERT_EQ(num_times_fired, 0); @@ -342,7 +392,25 @@ TEST_F(LocalObjectManagerTest, TestRestoreSpilledObject) { ASSERT_EQ(num_times_fired, 0); } worker_pool.io_worker_client->ReplyRestoreObjects(10); + // The restore should've been invoked. ASSERT_EQ(num_times_fired, 1); + + // If the object wasn't spilled on the current node, it should request restoration to + // remote nodes. + ObjectID remote_object_id = ObjectID::FromRandom(); + const auto remote_object_url = BuildURL("remote_url"); + NodeID remote_node_id = NodeID::FromRandom(); + manager.AsyncRestoreSpilledObject(remote_object_id, remote_object_url, remote_node_id, + [&](const Status &status) { + ASSERT_TRUE(status.ok()); + num_times_fired++; + }); + // Make sure the remote call was invoked. 
+ ASSERT_FALSE(worker_pool.io_worker_client->ReplyRestoreObjects(10)); + ASSERT_TRUE(remote_node_set_restore_requested_.count(remote_node_id) > 0); + ASSERT_TRUE(remote_node_set_restore_requested_[remote_node_id].count(remote_object_id) > + 0); + ASSERT_EQ(num_times_fired, 2); } TEST_F(LocalObjectManagerTest, TestExplicitSpill) { diff --git a/src/ray/raylet_client/raylet_client.cc b/src/ray/raylet_client/raylet_client.cc index 739832b2bb40..b3177071a144 100644 --- a/src/ray/raylet_client/raylet_client.cc +++ b/src/ray/raylet_client/raylet_client.cc @@ -311,6 +311,18 @@ void raylet::RayletClient::RequestObjectSpillage( grpc_client_->RequestObjectSpillage(request, callback); } +void raylet::RayletClient::RestoreSpilledObject( + const ObjectID &object_id, const std::string &object_url, + const NodeID &spilled_node_id, + const rpc::ClientCallback &callback) { + RAY_CHECK(!spilled_node_id.IsNil()); + rpc::RestoreSpilledObjectRequest request; + request.set_object_id(object_id.Binary()); + request.set_object_url(object_url); + request.set_spilled_node_id(spilled_node_id.Binary()); + grpc_client_->RestoreSpilledObject(request, callback); +} + Status raylet::RayletClient::ReturnWorker(int worker_port, const WorkerID &worker_id, bool disconnect_worker) { rpc::ReturnWorkerRequest request; diff --git a/src/ray/raylet_client/raylet_client.h b/src/ray/raylet_client/raylet_client.h index 185ca445ac3b..cf9cfea56d7f 100644 --- a/src/ray/raylet_client/raylet_client.h +++ b/src/ray/raylet_client/raylet_client.h @@ -332,6 +332,15 @@ class RayletClient : public RayletClientInterface { const ObjectID &object_id, const rpc::ClientCallback &callback); + /// Ask the raylet to restore the object of a given id. + /// \param object_id Object id that the remote raylet needs to restore. + /// \param object_url Object URL where the object is spilled. + /// \param spilled_node_id Node id of a node where the object is spilled. 
+ void RestoreSpilledObject( + const ObjectID &object_id, const std::string &object_url, + const NodeID &spilled_node_id, + const rpc::ClientCallback &callback); + /// Implements WorkerLeaseInterface. void RequestWorkerLease( const ray::TaskSpecification &resource_spec, diff --git a/src/ray/rpc/node_manager/node_manager_client.h b/src/ray/rpc/node_manager/node_manager_client.h index 1c9b16c18370..81182ab94ab4 100644 --- a/src/ray/rpc/node_manager/node_manager_client.h +++ b/src/ray/rpc/node_manager/node_manager_client.h @@ -100,6 +100,9 @@ class NodeManagerWorkerClient /// Ask the raylet to spill an object to external storage. VOID_RPC_CLIENT_METHOD(NodeManagerService, RequestObjectSpillage, grpc_client_, ) + /// Ask the raylet to restore an object from external storage. + VOID_RPC_CLIENT_METHOD(NodeManagerService, RestoreSpilledObject, grpc_client_, ) + /// Release unused bundles. VOID_RPC_CLIENT_METHOD(NodeManagerService, ReleaseUnusedBundles, grpc_client_, ) diff --git a/src/ray/rpc/node_manager/node_manager_server.h b/src/ray/rpc/node_manager/node_manager_server.h index 08893d49f7a7..7f769150871c 100644 --- a/src/ray/rpc/node_manager/node_manager_server.h +++ b/src/ray/rpc/node_manager/node_manager_server.h @@ -36,6 +36,7 @@ namespace rpc { RPC_SERVICE_HANDLER(NodeManagerService, CommitBundleResources) \ RPC_SERVICE_HANDLER(NodeManagerService, CancelResourceReserve) \ RPC_SERVICE_HANDLER(NodeManagerService, RequestObjectSpillage) \ + RPC_SERVICE_HANDLER(NodeManagerService, RestoreSpilledObject) \ RPC_SERVICE_HANDLER(NodeManagerService, ReleaseUnusedBundles) /// Interface of the `NodeManagerService`, see `src/ray/protobuf/node_manager.proto`. 
@@ -102,6 +103,10 @@ class NodeManagerServiceHandler { RequestObjectSpillageReply *reply, SendReplyCallback send_reply_callback) = 0; + virtual void HandleRestoreSpilledObject(const RestoreSpilledObjectRequest &request, + RestoreSpilledObjectReply *reply, + SendReplyCallback send_reply_callback) = 0; + virtual void HandleReleaseUnusedBundles(const ReleaseUnusedBundlesRequest &request, ReleaseUnusedBundlesReply *reply, SendReplyCallback send_reply_callback) = 0; From 4dabf017ee8ef5214974326c11bf893ea95e70d1 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Mon, 25 Jan 2021 02:31:53 +0200 Subject: [PATCH 031/245] Close #12031 (Autoscaler is overriding your resource for same quantity) (#13671) --- python/ray/node.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/node.py b/python/ray/node.py index 186ae3dfdbfd..086865023e54 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -292,9 +292,10 @@ def merge_resources(env_dict, params_dict): for key in set(env_dict.keys()).intersection( set(params_dict.keys())): - logger.warning("Autoscaler is overriding your resource:" - "{}: {} with {}.".format( - key, params_dict[key], env_dict[key])) + if params_dict[key] != env_dict[key]: + logger.warning("Autoscaler is overriding your resource:" + "{}: {} with {}.".format( + key, params_dict[key], env_dict[key])) return num_cpus, num_gpus, memory, object_store_memory, result if not self._resource_spec: From e9103eeb6dffb4a2275162bcc5e71619b8a66f6c Mon Sep 17 00:00:00 2001 From: Kai Yang Date: Mon, 25 Jan 2021 18:07:45 +0800 Subject: [PATCH 032/245] [Java] [Test] Move multi-worker config to ray.conf file (#13583) --- java/test.sh | 7 ++----- java/test/src/main/java/io/ray/test/FailureTest.java | 5 +---- java/test/src/main/java/io/ray/test/JobConfigTest.java | 5 +---- java/test/src/main/java/io/ray/test/KillActorTest.java | 5 +---- java/test/src/main/resources/ray.conf | 6 ++++++ 5 files changed, 11 insertions(+), 17 deletions(-) create mode 
100644 java/test/src/main/resources/ray.conf diff --git a/java/test.sh b/java/test.sh index f946fd91ad6f..49a0d68bbdc5 100755 --- a/java/test.sh +++ b/java/test.sh @@ -50,18 +50,15 @@ if ! git diff --exit-code -- java src/ray/core_worker/lib/java; then exit 1 fi -# Enable multi-worker feature in Java test -TEST_ARGS=(-Dray.job.num-java-workers-per-process=10) - echo "Running tests under cluster mode." # TODO(hchen): Ideally, we should use the following bazel command to run Java tests. However, if there're skipped tests, # TestNG will exit with code 2. And bazel treats it as test failure. # bazel test //java:all_tests --config=ci || cluster_exit_code=$? -run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +run_testng java -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running tests under single-process mode." # bazel test //java:all_tests --jvmopt="-Dray.run-mode=SINGLE_PROCESS" --config=ci || single_exit_code=$? -run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar "${TEST_ARGS[@]}" org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml +run_testng java -Dray.run-mode="SINGLE_PROCESS" -cp "$ROOT_DIR"/../bazel-bin/java/all_tests_deploy.jar org.testng.TestNG -d /tmp/ray_java_test_output "$ROOT_DIR"/testng.xml echo "Running connecting existing cluster tests." 
case "${OSTYPE}" in diff --git a/java/test/src/main/java/io/ray/test/FailureTest.java b/java/test/src/main/java/io/ray/test/FailureTest.java index 218c78271023..5bfc40dd672e 100644 --- a/java/test/src/main/java/io/ray/test/FailureTest.java +++ b/java/test/src/main/java/io/ray/test/FailureTest.java @@ -23,20 +23,17 @@ public class FailureTest extends BaseTest { private static final String EXCEPTION_MESSAGE = "Oops"; - private String oldNumWorkersPerProcess; - @BeforeClass public void setUp() { // This is needed by `testGetThrowsQuicklyWhenFoundException`. // Set one worker per process. Otherwise, if `badFunc2` and `slowFunc` run in the same // process, `sleep` will delay `System.exit`. - oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); + System.clearProperty("ray.job.num-java-workers-per-process"); } public static int badFunc() { diff --git a/java/test/src/main/java/io/ray/test/JobConfigTest.java b/java/test/src/main/java/io/ray/test/JobConfigTest.java index 4ba9e484d5a1..f5efc3377c3c 100644 --- a/java/test/src/main/java/io/ray/test/JobConfigTest.java +++ b/java/test/src/main/java/io/ray/test/JobConfigTest.java @@ -10,11 +10,8 @@ @Test(groups = {"cluster"}) public class JobConfigTest extends BaseTest { - private String oldNumWorkersPerProcess; - @BeforeClass public void setupJobConfig() { - oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "3"); System.setProperty("ray.job.jvm-options.0", "-DX=999"); System.setProperty("ray.job.jvm-options.1", "-DY=998"); @@ -24,7 +21,7 @@ public void setupJobConfig() { @AfterClass public void tearDownJobConfig() { - System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); + 
System.clearProperty("ray.job.num-java-workers-per-process"); System.clearProperty("ray.job.jvm-options.0"); System.clearProperty("ray.job.jvm-options.1"); System.clearProperty("ray.job.worker-env.foo1"); diff --git a/java/test/src/main/java/io/ray/test/KillActorTest.java b/java/test/src/main/java/io/ray/test/KillActorTest.java index d862d3e1232a..fd92b97118ef 100644 --- a/java/test/src/main/java/io/ray/test/KillActorTest.java +++ b/java/test/src/main/java/io/ray/test/KillActorTest.java @@ -14,17 +14,14 @@ @Test(groups = {"cluster"}) public class KillActorTest extends BaseTest { - private String oldNumWorkersPerProcess; - @BeforeClass public void setUp() { - oldNumWorkersPerProcess = System.getProperty("ray.job.num-java-workers-per-process"); System.setProperty("ray.job.num-java-workers-per-process", "1"); } @AfterClass public void tearDown() { - System.setProperty("ray.job.num-java-workers-per-process", oldNumWorkersPerProcess); + System.clearProperty("ray.job.num-java-workers-per-process"); } public static class HangActor { diff --git a/java/test/src/main/resources/ray.conf b/java/test/src/main/resources/ray.conf new file mode 100644 index 000000000000..b838c0075a3f --- /dev/null +++ b/java/test/src/main/resources/ray.conf @@ -0,0 +1,6 @@ +ray { + job { + # Enable multi-worker feature in Java test + num-java-workers-per-process: 10 + } +} From 9423930bcccfe8c43eae8791fdf9c5b6c546c620 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Mon, 25 Jan 2021 12:32:41 +0100 Subject: [PATCH 033/245] [RLlib] MAML: Add cartpole mass test for PyTorch. 
(#13679) --- python/requirements_rllib.txt | 3 +++ rllib/agents/maml/tests/test_maml.py | 24 +++++++++++++-------- rllib/examples/env/cartpole_mass.py | 31 ++++++++++++++++++++++++++++ rllib/examples/env/pendulum_mass.py | 9 +++++--- 4 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 rllib/examples/env/cartpole_mass.py diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 94ae9cdbb338..0cefb02969b3 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -13,3 +13,6 @@ pettingzoo>=1.4.0 # For tests on RecSim and Kaggle envs. recsim kaggle_environments + +# For MAML on PyTorch. +higher diff --git a/rllib/agents/maml/tests/test_maml.py b/rllib/agents/maml/tests/test_maml.py index e5ef3cf694b0..b84e02857190 100644 --- a/rllib/agents/maml/tests/test_maml.py +++ b/rllib/agents/maml/tests/test_maml.py @@ -23,15 +23,21 @@ def test_maml_compilation(self): num_iterations = 1 # Test for tf framework (torch not implemented yet). 
- for _ in framework_iterator(config, frameworks=("tf")): - trainer = maml.MAMLTrainer( - config=config, - env="ray.rllib.examples.env.pendulum_mass.PendulumMassEnv") - for i in range(num_iterations): - trainer.train() - check_compute_single_action( - trainer, include_prev_action_reward=True) - trainer.stop() + for fw in framework_iterator(config, frameworks=("tf", "torch")): + for env in [ + "pendulum_mass.PendulumMassEnv", + "cartpole_mass.CartPoleMassEnv" + ]: + if fw == "tf" and env.startswith("cartpole"): + continue + print("env={}".format(env)) + env_ = "ray.rllib.examples.env.{}".format(env) + trainer = maml.MAMLTrainer(config=config, env=env_) + for i in range(num_iterations): + trainer.train() + check_compute_single_action( + trainer, include_prev_action_reward=True) + trainer.stop() if __name__ == "__main__": diff --git a/rllib/examples/env/cartpole_mass.py b/rllib/examples/env/cartpole_mass.py new file mode 100644 index 000000000000..a0519cb17869 --- /dev/null +++ b/rllib/examples/env/cartpole_mass.py @@ -0,0 +1,31 @@ +import numpy as np +import gym +from gym.envs.classic_control.cartpole import CartPoleEnv +from ray.rllib.env.meta_env import MetaEnv + + +class CartPoleMassEnv(CartPoleEnv, gym.utils.EzPickle, MetaEnv): + """CartPoleMassEnv varies the weights of the cart and the pole. + """ + + def sample_tasks(self, n_tasks): + # Sample new cart- and pole masses (random floats between 0.5 and 2.0 + # (cart) and between 0.05 and 0.2 (pole)). + cart_masses = np.random.uniform(low=0.5, high=2.0, size=(n_tasks, 1)) + pole_masses = np.random.uniform(low=0.05, high=0.2, size=(n_tasks, 1)) + return np.concatenate([cart_masses, pole_masses], axis=-1) + + def set_task(self, task): + """ + Args: + task (Tuple[float]): Masses of the cart and the pole. + """ + self.masscart = task[0] + self.masspole = task[1] + + def get_task(self): + """ + Returns: + Tuple[float]: The current mass of the cart- and pole. 
+ """ + return np.array([self.masscart, self.masspole]) diff --git a/rllib/examples/env/pendulum_mass.py b/rllib/examples/env/pendulum_mass.py index c4dc93ed7342..b68b283e7410 100644 --- a/rllib/examples/env/pendulum_mass.py +++ b/rllib/examples/env/pendulum_mass.py @@ -11,19 +11,22 @@ class PendulumMassEnv(PendulumEnv, gym.utils.EzPickle, MetaEnv): """ def sample_tasks(self, n_tasks): - # Mass is a random float between 0.5 and 2 + # Sample new pendulum masses (random floats between 0.5 and 2). return np.random.uniform(low=0.5, high=2.0, size=(n_tasks, )) def set_task(self, task): """ Args: - task: task of the meta-learning environment + task (float): Task of the meta-learning environment (here: mass of + the pendulum). """ + # self.m is the mass property of the pendulum. self.m = task def get_task(self): """ Returns: - task: task of the meta-learning environment + float: The current mass of the pendulum (self.m in the PendulumEnv + object). """ return self.m From 964689b280dd63b3192148dbfabf27db45d7e40b Mon Sep 17 00:00:00 2001 From: Jan Blumenkamp Date: Mon, 25 Jan 2021 11:42:39 +0000 Subject: [PATCH 034/245] [RLlib] Fix bug in ModelCatalog when using custom action distribution (#12846) * return tuple returned from _get_multi_action_distribution when using custom action dict * Always return dst_class and required_model_output_shape in _get_multi_action_distribution * pass model config to _get_multi_action_distribution --- rllib/models/catalog.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 8e3e43dd08b3..6d0bfd111296 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -204,8 +204,8 @@ def get_action_dist( "Using custom action distribution {}".format(action_dist_name)) dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) - dist_cls = ModelCatalog._get_multi_action_distribution( - dist_cls, action_space, {}, framework) + return 
ModelCatalog._get_multi_action_distribution( + dist_cls, action_space, config, framework) # Dist_type is given directly as a class. elif type(dist_type) is type and \ @@ -740,7 +740,8 @@ def _get_multi_action_distribution(dist_class, action_space, config, action_space=action_space, child_distributions=child_dists, input_lens=input_lens), int(sum(input_lens)) - return dist_class + return dist_class, dist_class.required_model_output_shape( + action_space, config) @staticmethod def _validate_config(config: ModelConfigDict, framework: str) -> None: From b4702de1c2539403deb08403fb296483b117f425 Mon Sep 17 00:00:00 2001 From: Maltimore Date: Mon, 25 Jan 2021 12:56:00 +0100 Subject: [PATCH 035/245] [RLlib] move evaluation to trainer.step() such that the result is properly logged (#12708) --- rllib/agents/trainer.py | 8 -------- rllib/agents/trainer_template.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 9055fe378a36..47e637f6dea7 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -535,14 +535,6 @@ def train(self) -> ResultDict: if hasattr(self, "workers") and isinstance(self.workers, WorkerSet): self._sync_filters_if_needed(self.workers) - if self.config["evaluation_interval"] == 1 or ( - self._iteration > 0 and self.config["evaluation_interval"] - and self._iteration % self.config["evaluation_interval"] == 0): - evaluation_metrics = self._evaluate() - assert isinstance(evaluation_metrics, dict), \ - "_evaluate() needs to return a dict." 
- result.update(evaluation_metrics) - return result def _sync_filters_if_needed(self, workers: WorkerSet): diff --git a/rllib/agents/trainer_template.py b/rllib/agents/trainer_template.py index b896958b6bf1..600cbef12bd9 100644 --- a/rllib/agents/trainer_template.py +++ b/rllib/agents/trainer_template.py @@ -146,6 +146,18 @@ def _init(self, config: TrainerConfigDict, @override(Trainer) def step(self): res = next(self.train_exec_impl) + + # self._iteration gets incremented after this function returns, + # meaning that e. g. the first time this function is called, + # self._iteration will be 0. We check `self._iteration+1` in the + # if-statement below to reflect that the first training iteration + # is already over. + if (self.config["evaluation_interval"] and (self._iteration + 1) % + self.config["evaluation_interval"] == 0): + evaluation_metrics = self._evaluate() + assert isinstance(evaluation_metrics, dict), \ + "_evaluate() needs to return a dict." + res.update(evaluation_metrics) return res @override(Trainer) From db2c836587f9bc487d93486ad9bc03b73e3c1f25 Mon Sep 17 00:00:00 2001 From: "DK.Pino" Date: Mon, 25 Jan 2021 20:14:21 +0800 Subject: [PATCH 036/245] [Placement Group] Move PlacementGroup public method to interface. 
(#13629) --- .../api/placementgroup/PlacementGroup.java | 50 ++++++++++++++++++- .../placementgroup/PlacementGroupImpl.java | 12 ++--- .../java/io/ray/test/PlacementGroupTest.java | 40 ++++++--------- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java index 9b4080deb988..0c5b31b67889 100644 --- a/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java +++ b/java/api/src/main/java/io/ray/api/placementgroup/PlacementGroup.java @@ -1,9 +1,57 @@ package io.ray.api.placementgroup; +import io.ray.api.id.PlacementGroupId; +import java.util.List; +import java.util.Map; + /** * A placement group is used to place interdependent actors according to a specific strategy {@link * PlacementStrategy}. When a placement group is created, the corresponding actor slots and * resources are preallocated. A placement group consists of one or more bundles plus a specific * placement strategy. */ -public interface PlacementGroup {} +public interface PlacementGroup { + + /** + * Get the id of current placement group. + * + * @return Id of current placement group. + */ + PlacementGroupId getId(); + + /** + * Get the name of current placement group. + * + * @return Name of current placement group. + */ + String getName(); + + /** + * Get all bundles which key is resource name and value is resource value. + * + * @return All bundles of current placement group. + */ + List> getBundles(); + + /** + * Get the strategy of current placement group. + * + * @return Strategy of current placement group. + */ + PlacementStrategy getStrategy(); + + /** + * Get the state of current placement group. + * + * @return Creation state of current placement group. + */ + PlacementGroupState getState(); + + /** + * Wait for the placement group to be ready within the specified time. + * + * @param timeoutSeconds Timeout in seconds. 
+ * @return True if the placement group is created. False otherwise. + */ + boolean wait(int timeoutSeconds); +} diff --git a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java index 1d0d540848bf..55ca446f8423 100644 --- a/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java +++ b/java/runtime/src/main/java/io/ray/runtime/placementgroup/PlacementGroupImpl.java @@ -30,32 +30,32 @@ private PlacementGroupImpl( this.state = state; } + @Override public PlacementGroupId getId() { return id; } + @Override public String getName() { return name; } + @Override public List> getBundles() { return bundles; } + @Override public PlacementStrategy getStrategy() { return strategy; } + @Override public PlacementGroupState getState() { return state; } - /** - * Wait for the placement group to be ready within the specified time. - * - * @param timeoutSeconds Timeout in seconds. - * @return True if the placement group is created. False otherwise. - */ + @Override public boolean wait(int timeoutSeconds) { return Ray.internal().waitPlacementGroupReady(id, timeoutSeconds); } diff --git a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java index 14bf0fd6a577..edbd2c30e4d6 100644 --- a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java +++ b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java @@ -7,7 +7,6 @@ import io.ray.api.placementgroup.PlacementGroupState; import io.ray.api.placementgroup.PlacementStrategy; import io.ray.runtime.exception.RayException; -import io.ray.runtime.placementgroup.PlacementGroupImpl; import java.util.List; import org.testng.Assert; import org.testng.annotations.Test; @@ -32,8 +31,7 @@ public int getValue() { // This test just creates a placement group with one bundle. // It's not comprehensive to test all placement group test cases. 
public void testCreateAndCallActor() { - PlacementGroupImpl placementGroup = - (PlacementGroupImpl) PlacementGroupTestUtils.createSimpleGroup(); + PlacementGroup placementGroup = PlacementGroupTestUtils.createSimpleGroup(); Assert.assertTrue(placementGroup.wait(10)); Assert.assertEquals(placementGroup.getName(), "unnamed_group"); @@ -48,22 +46,18 @@ public void testCreateAndCallActor() { @Test(groups = {"cluster"}) public void testGetPlacementGroup() { - PlacementGroupImpl firstPlacementGroup = - (PlacementGroupImpl) - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - - PlacementGroupImpl secondPlacementGroup = - (PlacementGroupImpl) - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroup firstPlacementGroup = + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); + + PlacementGroup secondPlacementGroup = + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); Assert.assertTrue(firstPlacementGroup.wait(10)); Assert.assertTrue(secondPlacementGroup.wait(10)); - PlacementGroupImpl firstPlacementGroupRes = - (PlacementGroupImpl) Ray.getPlacementGroup((firstPlacementGroup).getId()); - PlacementGroupImpl secondPlacementGroupRes = - (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroup firstPlacementGroupRes = Ray.getPlacementGroup((firstPlacementGroup).getId()); + PlacementGroup secondPlacementGroupRes = Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertNotNull(firstPlacementGroupRes); Assert.assertNotNull(secondPlacementGroupRes); @@ -76,9 +70,9 @@ public void testGetPlacementGroup() { List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); - PlacementGroupImpl 
placementGroupRes = (PlacementGroupImpl) allPlacementGroup.get(0); + PlacementGroup placementGroupRes = allPlacementGroup.get(0); Assert.assertNotNull(placementGroupRes.getId()); - PlacementGroupImpl expectPlacementGroup = + PlacementGroup expectPlacementGroup = placementGroupRes.getId().equals(firstPlacementGroup.getId()) ? firstPlacementGroup : secondPlacementGroup; @@ -94,18 +88,16 @@ public void testRemovePlacementGroup() { PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); - PlacementGroupImpl secondPlacementGroup = - (PlacementGroupImpl) - PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( - "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); + PlacementGroup secondPlacementGroup = + PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( + "CPU", 1, PlacementStrategy.PACK, 1.0, "second_placement_group"); List allPlacementGroup = Ray.getAllPlacementGroups(); Assert.assertEquals(allPlacementGroup.size(), 2); Ray.removePlacementGroup(secondPlacementGroup.getId()); - PlacementGroupImpl removedPlacementGroup = - (PlacementGroupImpl) Ray.getPlacementGroup((secondPlacementGroup).getId()); + PlacementGroup removedPlacementGroup = Ray.getPlacementGroup((secondPlacementGroup).getId()); Assert.assertEquals(removedPlacementGroup.getState(), PlacementGroupState.REMOVED); // Wait for placement group after it is removed. 
From f9f2bfa77861539e467802185d665ae79f5ce25c Mon Sep 17 00:00:00 2001 From: Lingxuan Zuo Date: Mon, 25 Jan 2021 20:32:08 +0800 Subject: [PATCH 037/245] [Metric] Fix crashed when register metric view in multithread (#13485) * Fix crashed when register metric view in multithread * fix comments * fix --- src/ray/stats/metric.cc | 29 ++++++++++++++++++----------- src/ray/stats/metric.h | 3 +++ src/ray/stats/stats_test.cc | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/src/ray/stats/metric.cc b/src/ray/stats/metric.cc index 4a475a338408..d4b253428b92 100644 --- a/src/ray/stats/metric.cc +++ b/src/ray/stats/metric.cc @@ -22,6 +22,8 @@ namespace ray { namespace stats { +absl::Mutex Metric::registration_mutex_; + static void RegisterAsView(opencensus::stats::ViewDescriptor view_descriptor, const std::vector &keys) { // Register global keys. @@ -85,19 +87,24 @@ void Metric::Record(double value, const TagsType &tags) { return; } + // NOTE(lingxuan.zlx): Double check for recording performance while + // processing in multithread and avoid race since metrics may invoke + // record in different threads or code pathes. if (measure_ == nullptr) { - // Measure could be registered before, so we try to get it first. - MeasureDouble registered_measure = - opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); - - if (registered_measure.IsValid()) { - measure_.reset(new MeasureDouble(registered_measure)); - } else { - measure_.reset( - new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); + absl::MutexLock lock(&registration_mutex_); + if (measure_ == nullptr) { + // Measure could be registered before, so we try to get it first. 
+ MeasureDouble registered_measure = + opencensus::stats::MeasureRegistry::GetMeasureDoubleByName(name_); + + if (registered_measure.IsValid()) { + measure_.reset(new MeasureDouble(registered_measure)); + } else { + measure_.reset( + new MeasureDouble(MeasureDouble::Register(name_, description_, unit_))); + } + RegisterView(); } - - RegisterView(); } // Do record. diff --git a/src/ray/stats/metric.h b/src/ray/stats/metric.h index 06e8534c4c67..dac50bc2d947 100644 --- a/src/ray/stats/metric.h +++ b/src/ray/stats/metric.h @@ -129,6 +129,9 @@ class Metric { std::vector tag_keys_; std::unique_ptr> measure_; + // For making sure thread-safe to all of metric registrations. + static absl::Mutex registration_mutex_; + }; // class Metric class Gauge : public Metric { diff --git a/src/ray/stats/stats_test.cc b/src/ray/stats/stats_test.cc index 21e1627233a4..38f7952823d7 100644 --- a/src/ray/stats/stats_test.cc +++ b/src/ray/stats/stats_test.cc @@ -116,6 +116,38 @@ TEST_F(StatsTest, InitializationTest) { ASSERT_TRUE(new_first_tag.second == test_tag_value_that_shouldnt_be_applied); } +TEST(Metric, MultiThreadMetricRegisterViewTest) { + ray::stats::Shutdown(); + std::shared_ptr exporter( + new stats::StdoutExporterClient()); + ray::stats::Init({}, MetricsAgentPort, exporter); + std::vector threads; + const stats::TagKeyType tag1 = stats::TagKeyType::Register("k1"); + const stats::TagKeyType tag2 = stats::TagKeyType::Register("k2"); + for (int index = 0; index < 10; ++index) { + threads.emplace_back([tag1, tag2, index]() { + for (int i = 0; i < 100; i++) { + stats::Count random_counter( + "ray.random.counter" + std::to_string(index) + std::to_string(i), "", "", + {tag1, tag2}); + random_counter.Record(i); + stats::Gauge random_gauge( + "ray.random.gauge" + std::to_string(index) + std::to_string(i), "", "", + {tag1, tag2}); + random_gauge.Record(i); + stats::Sum random_sum( + "ray.random.sum" + std::to_string(index) + std::to_string(i), "", "", + {tag1, tag2}); + 
random_sum.Record(i); + } + }); + } + for (auto &thread : threads) { + thread.join(); + } + ray::stats::Shutdown(); +} + TEST_F(StatsTest, MultiThreadedInitializationTest) { // Make sure stats module is thread-safe. // Shutdown the stats module first. From 79209110c50dddd4e3f722aa6f22733151140818 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Mon, 25 Jan 2021 08:40:59 -0800 Subject: [PATCH 038/245] [kubernetes][operator][hotfix] Dictionary fix (#13663) --- python/ray/operator/operator_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ray/operator/operator_utils.py b/python/ray/operator/operator_utils.py index 08afda94f1d4..5d51baebbd77 100644 --- a/python/ray/operator/operator_utils.py +++ b/python/ray/operator/operator_utils.py @@ -95,4 +95,7 @@ def get_cluster_owner_reference( def translate(configuration: Dict[str, Any], dictionary: Dict[str, str]) -> Dict[str, Any]: - return {dictionary[field]: configuration[field] for field in dictionary} + return { + dictionary[field]: configuration[field] + for field in dictionary if field in configuration + } From 1c77cc7e23921c1a8c5838e67257e1734c37e398 Mon Sep 17 00:00:00 2001 From: Edward Oakes Date: Mon, 25 Jan 2021 11:59:46 -0600 Subject: [PATCH 039/245] [docs] Remove API warning from mp.Pool (#13683) --- doc/source/multiprocessing.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/source/multiprocessing.rst b/doc/source/multiprocessing.rst index 3e3d57292b04..7d027b734fd9 100644 --- a/doc/source/multiprocessing.rst +++ b/doc/source/multiprocessing.rst @@ -10,11 +10,6 @@ using `Ray Actors `__ instead of local processes. This makes it eas to scale existing applications that use ``multiprocessing.Pool`` from a single node to a cluster. -.. note:: - - This API is new and may be revised in future Ray releases. If you encounter - any bugs, please file an `issue on GitHub`_. - .. 
_`multiprocessing.Pool API`: https://docs.python.org/3/library/multiprocessing.html#module-multiprocessing.pool Quickstart From d96a9fa19225b95b51d9d4422ad82324e75ad6d0 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Mon, 25 Jan 2021 10:35:25 -0800 Subject: [PATCH 040/245] Revert "Revert "[dashboard] Fix RAY_RAYLET_PID KeyError on Windows (#12948)" (#13572)" (#13685) This reverts commit c4a710369b93964e219af83bb197542241750627. --- dashboard/agent.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index f1c496b89004..7bf5e1551a2b 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -62,9 +62,13 @@ def __init__(self, self.object_store_name = object_store_name self.raylet_name = raylet_name self.node_id = os.environ["RAY_NODE_ID"] - self.ppid = int(os.environ["RAY_RAYLET_PID"]) - assert self.ppid > 0 - logger.info("Parent pid is %s", self.ppid) + # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is + # only used for fate-sharing with the raylet and we need a different + # fate-sharing mechanism for Windows anyways. + if sys.platform not in ["win32", "cygwin"]: + self.ppid = int(os.environ["RAY_RAYLET_PID"]) + assert self.ppid > 0 + logger.info("Parent pid is %s", self.ppid) self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0), )) self.grpc_port = self.server.add_insecure_port( f"[::]:{self.dashboard_agent_port}") @@ -108,7 +112,8 @@ async def _check_parent(): logger.error("Failed to check parent PID, exiting.") sys.exit(1) - check_parent_task = create_task(_check_parent()) + if sys.platform not in ["win32", "cygwin"]: + check_parent_task = create_task(_check_parent()) # Create an aioredis client for all modules. 
try: From 9feae90e3bbf1455017d3cf8741c58704ade6906 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Mon, 25 Jan 2021 14:37:07 -0800 Subject: [PATCH 041/245] skip test_spill (#13693) --- python/ray/tests/test_object_spilling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 68824b7bb09a..a80a91580c6f 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -214,7 +214,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): @pytest.mark.skipif( - platform.system() == "Windows", reason="Failing on Windows.") + platform.system() in ["Darwin", "Windows"], reason="Failing on Windows.") def test_spill_stats(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, _ = object_spilling_config From 0d75f37c1f5cc805628fbfe889c7aaa2a7355a78 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Jan 2021 00:03:38 +0100 Subject: [PATCH 042/245] [tune](deps): Bump distributed in /python/requirements (#13643) Bumps [distributed](https://github.com/dask/distributed) from 2020.12.0 to 2021.1.1. 
- [Release notes](https://github.com/dask/distributed/releases) - [Changelog](https://github.com/dask/distributed/blob/master/docs/release-procedure.md) - [Commits](https://github.com/dask/distributed/compare/2020.12.0...2021.01.1) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- python/requirements/linux-py3.8-requirements_tune.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index 4351d0b6386f..bae7f20ae363 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -155,7 +155,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.0 +distributed==2021.1.1 # via # autogluon.core # dask diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index c7a7b9204649..bb10df777068 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -148,7 +148,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.0 +distributed==2021.1.1 # via # autogluon.core # dask diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 195951424490..8ef61bd51b63 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -146,7 +146,7 @@ defusedxml==0.6.0 # via nbconvert dill==0.3.3 # via autogluon.core -distributed==2021.1.0 +distributed==2021.1.1 # via # autogluon.core # dask From 8b8d6b984b4caf5b08edc9a446adfacf7c21f22b Mon Sep 17 00:00:00 2001 From: 
Simon Mo Date: Mon, 25 Jan 2021 16:05:59 -0800 Subject: [PATCH 043/245] [Buildkite] Add all Python tests (#13566) --- .bazelrc | 1 + .buildkite/Dockerfile | 12 +- .buildkite/pipeline.yml | 143 +++++++++++++++++- ci/travis/install-dependencies.sh | 29 +++- python/ray/scripts/scripts.py | 6 +- python/ray/tests/test_stress.py | 2 +- python/ray/tests/test_stress_failure.py | 2 +- python/ray/tests/test_stress_sharded.py | 2 +- .../tests/test_unreconstructable_errors.py | 2 +- 9 files changed, 183 insertions(+), 16 deletions(-) diff --git a/.bazelrc b/.bazelrc index 2baaa0fa2af5..8de20992a595 100644 --- a/.bazelrc +++ b/.bazelrc @@ -95,6 +95,7 @@ test:asan --test_env=ASAN_OPTIONS="detect_leaks=0" test:asan --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.2 /usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" # For example, for Ubuntu 18.04 libasan can be found here: # test:asan --test_env=LD_PRELOAD="/usr/lib/gcc/x86_64-linux-gnu/7/libasan.so" +test:asan-buildkite --test_env=LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libasan.so.5" # CI configuration: aquery:ci --color=no diff --git a/.buildkite/Dockerfile b/.buildkite/Dockerfile index 2f52fb92d1d1..86bd28148985 100644 --- a/.buildkite/Dockerfile +++ b/.buildkite/Dockerfile @@ -5,15 +5,25 @@ ARG BUILDKITE_PULL_REQUEST ENV DEBIAN_FRONTEND=noninteractive ENV TZ=America/Los_Angeles + ENV BUILDKITE=true ENV CI=true ENV PYTHON=3.6 +ENV RAY_USE_RANDOM_PORTS=1 +ENV RAY_DEFAULT_BUILD=1 RUN apt-get update -qq RUN apt-get install -y -qq \ curl python-is-python3 git build-essential \ - sudo unzip apt-utils dialog tzdata wget + sudo unzip apt-utils dialog tzdata wget rsync \ + language-pack-en tmux cmake gdb vim htop \ + libgtk2.0-dev zlib1g-dev libgl1-mesa-dev + +# System conf for tests RUN locale -a +ENV LC_ALL=en_US.utf8 +ENV LANG=en_US.utf8 +RUN echo "ulimit -c 0" >> /root/.bashrc # Setup Bazel caches RUN (echo "build --remote_cache=${REMOTE_CACHE_URL}" >> /root/.bazelrc); \ diff --git a/.buildkite/pipeline.yml 
b/.buildkite/pipeline.yml index 91c673d52604..0544234af182 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,141 @@ -- label: "Ray Core Tests (:buildkite: Experimental)" +- label: ":cpp: Tests" commands: - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -- label: "Ray Dashboard Tests" + - bash src/ray/test/run_object_manager_tests.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + -- //:all -rllib/... -core_worker_test + +- label: ":cpp: Tests (ASAN)" commands: - - bazel test --config=ci $(./scripts/bazel_export_options) python/ray/new_dashboard/... + - bazel test --config=ci --config=asan $(./scripts/bazel_export_options) + --build_tests_only + --config=asan-buildkite + --jobs=2 + -- //:all -//:core_worker_test + +- label: ":serverless: Dashboard + Serve Tests" + commands: + - TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + python/ray/new_dashboard/... + - bazel test --config=ci $(./scripts/bazel_export_options) + python/ray/serve/... + +- label: ":python: (Small & Large)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,-medium_size_python_tests_a_to_j,-medium_size_python_tests_k_to_z + python/ray/tests/... + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,client_tests + --test_env=RAY_CLIENT_MODE=1 + python/ray/tests/... +- label: ":python: (Medium A-J)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_a_to_j + python/ray/tests/... +- label: ":python: (Medium K-Z)" + commands: + - bazel test --config=ci $(./scripts/bazel_export_options) + --test_tag_filters=-kubernetes,-jenkins_only,medium_size_python_tests_k_to_z + python/ray/tests/... 
+ +- label: ":brain: RLlib: Learning tests (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_tf + rllib/... + +- label: ":brain: RLlib: Learning tests with tf=1.x (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=1.14.0 TFP_VERSION=0.7 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_tf + rllib/... + +- label: ":brain: RLlib: Learning tests with Torch (from rllib/tuned_examples/*.yaml)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=learning_tests_torch + rllib/... + +- label: ":brain: RLlib: Quick Agent train.py runs" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=quick_train + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + # Test everything that does not have any of the "main" labels: + # "learning_tests|quick_train|examples|tests_dir". + - bazel test --config=ci $(./scripts/bazel_export_options) + --build_tests_only + --test_tag_filters=-learning_tests_tf,-learning_tests_torch,-quick_train,-examples,-tests_dir + --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... 
+ +- label: ":brain: RLlib: rllib/examples/" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_A,examples_B --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_C,examples_D --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 rllib/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + +- label: ":brain: RLlib: rllib/tests/ (A-L)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... 
+ +- label: ":brain: RLlib: rllib/tests/ (M-Z)" + commands: + - RLLIB_TESTING=1 TF_VERSION=2.1.0 TFP_VERSION=0.8 TORCH_VERSION=1.6 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only + --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z --test_env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1 + rllib/... + + +- label: ":octopus: Tune tests and examples" + commands: + - TUNE_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=-jenkins_only,-example python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=example,-tf,-pytorch,-py37,-flaky python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37,-flaky python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37,-flaky python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-py37,flaky python/ray/tune/... + +- label: ":octopus: SGD tests and examples" + commands: + - SGD_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 python/ray/util/sgd/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 python/ray/util/sgd/... + +- label: ":octopus: Tune/SGD tests and examples. 
Python 3.7" + commands: + - TUNE_TESTING=1 PYTHON=3.7 INSTALL_HOROVOD=1 ./ci/travis/install-dependencies.sh + # Bcause Python version changed, we need to re-install Ray here + - rm -rf ./python/ray/thirdparty_files; ./ci/travis/ci.sh build + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=py37 python/ray/tune/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only python/ray/util/xgboost/... + +- label: ":book: Doc tests and examples" + commands: + - DOC_TESTING=1 ./ci/travis/install-dependencies.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,-pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=tf,-pytorch,-py37 doc/... + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=-tf,pytorch,-py37 doc/... \ No newline at end of file diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 96f4fa95a8f2..498aaf419533 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -23,6 +23,13 @@ pkg_install_helper() { } install_bazel() { + if command -v bazel; then + if [ -n "${BUILDKITE-}" ]; then + echo "Bazel exists, skipping the install" + return + fi + fi + "${ROOT_DIR}"/install-bazel.sh if [ -f /etc/profile.d/bazel.sh ]; then . 
/etc/profile.d/bazel.sh @@ -30,6 +37,11 @@ install_bazel() { } install_base() { + if [ -n "${BUILDKITE-}" ]; then + echo "Skipping install_base in Buildkite" + return + fi + case "${OSTYPE}" in linux*) # Expired apt key error: https://github.com/bazelbuild/bazel/issues/11470#issuecomment-633205152 @@ -188,9 +200,7 @@ install_nvm() { > "${NVM_HOME}/nvm.sh" fi elif [ -n "${BUILDKITE-}" ]; then - # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions - curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - - sudo apt-get install -y nodejs + echo "Skipping nvm on Buildkite because we will use apt-get." else test -f "${NVM_HOME}/nvm.sh" # double-check NVM is already available on other platforms fi @@ -216,10 +226,19 @@ install_upgrade_pip() { } install_node() { + if command -v node; then + if [ -n "${BUILDKITE-}" ]; then + echo "Node existed, skipping install"; + return + fi + fi + if [ "${OSTYPE}" = msys ] ; then { echo "WARNING: Skipping running Node.js due to incompatibilities with Windows"; } 2> /dev/null elif [ -n "${BUILDKITE-}" ] ; then - { echo "WARNING: Skipping running Node.js on buildkite because it's already there"; } 2> /dev/null + # https://github.com/nodesource/distributions/blob/master/README.md#installation-instructions + curl -sL https://deb.nodesource.com/setup_14.x | sudo -E bash - + sudo apt-get install -y nodejs else # Install the latest version of Node.js in order to build the dashboard. ( @@ -258,7 +277,7 @@ install_dependencies() { if [ -n "${PYTHON-}" ]; then # Remove this entire section once RLlib and Serve dependencies are fixed. - if [ -z "${BUILDKITE-}" ] && [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then + if [ "${DOC_TESTING-}" != 1 ] && [ "${SGD_TESTING-}" != 1 ] && [ "${TUNE_TESTING-}" != 1 ]; then # PyTorch is installed first since we are using a "-f" directive to find the wheels. # We want to install the CPU version only. 
local torch_url="https://download.pytorch.org/whl/torch_stable.html" diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 6fecd2dc272b..b61c6939984c 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -739,6 +739,7 @@ def stop(force, verbose, log_style, log_color): total_found = 0 total_stopped = 0 + stopped = [] for keyword, filter_by_cmd in processes_to_kill: if filter_by_cmd and is_linux and len(keyword) > 15: # getting here is an internal bug, so we do not use cli_logger @@ -777,6 +778,7 @@ def stop(force, verbose, log_style, log_color): cf.dimmed("(via SIGTERM)")) total_stopped += 1 + stopped.append(proc) except psutil.NoSuchProcess: cli_logger.verbose( "Attempted to stop `{}`, but process was already dead.", @@ -799,8 +801,8 @@ def stop(force, verbose, log_style, log_color): cli_logger.warning("Try running the command again, or use `{}`.", cf.bold("--force")) - # TODO(maximsmol): we should probably block until the processes actually - # all died somehow + # Wait for the processes to actually stop. 
+ psutil.wait_procs(stopped, timeout=2) @cli.command() diff --git a/python/ray/tests/test_stress.py b/python/ray/tests/test_stress.py index 2007887367ef..99ed186716e2 100644 --- a/python/ray/tests/test_stress.py +++ b/python/ray/tests/test_stress.py @@ -15,7 +15,7 @@ def ray_start_combination(request): initialize_head=True, head_node_args={ "num_cpus": 10, - "redis_max_memory": 10**7 + "redis_max_memory": 10**8 }) for i in range(num_nodes - 1): cluster.add_node(num_cpus=10) diff --git a/python/ray/tests/test_stress_failure.py b/python/ray/tests/test_stress_failure.py index 01d39afa8065..83d9f40f24ed 100644 --- a/python/ray/tests/test_stress_failure.py +++ b/python/ray/tests/test_stress_failure.py @@ -20,7 +20,7 @@ def ray_start_reconstruction(request): head_node_args={ "num_cpus": 1, "object_store_memory": plasma_store_memory // num_nodes, - "redis_max_memory": 10**7, + "redis_max_memory": 10**8, "_system_config": { "object_timeout_milliseconds": 200 } diff --git a/python/ray/tests/test_stress_sharded.py b/python/ray/tests/test_stress_sharded.py index 7f05f27acb37..c6e5cd484bb2 100644 --- a/python/ray/tests/test_stress_sharded.py +++ b/python/ray/tests/test_stress_sharded.py @@ -14,7 +14,7 @@ def ray_start_sharded(request): object_store_memory=int(0.5 * 10**9), num_cpus=10, # _num_redis_shards=num_redis_shards, - _redis_max_memory=10**7) + _redis_max_memory=10**8) yield None diff --git a/python/ray/tests/test_unreconstructable_errors.py b/python/ray/tests/test_unreconstructable_errors.py index 501dce905530..24be89b94297 100644 --- a/python/ray/tests/test_unreconstructable_errors.py +++ b/python/ray/tests/test_unreconstructable_errors.py @@ -10,7 +10,7 @@ def setUp(self): ray.init( num_cpus=1, object_store_memory=150 * 1024 * 1024, - _redis_max_memory=10000000) + _redis_max_memory=10**8) def tearDown(self): ray.shutdown() From fe8262afd02087436639e715326e0fa883e7c4d8 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Jan 2021 16:53:52 -0800 Subject: [PATCH 
044/245] Add K8s test to release process (#13694) --- release/RELEASE_CHECKLIST.md | 4 ++++ release/RELEASE_PROCESS.rst | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/release/RELEASE_CHECKLIST.md b/release/RELEASE_CHECKLIST.md index 50b30f8ff54c..9ab85f30bac0 100644 --- a/release/RELEASE_CHECKLIST.md +++ b/release/RELEASE_CHECKLIST.md @@ -56,6 +56,10 @@ This checklist is meant to be used in conjunction with the RELEASE_PROCESS.rst d - [ ] Results added to `release/release_logs` - [ ] stress_tests - [ ] unit_gpu_tests +- [ ] ASAN Test +- [ ] K8s Test + - [ ] K8s cluster launcher test + - [ ] K8s operator test ## Final Steps - [ ] Wheels uploaded to Test PyPI diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index 287ba870c661..c60e1c4aa789 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -136,8 +136,11 @@ is generally the easiest way to run release tests. 5. **ASAN tests** - Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the - whole Python tests to detect memory leaks. + Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. + +6. **K8s operator tests** + + Run the ``python/ray/tests/test_k8s_*`` to make sure K8s cluster launcher and operator works. Make sure the docker image is the released version. Identify and Resolve Release Blockers ------------------------------------- From f2867b060966e8810034c1aec186a5ac042095e1 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Jan 2021 17:33:41 -0800 Subject: [PATCH 045/245] [CI] Remove object_manager_test (#13703) https://github.com/ray-project/ray/commit/0998d69968608012ca6cdd1ee166961df1aa0f0b removed the object_manager_test. 
--- .buildkite/pipeline.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0544234af182..ebfd96322ecf 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,5 @@ - label: ":cpp: Tests" commands: - - bash src/ray/test/run_object_manager_tests.sh - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only -- //:all -rllib/... -core_worker_test From 840987c7aff50bc246ca0d22eb94225d1f82293c Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Mon, 25 Jan 2021 18:48:31 -0800 Subject: [PATCH 046/245] Scalability Envelope Tests (#13464) --- benchmarks/README.md | 35 ++++ benchmarks/distributed/config.yaml | 58 ++++++ benchmarks/distributed/test_distributed.py | 204 +++++++++++++++++++ benchmarks/object_store/config.yaml | 48 +++++ benchmarks/object_store/test_object_store.py | 61 ++++++ benchmarks/single_node/config.yaml | 41 ++++ benchmarks/single_node/test_single_node.py | 175 ++++++++++++++++ release/RELEASE_PROCESS.rst | 8 +- 8 files changed, 629 insertions(+), 1 deletion(-) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/distributed/config.yaml create mode 100644 benchmarks/distributed/test_distributed.py create mode 100644 benchmarks/object_store/config.yaml create mode 100644 benchmarks/object_store/test_object_store.py create mode 100644 benchmarks/single_node/config.yaml create mode 100644 benchmarks/single_node/test_single_node.py diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000000..2167151656a9 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,35 @@ +# Ray Scalability Envelope + +### Note: This document is a WIP. This is not a scalability guarantee (yet). + +## Distributed Benchmarks + +All distributed tests are run on 64 nodes with 64 cores/node. Maximum number of nodes is achieved by adding 4 core nodes. 
+ +| Dimension | Quantity | +| --------- | -------- | +| # nodes in cluster (with trivial task workload) | 250+ | +| # actors in cluster (with trivial workload) | 10k+ | +| # simultaneously running tasks | 10k+ | +| # simultaneously running placement groups | 1k+ | + +## Object Store Benchmarks + +| Dimension | Quantity | +| --------- | -------- | +| 1 GiB object broadcast (# of nodes) | 50+ | + + +## Single Node Benchmarks. + +All single node benchmarks are run on a single m4.16xlarge. + +| Dimension | Quantity | +| --------- | -------- | +| # of object artuments to a single task | 10000+ | +| # of objects returned from a single task | 3000+ | +| # of plasma objects in a single `ray.get` call | 10000+ | +| # of tasks queued on a single node | 1,000,000+ | +| Maximum `ray.get` numpy object size | 100GiB+ | + + diff --git a/benchmarks/distributed/config.yaml b/benchmarks/distributed/config.yaml new file mode 100644 index 000000000000..630de0eef265 --- /dev/null +++ b/benchmarks/distributed/config.yaml @@ -0,0 +1,58 @@ +cluster_name: distributed-benchmarks +min_workers: 0 +max_workers: 999999 + +upscaling_speed: 9999999 + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a, us-west-2b, us-west-2c, us-west-2d + +auth: + ssh_user: ubuntu + +available_node_types: + head_node: + node_config: + InstanceType: m5.16xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + small: 1 + max_workers: 999999 + worker_node: + node_config: + InstanceType: m5.16xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + min_workers: 63 + max_workers: 63 + small_worker_node: + node_config: + InstanceType: m5.xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + max_workers: 999999 + +head_node_type: head_node + +worker_default_node_type: worker_node + +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl + - pip install tqdm + - sudo bash -c 'rm -rf 
/etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;' + +idle_timeout_minutes: 1 + +head_start_ray_commands: + - ray stop + - ulimit -n 65535; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ulimit -n 65535; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/distributed/test_distributed.py b/benchmarks/distributed/test_distributed.py new file mode 100644 index 000000000000..c929cdba8c1a --- /dev/null +++ b/benchmarks/distributed/test_distributed.py @@ -0,0 +1,204 @@ +import ray +import ray.autoscaler.sdk +from ray.test_utils import Semaphore +from ray.util.placement_group import placement_group, remove_placement_group + +from time import sleep, perf_counter +from tqdm import tqdm, trange + +TEST_NUM_NODES = 64 +MAX_ACTORS_IN_CLUSTER = 10000 +MAX_RUNNING_TASKS_IN_CLUSTER = 10000 +MAX_PLACEMENT_GROUPS = 1000 +MAX_NUM_NODES = 250 + + +def num_alive_nodes(): + n = 0 + for node in ray.nodes(): + if node["Alive"]: + n += 1 + return n + + +def scale_to(target): + while num_alive_nodes() != target: + ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) + print(f"Current # nodes: {num_alive_nodes()}, target: {target}") + print("Waiting ...") + sleep(5) + + +def test_nodes(): + scale_to(MAX_NUM_NODES) + assert num_alive_nodes() == MAX_NUM_NODES + # Treat this as a trivial task to ensure the nodes are all functioning + test_max_running_tasks() + + +def test_max_actors(): + # TODO (Alex): Dynamically set this based on number of cores + cpus_per_actor = 0.25 + + @ray.remote(num_cpus=cpus_per_actor) + class Actor: + def foo(self): + pass + + actors = [ + Actor.remote() + for _ in trange(MAX_ACTORS_IN_CLUSTER, desc="Launching actors") + ] + + for actor in tqdm(actors, 
desc="Ensuring actors have started"): + assert ray.get(actor.foo.remote()) is None + + +def test_max_running_tasks(): + counter = Semaphore.remote(0) + blocker = Semaphore.remote(0) + + @ray.remote(num_cpus=0.25) + def task(counter, blocker): + sleep(300) + + refs = [ + task.remote(counter, blocker) + for _ in trange(MAX_RUNNING_TASKS_IN_CLUSTER, desc="Launching tasks") + ] + + max_cpus = ray.cluster_resources()["CPU"] + min_cpus_available = max_cpus + for _ in trange(int(300 / 0.1), desc="Waiting"): + try: + cur_cpus = ray.available_resources().get("CPU", 0) + min_cpus_available = min(min_cpus_available, cur_cpus) + except Exception: + # There are race conditions `.get` can fail if a new heartbeat + # comes at the same time. + pass + sleep(0.1) + + # There are some relevant magic numbers in this check. 10k tasks each + # require 1/4 cpus. Therefore, ideally 2.5k cpus will be used. + err_str = f"Only {max_cpus - min_cpus_available}/{max_cpus} cpus used." + assert max_cpus - min_cpus_available > 2000, err_str + + for _ in trange( + MAX_RUNNING_TASKS_IN_CLUSTER, + desc="Ensuring all tasks have finished"): + done, refs = ray.wait(refs) + assert ray.get(done[0]) is None + + +def test_many_placement_groups(): + @ray.remote(num_cpus=1, resources={"node": 0.02}) + def f1(): + sleep(10) + pass + + @ray.remote(num_cpus=1) + def f2(): + sleep(10) + pass + + @ray.remote(resources={"node": 0.02}) + def f3(): + sleep(10) + pass + + bundle1 = {"node": 0.02, "CPU": 1} + bundle2 = {"CPU": 1} + bundle3 = {"node": 0.02} + + pgs = [] + for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"): + pg = placement_group(bundles=[bundle1, bundle2, bundle3]) + pgs.append(pg) + + for pg in tqdm(pgs, desc="Waiting for pgs to be ready"): + ray.get(pg.ready()) + + refs = [] + for pg in tqdm(pgs, desc="Scheduling tasks"): + ref1 = f1.options(placement_group=pg).remote() + ref2 = f2.options(placement_group=pg).remote() + ref3 = f3.options(placement_group=pg).remote() + refs.extend([ref1, ref2, 
ref3]) + + for _ in trange(10, desc="Waiting"): + sleep(1) + + with tqdm() as p_bar: + while refs: + done, refs = ray.wait(refs) + p_bar.update() + + for pg in tqdm(pgs, desc="Cleaning up pgs"): + remove_placement_group(pg) + + +ray.init(address="auto") + +scale_to(TEST_NUM_NODES) +assert num_alive_nodes( +) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) + +cluster_resources = ray.cluster_resources() + +available_resources = ray.available_resources() +assert available_resources == cluster_resources, ( + str(available_resources) + " != " + str(cluster_resources)) +print("Done launching nodes") + +actor_start = perf_counter() +test_max_actors() +actor_end = perf_counter() + +sleep(1) +assert num_alive_nodes( +) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +assert available_resources == cluster_resources, ( + str(available_resources) + " != " + str(cluster_resources)) +print("Done testing actors") + +task_start = perf_counter() +test_max_running_tasks() +task_end = perf_counter() + +sleep(1) +assert num_alive_nodes( +) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +assert available_resources == cluster_resources, ( + str(available_resources) + " != " + str(cluster_resources)) +print("Done testing tasks") + +pg_start = perf_counter() +test_many_placement_groups() +pg_end = perf_counter() + +sleep(1) +assert num_alive_nodes( +) == TEST_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +assert available_resources == cluster_resources, ( + str(available_resources) + " != " + str(cluster_resources)) +print("Done testing placement groups") + +launch_start = perf_counter() +test_nodes() +launch_end = perf_counter() + +sleep(1) +assert num_alive_nodes( +) == MAX_NUM_NODES, "Wrong number of nodes in cluster " + len(ray.nodes()) +print("Done.") + +actor_time = actor_end - actor_start +task_time = task_end - task_start +pg_time = pg_end - pg_start +launch_time = launch_end - 
launch_start + +print(f"Actor time: {actor_time} ({MAX_ACTORS_IN_CLUSTER} actors)") +print(f"Task time: {task_time} ({MAX_RUNNING_TASKS_IN_CLUSTER} tasks)") +print(f"PG time: {pg_time} ({MAX_PLACEMENT_GROUPS} placement groups)") +print(f"Node launch time: {launch_time} ({MAX_NUM_NODES} nodes)") diff --git a/benchmarks/object_store/config.yaml b/benchmarks/object_store/config.yaml new file mode 100644 index 000000000000..5ea3ce8352af --- /dev/null +++ b/benchmarks/object_store/config.yaml @@ -0,0 +1,48 @@ +cluster_name: object-store-benchmarks +min_workers: 0 +max_workers: 999999 + +upscaling_speed: 9999999 + +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + +auth: + ssh_user: ubuntu + +available_node_types: + head_node: + node_config: + InstanceType: m4.4xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + max_workers: 999999 + worker_node: + node_config: + InstanceType: m4.xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + max_workers: 999999 + +head_node_type: head_node + +worker_default_node_type: worker_node + +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl + - pip install tqdm numpy + +idle_timeout_minutes: 5 + +head_start_ray_commands: + - ray stop + - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 1000000; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/benchmarks/object_store/test_object_store.py b/benchmarks/object_store/test_object_store.py new file mode 100644 index 000000000000..83312fddd90e --- /dev/null +++ b/benchmarks/object_store/test_object_store.py @@ -0,0 +1,61 @@ +import numpy as np + +import ray +import ray.autoscaler.sdk + +from time import sleep, perf_counter +from tqdm import tqdm + +NUM_NODES = 50 +OBJECT_SIZE = 2**30 + + +def num_alive_nodes(): + n = 0 + for node in ray.nodes(): + if node["Alive"]: + n += 1 + return n + + +def scale_to(target): + while num_alive_nodes() != target: + ray.autoscaler.sdk.request_resources(bundles=[{"node": 1}] * target) + print(f"Current # nodes: {num_alive_nodes()}, target: {target}") + print("Waiting ...") + sleep(5) + + +def test_object_broadcast(): + scale_to(NUM_NODES) + + @ray.remote(num_cpus=1, resources={"node": 1}) + class Actor: + def foo(self): + pass + + def sum(self, arr): + return np.sum(arr) + + actors = [Actor.remote() for _ in range(NUM_NODES)] + + arr = np.ones(OBJECT_SIZE, dtype=np.uint8) + ref = ray.put(arr) + + for actor in tqdm(actors, desc="Ensure all actors have started."): + ray.get(actor.foo.remote()) + + result_refs = [] + for actor in tqdm(actors, desc="Broadcasting objects"): + result_refs.append(actor.sum.remote(ref)) + + results = ray.get(result_refs) + for result in results: + assert result == OBJECT_SIZE + + +ray.init(address="auto") +start = perf_counter() +test_object_broadcast() +end = perf_counter() +print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") diff --git a/benchmarks/single_node/config.yaml b/benchmarks/single_node/config.yaml new file mode 100644 index 000000000000..e5798541f9c1 --- /dev/null +++ b/benchmarks/single_node/config.yaml @@ -0,0 +1,41 @@ +cluster_name: single-node-benchmarks +min_workers: 0 +max_workers: 0 + +upscaling_speed: 9999999 + 
+provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + +auth: + ssh_user: ubuntu + +available_node_types: + head_node: + node_config: + InstanceType: m4.16xlarge + ImageId: ami-098555c9b343eb09c + resources: + node: 1 + max_workers: 999999 + worker_node: + node_config: + InstanceType: m4.xlarge + ImageId: ami-098555c9b343eb09c + +head_node_type: head_node + +worker_default_node_type: worker_node + +setup_commands: + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-1.2.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl + - pip install numpy tqdm + - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1000000" >> /etc/security/limits.conf; echo "* hard nofile 1000000" >> /etc/security/limits.conf;' + +idle_timeout_minutes: 5 + +head_start_ray_commands: + - ray stop + - ulimit -n 1000000; ray start --head --port=6379 --object-manager-port=8076 --object-store-memory=128000000000 --autoscaling-config=~/ray_bootstrap_config.yaml diff --git a/benchmarks/single_node/test_single_node.py b/benchmarks/single_node/test_single_node.py new file mode 100644 index 000000000000..75d783124523 --- /dev/null +++ b/benchmarks/single_node/test_single_node.py @@ -0,0 +1,175 @@ +import numpy as np +import ray +import ray.autoscaler.sdk +from ray.test_utils import Semaphore + +from time import perf_counter +from tqdm import trange, tqdm + +MAX_ARGS = 10000 +MAX_RETURNS = 3000 +MAX_RAY_GET_ARGS = 10000 +MAX_QUEUED_TASKS = 1_000_000 +MAX_RAY_GET_SIZE = 100 * 2**30 + + +def test_many_args(): + @ray.remote + def sum_args(*args): + return sum(sum(arg) for arg in args) + + args = [[1 for _ in range(10000)] for _ in range(MAX_ARGS)] + result = ray.get(sum_args.remote(*args)) + assert result == MAX_ARGS * 10000 + + +def test_many_returns(): + @ray.remote(num_returns=MAX_RETURNS) + def f(): + to_return = [] + for _ in range(MAX_RETURNS): + obj = list(range(10000)) + to_return.append(obj) + + return tuple(to_return) + + returned_refs = f.remote() + 
assert len(returned_refs) == MAX_RETURNS + + for ref in returned_refs: + expected = list(range(10000)) + obj = ray.get(ref) + assert obj == expected + + +def test_ray_get_args(): + def with_dese(): + print("Putting test objects:") + refs = [] + for _ in trange(MAX_RAY_GET_ARGS): + obj = list(range(10000)) + refs.append(ray.put(obj)) + + print("Getting objects") + results = ray.get(refs) + assert len(results) == MAX_RAY_GET_ARGS + + print("Asserting correctness") + for obj in tqdm(results): + expected = list(range(10000)) + assert obj == expected + + def with_zero_copy(): + print("Putting test objects:") + refs = [] + for _ in trange(MAX_RAY_GET_ARGS): + obj = np.arange(10000) + refs.append(ray.put(obj)) + + print("Getting objects") + results = ray.get(refs) + assert len(results) == MAX_RAY_GET_ARGS + + print("Asserting correctness") + for obj in tqdm(results): + expected = np.arange(10000) + assert (obj == expected).all() + + with_dese() + print("Done with dese") + with_zero_copy() + print("Done with zero copy") + + +def test_many_queued_tasks(): + sema = Semaphore.remote(0) + + @ray.remote(num_cpus=1) + def block(): + ray.get(sema.acquire.remote()) + + @ray.remote(num_cpus=1) + def f(): + pass + + num_cpus = int(ray.cluster_resources()["CPU"]) + blocked_tasks = [] + for _ in range(num_cpus): + blocked_tasks.append(block.remote()) + + print("Submitting many tasks") + pending_tasks = [] + for _ in trange(MAX_QUEUED_TASKS): + pending_tasks.append(f.remote()) + + # Make sure all the tasks can actually run. 
+ for _ in range(num_cpus): + sema.release.remote() + + print("Unblocking tasks") + for ref in tqdm(pending_tasks): + assert ray.get(ref) is None + + +def test_large_object(): + print("Generating object") + obj = np.zeros(MAX_RAY_GET_SIZE, dtype=np.int8) + print("Putting object") + ref = ray.put(obj) + del obj + print("Getting object") + big_obj = ray.get(ref) + + assert big_obj[0] == 0 + assert big_obj[-1] == 0 + + +ray.init(address="auto") + +args_start = perf_counter() +test_many_args() +args_end = perf_counter() + +assert ray.cluster_resources() == ray.available_resources() +print("Finished many args") + +returns_start = perf_counter() +test_many_returns() +returns_end = perf_counter() + +assert ray.cluster_resources() == ray.available_resources() +print("Finished many returns") + +get_start = perf_counter() +test_ray_get_args() +get_end = perf_counter() + +assert ray.cluster_resources() == ray.available_resources() +print("Finished ray.get on many objects") + +queued_start = perf_counter() +test_many_queued_tasks() +queued_end = perf_counter() + +assert ray.cluster_resources() == ray.available_resources() +print("Finished queueing many tasks") + +large_object_start = perf_counter() +test_large_object() +large_object_end = perf_counter() + +assert ray.cluster_resources() == ray.available_resources() +print("Done") + +args_time = args_end - args_start +returns_time = returns_end - returns_start +get_time = get_end - get_start +queued_time = queued_end - queued_start +large_object_time = large_object_end - large_object_start + +print(f"Many args time: {args_time} ({MAX_ARGS} args)") +print(f"Many returns time: {returns_time} ({MAX_RETURNS} returns)") +print(f"Ray.get time: {get_time} ({MAX_RAY_GET_ARGS} args)") +print(f"Queued task time: {queued_time} ({MAX_QUEUED_TASKS} tasks)") +print(f"Ray.get large object time: {large_object_time} " + f"({MAX_RAY_GET_SIZE} bytes)") diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index 
c60e1c4aa789..018f56bdf941 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -134,7 +134,13 @@ is generally the easiest way to run release tests. The summaries printed by each test should be checked in under ``release_logs/`` on the **master** branch (make a pull request). -5. **ASAN tests** +5. **Scalability envelope tests** + + - Run the tests in `benchmarks/` (with `ray submit --start cluster.yaml `) + - Record the outputted times. + - Whether the results are acceptable is a judgement call. + +6. **ASAN tests** Run the ``ci/asan_tests`` with the commit. This will enable ASAN build and run the whole Python tests to detect memory leaks. From 7a78f4e95960bf8560b0547802f171e2b40e4f6b Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Tue, 26 Jan 2021 04:05:21 -0500 Subject: [PATCH 047/245] [Collective][PR 4/6] NCCL Communicator caching and preliminary stream management (#13030) Co-authored-by: Dacheng Li --- python/ray/util/collective/__init__.py | 18 +- python/ray/util/collective/collective.py | 327 ++++++++-- .../collective_group/nccl_collective_group.py | 609 ++++++++++++------ .../collective/collective_group/nccl_util.py | 50 +- .../examples/nccl_allreduce_example.py | 7 +- ...reduce_example_declare_collective_group.py | 1 - .../nccl_allreduce_multigpu_example.py | 43 ++ .../examples/nccl_p2p_example_multigpu.py | 53 ++ python/ray/util/collective/tests/conftest.py | 39 +- .../distributed_multigpu_tests/__init__.py | 0 .../test_distributed_multigpu_allgather.py | 82 +++ .../test_distributed_multigpu_allreduce.py | 160 +++++ .../test_distributed_multigpu_basic_apis.py | 117 ++++ .../test_distributed_multigpu_broadcast.py | 92 +++ .../test_distributed_multigpu_reduce.py | 173 +++++ ...test_distributed_multigpu_reducescatter.py | 82 +++ .../test_distributed_multigpu_sendrecv.py | 47 ++ .../test_distributed_basic_apis.py | 6 +- .../test_distributed_broadcast.py | 3 +- .../tests/sinlge_node_tests/__init__.py | 0 .../{ => 
sinlge_node_tests}/test_allgather.py | 0 .../{ => sinlge_node_tests}/test_allreduce.py | 0 .../test_basic_apis.py | 6 +- .../{ => sinlge_node_tests}/test_broadcast.py | 0 .../{ => sinlge_node_tests}/test_reduce.py | 0 .../test_reducescatter.py | 0 .../{ => sinlge_node_tests}/test_sendrecv.py | 0 python/ray/util/collective/tests/util.py | 272 +++++++- python/ray/util/collective/types.py | 19 + 29 files changed, 1930 insertions(+), 276 deletions(-) create mode 100644 python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py create mode 100644 python/ray/util/collective/examples/nccl_p2p_example_multigpu.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py create mode 100644 python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py create mode 100644 python/ray/util/collective/tests/sinlge_node_tests/__init__.py rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_allgather.py (100%) rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_allreduce.py (100%) rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_basic_apis.py (97%) rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_broadcast.py (100%) rename 
python/ray/util/collective/tests/{ => sinlge_node_tests}/test_reduce.py (100%) rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_reducescatter.py (100%) rename python/ray/util/collective/tests/{ => sinlge_node_tests}/test_sendrecv.py (100%) diff --git a/python/ray/util/collective/__init__.py b/python/ray/util/collective/__init__.py index 4ae88660702f..694698474062 100644 --- a/python/ray/util/collective/__init__.py +++ b/python/ray/util/collective/__init__.py @@ -1,11 +1,15 @@ -from ray.util.collective.collective import nccl_available, mpi_available, \ +from ray.util.collective.collective import nccl_available, gloo_available, \ is_group_initialized, init_collective_group, destroy_collective_group, \ - get_rank, get_world_size, allreduce, barrier, reduce, broadcast, \ - allgather, reducescatter, send, recv + declare_collective_group, get_rank, get_world_size, allreduce, \ + allreduce_multigpu, barrier, reduce, reduce_multigpu, broadcast, \ + broadcast_multigpu, allgather, allgather_multigpu, reducescatter, \ + reducescatter_multigpu, send, send_multigpu, recv, recv_multigpu __all__ = [ - "nccl_available", "mpi_available", "is_group_initialized", - "init_collective_group", "destroy_collective_group", "get_rank", - "get_world_size", "allreduce", "barrier", "reduce", "broadcast", - "allgather", "reducescatter", "send", "recv" + "nccl_available", "gloo_available", "is_group_initialized", + "init_collective_group", "destroy_collective_group", + "declare_collective_group", "get_rank", "get_world_size", "allreduce", + "allreduce_multigpu", "barrier", "reduce", "reduce_multigpu", "broadcast", + "broadcast_multigpu", "allgather", "allgather_multigpu", "reducescatter", + "reducescatter_multigpu", "send", "send_multigpu", "recv", "recv_multigpu" ] diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index 08f9026b0467..afd523e6bf37 100644 --- a/python/ray/util/collective/collective.py +++ 
b/python/ray/util/collective/collective.py @@ -7,14 +7,9 @@ import ray from ray.util.collective import types -_MPI_AVAILABLE = False +_GLOO_AVAILABLE = False _NCCL_AVAILABLE = True -# try: -# from ray.util.collective.collective_group.mpi_collective_group \ -# import MPIGroup -# except ImportError: -# _MPI_AVAILABLE = False try: from ray.util.collective.collective_group import NCCLGroup except ImportError: @@ -27,8 +22,8 @@ def nccl_available(): return _NCCL_AVAILABLE -def mpi_available(): - return _MPI_AVAILABLE +def gloo_available(): + return _GLOO_AVAILABLE class GroupManager(object): @@ -51,9 +46,11 @@ def create_collective_group(self, backend, world_size, rank, group_name): """ backend = types.Backend(backend) if backend == types.Backend.MPI: + raise RuntimeError("Ray does not support MPI.") + elif backend == types.Backend.GLOO: raise NotImplementedError() elif backend == types.Backend.NCCL: - logger.debug("creating NCCL group: '{}'".format(group_name)) + logger.debug("Creating NCCL group: '{}'...".format(group_name)) g = NCCLGroup(world_size, rank, group_name) self._name_group_map[group_name] = g self._group_name_map[g] = group_name @@ -100,9 +97,9 @@ def init_collective_group(world_size: int, """Initialize a collective group inside an actor process. Args: - world_size (int): the total number of processed in the group. + world_size (int): the total number of processes in the group. rank (int): the rank of the current process. - backend: the CCL backend to use, NCCL or MPI. + backend: the CCL backend to use, NCCL or GLOO. group_name (str): the name of the collective group. Returns: @@ -137,10 +134,13 @@ def declare_collective_group(actors, Args: actors (list): a list of actors to be set in a collective group. - group_options (dict): a dictionary that contains group_name(str), - world_size(int), rank(list of int, e.g. [0,1] - means the first actor is rank 0, and the second - actor is rank 1), backend(str). 
+ world_size (int): the total number of processes in the group. + ranks (List[int]): the rank of each actor. + backend: the CCL backend to use, NCCL or GLOO. + group_name (str): the name of the collective group. + + Returns: + None """ backend = types.Backend(backend) _check_backend_availability(backend) @@ -162,18 +162,25 @@ def declare_collective_group(actors, "Ranks must be a permutation from 0 to '{}'. Got '{}'.".format( len(ranks), "".join([str(r) for r in ranks]))) - assert world_size > 0 - assert all(ranks) >= 0 and all(ranks) < world_size + if world_size <= 0: + raise RuntimeError("World size must be greater than zero. " + "Got '{}'.".format(world_size)) + if not all(ranks) >= 0: + raise RuntimeError("Ranks must be non-negative.") + if not all(ranks) < world_size: + raise RuntimeError("Ranks cannot be greater than world_size.") # avoid a circular dependency from ray.util.collective.util import Info - # store the information into a NamedActor that can be accessed later/ + # store the information into a NamedActor that can be accessed later. name = "info_" + group_name actors_id = [a._ray_actor_id for a in actors] + # TODO (Dacheng): how do we recycle this name actor? info = Info.options(name=name, lifetime="detached").remote() ray.get([info.set_info.remote(actors_id, world_size, ranks, backend)]) +# TODO (we need a declarative destroy() API here.) def destroy_collective_group(group_name: str = "default") -> None: """Destroy a collective group given its group name.""" _check_inside_actor() @@ -206,9 +213,8 @@ def get_world_size(group_name: str = "default") -> int: group_name: the name of the group to query Returns: - The world size of the collective group, - -1 if the group does not exist or the process does - not belong to the group. + The world size of the collective group, -1 if the group does + not exist or the process does not belong to the group. 
""" _check_inside_actor() if not is_group_initialized(group_name): @@ -232,7 +238,29 @@ def allreduce(tensor, group_name: str = "default", op=types.ReduceOp.SUM): g = _check_and_get_group(group_name) opts = types.AllReduceOptions opts.reduceOp = op - g.allreduce(tensor, opts) + g.allreduce([tensor], opts) + + +def allreduce_multigpu(tensor_list: list, + group_name: str = "default", + op=types.ReduceOp.SUM): + """Collective allreduce a list of tensors across the group. + + Args: + tensor_list (List[tensor]): list of tensors to be allreduced, + each on a GPU. + group_name (str): the collective group name to perform allreduce. + + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("Multigpu calls requires NCCL and Cupy.") + _check_tensor_list_input(tensor_list) + g = _check_and_get_group(group_name) + opts = types.AllReduceOptions + opts.reduceOp = op + g.allreduce(tensor_list, opts) def barrier(group_name: str = "default"): @@ -256,8 +284,8 @@ def reduce(tensor, Args: tensor: the tensor to be reduced on this process. - dst_rank: the rank of the destination process. - group_name: the collective group name to perform reduce. + dst_rank (int): the rank of the destination process. + group_name (str): the collective group name to perform reduce. op: The reduce operation. Returns: @@ -271,7 +299,42 @@ def reduce(tensor, opts = types.ReduceOptions() opts.reduceOp = op opts.root_rank = dst_rank - g.reduce(tensor, opts) + opts.root_tensor = 0 + g.reduce([tensor], opts) + + +def reduce_multigpu(tensor_list: list, + dst_rank: int = 0, + dst_tensor: int = 0, + group_name: str = "default", + op=types.ReduceOp.SUM): + """Reduce the tensor across the group to the destination rank + and destination tensor. + + Args: + tensor_list: the list of tensors to be reduced on this process; + each tensor located on a GPU. + dst_rank (int): the rank of the destination process. + dst_tensor: the index of GPU at the destination. 
+ group_name (str): the collective group name to perform reduce. + op: The reduce operation. + + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("Multigpu calls requires NCCL and Cupy.") + _check_tensor_list_input(tensor_list) + g = _check_and_get_group(group_name) + + # check dst rank + _check_rank_valid(g, dst_rank) + _check_root_tensor_valid(len(tensor_list), dst_tensor) + opts = types.ReduceOptions() + opts.reduceOp = op + opts.root_rank = dst_rank + opts.root_tensor = dst_tensor + g.reduce(tensor_list, opts) def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): @@ -279,8 +342,8 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): Args: tensor: the tensor to be broadcasted (src) or received (destination). - src_rank: the rank of the source process. - group_name: he collective group name to perform broadcast. + src_rank (int): the rank of the source process. + group_name (str): the collective group name to perform broadcast. Returns: None @@ -292,7 +355,37 @@ def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): _check_rank_valid(g, src_rank) opts = types.BroadcastOptions() opts.root_rank = src_rank - g.broadcast(tensor, opts) + opts.root_tensor = 0 + g.broadcast([tensor], opts) + + +def broadcast_multigpu(tensor_list, + src_rank: int = 0, + src_tensor: int = 0, + group_name: str = "default"): + """Broadcast the tensor from a source GPU to all other GPUs. + + Args: + tensor_list: the tensors to broadcast (src) or receive (dst). + src_rank (int): the rank of the source process. + src_tensor (int): the index of the source GPU on the source process. + group_name (str): the collective group name to perform broadcast. 
+ + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("Multigpu calls requires NCCL and Cupy.") + _check_tensor_list_input(tensor_list) + g = _check_and_get_group(group_name) + + # check src rank + _check_rank_valid(g, src_rank) + _check_root_tensor_valid(len(tensor_list), src_tensor) + opts = types.BroadcastOptions() + opts.root_rank = src_rank + opts.root_tensor = src_tensor + g.broadcast(tensor_list, opts) def allgather(tensor_list: list, tensor, group_name: str = "default"): @@ -301,7 +394,7 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): Args: tensor_list (list): the results, stored as a list of tensors. tensor: the tensor (to be gathered) in the current process - group_name: the name of the collective group. + group_name (str): the name of the collective group. Returns: None @@ -314,9 +407,33 @@ def allgather(tensor_list: list, tensor, group_name: str = "default"): # Here we make it more strict: len(tensor_list) == world_size. raise RuntimeError( "The length of the tensor list operands to allgather " - "must not be equal to world_size.") + "must be equal to world_size.") + opts = types.AllGatherOptions() + g.allgather([tensor_list], [tensor], opts) + + +def allgather_multigpu(output_tensor_lists: list, + input_tensor_list: list, + group_name: str = "default"): + """Allgather tensors from each gpus of the group into lists. + + Args: + output_tensor_lists (List[List[tensor]]): gathered results, with shape + must be num_gpus * world_size * shape(tensor). + input_tensor_list: (List[tensor]): a list of tensors, with shape + num_gpus * shape(tensor). + group_name (str): the name of the collective group. 
+ + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("Multigpu calls requires NCCL and Cupy.") + _check_tensor_lists_input(output_tensor_lists) + _check_tensor_list_input(input_tensor_list) + g = _check_and_get_group(group_name) opts = types.AllGatherOptions() - g.allgather(tensor_list, tensor, opts) + g.allgather(output_tensor_lists, input_tensor_list, opts) def reducescatter(tensor, @@ -346,11 +463,38 @@ def reducescatter(tensor, "must not be equal to world_size.") opts = types.ReduceScatterOptions() opts.reduceOp = op - g.reducescatter(tensor, tensor_list, opts) + g.reducescatter([tensor], [tensor_list], opts) + + +def reducescatter_multigpu(output_tensor_list, + input_tensor_lists, + group_name: str = "default", + op=types.ReduceOp.SUM): + """Reducescatter a list of tensors across all GPUs. + + Args: + output_tensor_list: the resulted list of tensors, with + shape: num_gpus * shape(tensor). + input_tensor_lists: the original tensors, with shape: + num_gpus * world_size * shape(tensor). + group_name (str): the name of the collective group. + op: The reduce operation. + + Returns: + None. + """ + if not types.cupy_available(): + raise RuntimeError("Multigpu calls requires NCCL and Cupy.") + _check_tensor_lists_input(input_tensor_lists) + _check_tensor_list_input(output_tensor_list) + g = _check_and_get_group(group_name) + opts = types.ReduceScatterOptions() + opts.reduceOp = op + g.reducescatter(output_tensor_list, input_tensor_lists, opts) def send(tensor, dst_rank: int, group_name: str = "default"): - """Send a tensor to a remote processes synchronously. + """Send a tensor to a remote process synchronously. Args: tensor: the tensor to send. 
@@ -366,7 +510,41 @@ def send(tensor, dst_rank: int, group_name: str = "default"): if dst_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(dst_rank)) - g.send(tensor, dst_rank) + opts = types.SendOptions() + opts.dst_rank = dst_rank + g.send([tensor], opts) + + +def send_multigpu(tensor, + dst_rank: int, + dst_gpu_index: int, + group_name: str = "default"): + """Send a tensor to a remote GPU synchronously. + + The function asssume each process owns >1 GPUs, and the sender + process and receiver process has equal nubmer of GPUs. + + Args: + tensor: the tensor to send, located on a GPU. + dst_rank (int): the rank of the destination process. + dst_gpu_index (int): the destination gpu index. + group_name (str): the name of the collective group. + + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("send_multigpu call requires NCCL.") + _check_single_tensor_input(tensor) + g = _check_and_get_group(group_name) + _check_rank_valid(g, dst_rank) + if dst_rank == g.rank: + raise RuntimeError("The dst_rank '{}' is self. Considering " + "doing GPU to GPU memcpy instead?".format(dst_rank)) + opts = types.SendOptions() + opts.dst_rank = dst_rank + opts.dst_gpu_index = dst_gpu_index + g.send([tensor], opts) def recv(tensor, src_rank: int, group_name: str = "default"): @@ -386,7 +564,41 @@ def recv(tensor, src_rank: int, group_name: str = "default"): if src_rank == g.rank: raise RuntimeError( "The destination rank '{}' is self.".format(src_rank)) - g.recv(tensor, src_rank) + opts = types.RecvOptions() + opts.src_rank = src_rank + g.recv([tensor], opts) + + +def recv_multigpu(tensor, + src_rank: int, + src_gpu_index: int, + group_name: str = "default"): + """Receive a tensor from a remote GPU synchronously. + + The function asssume each process owns >1 GPUs, and the sender + process and receiver process has equal nubmer of GPUs. + + Args: + tensor: the received tensor, located on a GPU. 
+ src_rank (int): the rank of the source process. + src_gpu_index (int): the index of the source gpu on the src process. + group_name (str): the name of the collective group. + + Returns: + None + """ + if not types.cupy_available(): + raise RuntimeError("recv_multigpu call requires NCCL.") + _check_single_tensor_input(tensor) + g = _check_and_get_group(group_name) + _check_rank_valid(g, src_rank) + if src_rank == g.rank: + raise RuntimeError("The dst_rank '{}' is self. Considering " + "doing GPU to GPU memcpy instead?".format(src_rank)) + opts = types.RecvOptions() + opts.src_rank = src_rank + opts.src_gpu_index = src_gpu_index + g.recv([tensor], opts) def _check_and_get_group(group_name): @@ -423,16 +635,6 @@ def _check_and_get_group(group_name): return g -def _check_backend_availability(backend: types.Backend): - """Check whether the backend is available.""" - if backend == types.Backend.MPI: - if not mpi_available(): - raise RuntimeError("MPI is not available.") - elif backend == types.Backend.NCCL: - if not nccl_available(): - raise RuntimeError("NCCL is not available.") - - def _check_single_tensor_input(tensor): """Check if the tensor is with a supported type.""" if isinstance(tensor, np.ndarray): @@ -448,6 +650,16 @@ def _check_single_tensor_input(tensor): type(tensor))) +def _check_backend_availability(backend: types.Backend): + """Check whether the backend is available.""" + if backend == types.Backend.GLOO: + if not gloo_available(): + raise RuntimeError("GLOO is not available.") + elif backend == types.Backend.NCCL: + if not nccl_available(): + raise RuntimeError("NCCL is not available.") + + def _check_inside_actor(): """Check if currently it is inside a Ray actor/task.""" worker = ray.worker.global_worker @@ -462,8 +674,8 @@ def _check_rank_valid(g, rank: int): """Check the rank: 0 <= rank < world_size.""" if rank < 0: raise ValueError("rank '{}' is negative.".format(rank)) - if rank > g.world_size: - raise ValueError("rank '{}' is greater than world 
size " + if rank >= g.world_size: + raise ValueError("rank '{}' must be less than world size " "'{}'".format(rank, g.world_size)) @@ -476,3 +688,24 @@ def _check_tensor_list_input(tensor_list): raise RuntimeError("Got an empty list of tensors.") for t in tensor_list: _check_single_tensor_input(t) + + +def _check_tensor_lists_input(tensor_lists): + """Check if the input is a list of lists of supported tensor types.""" + if not isinstance(tensor_lists, list): + raise RuntimeError("The input must be a list of lists of tensors. " + "Got '{}'.".format(type(tensor_lists))) + if not tensor_lists: + raise RuntimeError(f"Did not receive tensors. Got: {tensor_lists}") + for t in tensor_lists: + _check_tensor_list_input(t) + + +def _check_root_tensor_valid(length, root_tensor): + """Check the root_tensor device is 0 <= root_tensor < length""" + if root_tensor < 0: + raise ValueError("root_tensor '{}' is negative.".format(root_tensor)) + if root_tensor >= length: + raise ValueError( + "root_tensor '{}' is greater than the number of GPUs: " + "'{}'".format(root_tensor, length)) diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index ba8c7d2dbb08..4cc693f11479 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -11,15 +11,11 @@ from ray.util.collective.const import get_nccl_store_name from ray.util.collective.types import AllReduceOptions, \ BarrierOptions, Backend, ReduceOptions, BroadcastOptions, \ - AllGatherOptions, ReduceScatterOptions + AllGatherOptions, ReduceScatterOptions, SendOptions, \ + RecvOptions logger = logging.getLogger(__name__) -# TODO(Hao): -# (1) stream management, instead of using the default stream, -# using a dedicate stream -# (2) communicator management and support num_gpus > 2 per actor. 
- class Rendezvous: """A rendezvous class for different actor/task processes to meet. @@ -31,13 +27,18 @@ class Rendezvous: process. Args: - group_name (str): the unique user-specified group name. + store_key (str): the unique store key, usually as a concatanation + of group_name and communicator key. See `get_nccl_communicator` + for more details. """ - def __init__(self, group_name): - if not group_name: - raise ValueError("Invalid group name.") - self._group_name = group_name + def __init__(self, store_key): + if not store_key: + raise ValueError( + "Invalid store_key. The store_key is a concatenation of " + "'group_name' and the 'communicator_key'. See the " + "docstring of `get_nccl_communicator` for details.") + self._store_key = store_key self._store_name = None self._store = None @@ -53,7 +54,7 @@ def meet(self, timeout_s=180): if timeout_s <= 0: raise ValueError("The 'timeout' argument must be positive. " "Got '{}'.".format(timeout_s)) - self._store_name = get_nccl_store_name(self._group_name) + self._store_name = get_nccl_store_name(self._store_key) timeout_delta = datetime.timedelta(seconds=timeout_s) elapsed = datetime.timedelta(seconds=0) start_time = datetime.datetime.now() @@ -72,7 +73,9 @@ def meet(self, timeout_s=180): break if not self._store: raise RuntimeError("Unable to meet other processes " - "at the rendezvous store.") + "at the rendezvous store. If you are using " + "P2P communication, please check if tensors " + "are put in the correct GPU. ") @property def store(self): @@ -83,8 +86,9 @@ def get_nccl_id(self, timeout_s=180): Args: timeout_s: timeout in seconds. + Return: - str: the NCCLUniqueID if successful. + uid (str): the NCCLUniqueID if successful. 
""" if not self._store: raise ValueError("Rendezvous store is not setup.") @@ -110,55 +114,52 @@ def __init__(self, world_size, rank, group_name): """Init an NCCL collective group.""" super(NCCLGroup, self).__init__(world_size, rank, group_name) - # TODO(Hao): change this to a be a cache - self._collective_comm_cache = None - self._p2p_comm_cache = {} + # communicator and stream cache. + # TODO (Hao): we need a lock here... + self._dev_comm_map = {} + self._dev_streams_map = {} + + # record the used GPU IDs. + self._used_gpu_indices = set() if nccl_util.get_nccl_build_version() < 2000: raise RuntimeError("NCCL in Ray requires NCCL >= 2.0.") - # TODO(Hao): check version here if nccl_util.get_nccl_runtime_version() < 2704: logger.warning("NCCL send/recv calls requires NCCL>=2.7.4") - # Setup a tensor for barrier calls - self._barrier_tensor = cupy.array([1]) - def destroy_group(self): """Destroy the group and release NCCL communicators.""" - if self._collective_comm_cache: - self.barrier() - # We also need a barrier call here. - stream = self._get_cuda_stream() - stream.synchronize() - # destroy the communicator - self._collective_comm_cache.destroy() - self._collective_comm_cache = None - - if self.rank == 0: - self._destroy_store(self.group_name) - - if self._p2p_comm_cache: - for key, comm in self._p2p_comm_cache.items(): - comm.destroy() - min_rank, max_rank = self._parse_p2p_group_key(key) - if self.rank == min_rank: - self._destroy_store(key) - self._p2p_comm_cache[key] = None - for key in list(self._p2p_comm_cache.keys()): - del self._p2p_comm_cache[key] - self._p2p_comm_cache = None - + if len(self._dev_comm_map.keys()) > 0: + + # TODO(Hao): check this barrier call + # self.barrier() + + # Destroy the communicators and streams. 
+ for comm_key, comms in self._dev_comm_map.items(): + for c in comms: + c.destroy() + self._dev_comm_map[comm_key] = None + + if self.rank == 0: + for comm_key in self._dev_comm_map: + assert not self._dev_comm_map[comm_key] + group_key = self._generate_group_key(comm_key) + self._destroy_store(group_key) + self._barrier_tensor = None + self._dev_comm_map = None + self._dev_streams_map = None super(NCCLGroup, self).destroy_group() @classmethod def backend(cls): return Backend.NCCL - def allreduce(self, tensor, allreduce_options=AllReduceOptions()): - """AllReduce the tensor across the collective group following options. + def allreduce(self, tensors, allreduce_options=AllReduceOptions()): + """AllReduce tensors across the collective group following options. Args: - tensor: the tensor to be reduced, each tensor locates on a GPU. + tensors (List): the list of tensors to be reduced. Each tensor must + reside on one GPU of the current process. allreduce_options: allreduce options. Returns: @@ -174,29 +175,41 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp), stream.ptr) - self._collective(tensor, tensor, collective_fn) + self._collective(tensors, tensors, collective_fn) def barrier(self, barrier_options=BarrierOptions()): """Blocks until all processes reach this barrier. Args: - barrier_options: + barrier_options: barrier options. Returns: None """ - self.allreduce(self._barrier_tensor) - - def reduce(self, tensor, reduce_options=ReduceOptions()): - """Reduce tensor to a destination process following options. + # Get the device list. 
+ if self._used_gpu_indices: + devices = list(self._used_gpu_indices) + else: + devices = list(range(nccl_util.get_num_gpus())) + barrier_tensors = [None] * len(devices) + for i, d in enumerate(devices): + with nccl_util.Device(d): + barrier_tensors[i] = cupy.array([1]) + self.allreduce(barrier_tensors) + + def reduce(self, tensors, reduce_options=ReduceOptions()): + """Reduce tensors to a destination gpu following options. Args: - tensor: the tensor to be reduced. - reduce_options: reduce options + tensors (List): the list of tensors to be reduced, each tensor + must reside on one gpu of the current process. + reduce_options: reduce options. Returns: None """ + root_rank = len(tensors) * reduce_options.root_rank \ + + reduce_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.reduce( @@ -205,40 +218,43 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), nccl_util.get_nccl_reduce_op(reduce_options.reduceOp), - reduce_options.root_rank, stream.ptr) + root_rank, stream.ptr) - self._collective(tensor, tensor, collective_fn) + self._collective(tensors, tensors, collective_fn) - def broadcast(self, tensor, broadcast_options=BroadcastOptions()): - """Broadcast tensor to all other processes following options. + def broadcast(self, tensors, broadcast_options=BroadcastOptions()): + """Broadcast tensors to all other gpus following options. Args: - tensor: the tensor to be broadcasted. + tensors (List): tensors to be broadcast or received. broadcast_options: broadcast options. 
Returns: None """ + root_rank = len(tensors) * broadcast_options.root_rank \ + + broadcast_options.root_tensor def collective_fn(input_tensor, output_tensor, comm, stream): comm.broadcast( nccl_util.get_tensor_ptr(input_tensor), nccl_util.get_tensor_ptr(output_tensor), nccl_util.get_tensor_n_elements(input_tensor), - nccl_util.get_nccl_tensor_dtype(input_tensor), - broadcast_options.root_rank, stream.ptr) + nccl_util.get_nccl_tensor_dtype(input_tensor), root_rank, + stream.ptr) - self._collective(tensor, tensor, collective_fn) + self._collective(tensors, tensors, collective_fn) def allgather(self, - tensor_list, - tensor, + tensor_lists, + tensors, allgather_options=AllGatherOptions()): - """Allgather tensors across the group into a list of tensors. + """Allgather tensors across gpus into a list of tensors. Args: - tensor_list: the tensor list to store the results. - tensor: the tensor to be allgather-ed across the group. + tensor_lists (List[List[Tensor]]): allgathered tensors. + tensors: the list of tensors to allgather across the group. + Each tensor must lolcate on a GPU of the process. allgather_options: allgather options. Returns: @@ -252,30 +268,36 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_tensor_n_elements(input_tensor), nccl_util.get_nccl_tensor_dtype(input_tensor), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) - flattened_output_tensor = _flatten_for_scatter_gather( - tensor_list, copy=False) + _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) + output_flattened = [ + _flatten_for_scatter_gather(tensor_list, copy=False) + for tensor_list in tensor_lists + ] def postprocess_fn(stream): - for i, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(tensor, flattened_output_tensor[i]) + # TODO(Hao): designate a copy stream. 
+ for i, tensor_list in enumerate(tensor_lists): + for j, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(tensor, output_flattened[i][j]) self._collective( - tensor, - flattened_output_tensor, + tensors, + output_flattened, collective_fn, postprocess_fn=postprocess_fn) def reducescatter(self, - tensor, - tensor_list, + tensors, + tensor_lists, reducescatter_options=ReduceScatterOptions()): - """Reducescatter a list of tensors across the group. + """Reduce the scatter a list of tensors across the group. Args: - tensor: the output tensor (could be unspecified). - tensor_list: the list of tensor to be reduced then scattered. - reducescatter_options: reducescatter options. + tensors (List): the output tensors (could be unspecified), each + located on a GPU of the current process. + tensor_lists (List[List]): the list of tensors to be reduced then + scattered. + reducescatter_options: reduce-scatter options. Returns: None @@ -290,26 +312,30 @@ def collective_fn(input_tensor, output_tensor, comm, stream): nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp), stream.ptr) - _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list) - flattened_input_tensor = _flatten_for_scatter_gather( - tensor_list, copy=False) + _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists) + input_flattened = [ + _flatten_for_scatter_gather(tensor_list, copy=False) + for tensor_list in tensor_lists + ] def preprocess_fn(stream): - for i, tensor in enumerate(tensor_list): - nccl_util.copy_tensor(flattened_input_tensor[i], tensor) + # TODO(Hao): designate a copy stream. + for i, tensor_list in enumerate(tensor_lists): + for j, tensor in enumerate(tensor_list): + nccl_util.copy_tensor(input_flattened[i][j], tensor) self._collective( - flattened_input_tensor, - tensor, + input_flattened, + tensors, collective_fn, preprocess_fn=preprocess_fn) - def send(self, tensor, dst_rank): - """Send tensor to a destination process in the group. 
+ def send(self, tensors, send_options=SendOptions()): + """Send a tensor to a destination gpu in the group. Args: - tensor: the tensor to send. - dst_rank: the rank of the destination process. + tensors (List): the tensor to send. + send_options: send options. Returns: None @@ -321,14 +347,15 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensor, p2p_fn, dst_rank) + self._point2point(tensors, p2p_fn, send_options.dst_rank, + send_options.dst_gpu_index) - def recv(self, tensor, src_rank): - """Receive tensor from a source process in the group. + def recv(self, tensors, recv_options=RecvOptions()): + """Receive a tensor from a source gpu in the group. Args: - tensor: the received tensor. - src_rank: the rank of the source process. + tensors (List): the received tensor. + recv_options: Receive options. Returns: None @@ -340,128 +367,218 @@ def p2p_fn(tensor, comm, stream, peer): nccl_util.get_tensor_n_elements(tensor), nccl_util.get_nccl_tensor_dtype(tensor), peer, stream.ptr) - self._point2point(tensor, p2p_fn, src_rank) + self._point2point(tensors, p2p_fn, recv_options.src_rank, + recv_options.src_gpu_index) + + def _get_nccl_collective_communicator(self, comm_key, device_list): + """Create or retrieve an NCCL communicator from cache. + + If the communicator is found in cache, return the communicator. If not, + a communicator and a stream will be created and put in cache. + TODO(Hao): this function is not thread-safe now. - def _get_nccl_collective_communicator(self): - """Create or retrieve a cached NCCL communicator. + Args: + comm_key (str): the key to query the communicator cache. + device_list (List): a list of GPU devices of the current process + that participates into the collective. Returns: - communicator + communicator: the NCCL communicator corresponded to the devices. 
""" - if not self._collective_comm_cache: - # create the communicator - if self.rank == 0: - group_uid = self._generate_nccl_uid(self.group_name) - else: - rendezvous = Rendezvous(self.group_name) - rendezvous.meet() - group_uid = rendezvous.get_nccl_id() - self._collective_comm_cache = \ - nccl_util.create_nccl_communicator(self.world_size, - group_uid, - self.rank) - return self._collective_comm_cache - - def _get_nccl_p2p_communicator(self, rank1, rank2): + if not comm_key: + raise RuntimeError("Got empty communicator key.") + for d in device_list: + self._used_gpu_indices.add(d) + + # TODO(Hao): lock the _dev_comm_map here. + if comm_key in self._dev_comm_map: + return self._dev_comm_map[comm_key] + + group_key = self._generate_group_key(comm_key) + if self.rank == 0: + nccl_uid = self._generate_nccl_uid(group_key) + else: + rendezvous = Rendezvous(group_key) + rendezvous.meet() + nccl_uid = rendezvous.get_nccl_id() + + # Now create the communicators + actual_world_size = len(device_list) * self.world_size + comms = [None] * len(device_list) + streams = [None] * len(device_list) + nccl_util.groupStart() + for i, device in enumerate(device_list): + actual_rank = self.rank * len(device_list) + i + with nccl_util.Device(device): + comms[i] = nccl_util.create_nccl_communicator( + actual_world_size, nccl_uid, actual_rank) + streams[i] = cupy.cuda.Stream.null + # Stream(non_blocking=True) + nccl_util.groupEnd() + self._dev_comm_map[comm_key] = comms + self._dev_streams_map[comm_key] = streams + return comms + + @staticmethod + def _sync_streams(): + """Let NCCL streams wait for current streams for every device.""" + # FIXME: This behavior is different from nccl document. It seems like + # cupy allocate tensors on null streams. + cupy.cuda.Stream.null.synchronize() + + def _get_nccl_p2p_communicator(self, comm_key, my_gpu_idx, peer_rank, + peer_gpu_idx): """Create or retrieve an NCCL communicator for p2p tasks. - Args: - rank1 (int): source rank. 
- rank2 (int): destination rank. + Note(Hao): this function is not thread-safe now. + Args: + comm_key (str): communicator key. + my_gpu_idx (int): the gpu index on the current process. + peer_rank (int): the rank of the destination process. + peer_gpu_idx (int): the gpu index on the peer process. Returns: communicator """ - min_rank = min(rank1, rank2) - max_rank = max(rank1, rank2) - my_rank = 0 if self.rank == min_rank else 1 - p2p_group_key = self._generate_p2p_group_key(min_rank, max_rank) - comm = self._p2p_comm_cache.get(p2p_group_key) - if not comm: - if self.rank == min_rank: - group_uid = self._generate_nccl_uid(p2p_group_key) - else: - rendezvous = Rendezvous(p2p_group_key) - rendezvous.meet() - group_uid = rendezvous.get_nccl_id() - comm = nccl_util.create_nccl_communicator(2, group_uid, my_rank) - self._p2p_comm_cache[p2p_group_key] = comm - return comm - - def _generate_p2p_group_key(self, min_rank, max_rank): - return self.group_name + "_" + str(min_rank) + "_" + str(max_rank) + if not comm_key: + raise RuntimeError("Got empty communicator key.") + + # TODO(Hao): lock the _dev_comm_map here. + if comm_key in self._dev_comm_map: + return self._dev_comm_map[comm_key] + + # Note (Hao): This is a bit complex so I decide to take a note here. + # Here we need to consider three cases: + # Case 1: src_rank != dst_rank, hence the send and recv happen on + # different process (actors/tasks); each process makes independent + # collective calls and manages corresponding communicators. + # Case 2: src_rank == dst_rank, src_gpu_idx == dst_gpu_idx; for + # this case, we simply throw a RuntimeError; + # Case 3: src_rank == dst_rank, src_gpu_idx != dst_gpu_idx, which + # means the send and recv will be called on the same process. We + # DO NOT support this case for now. We need to properly scope: + # (1) communicators creation, and + # (2) send/recv calls + # using groupStart(( and groupEnd() calls to avoid deadlocks. 
+ if self.rank < peer_rank: + my_p2p_rank = 0 + elif self.rank > peer_rank: + my_p2p_rank = 1 + else: + raise RuntimeError( + "Send and recv happens on the same process! " + "ray.util.collective does not support this case as of now. " + "Alternatively, consider doing GPU to GPU memcpy?") + + group_key = self._generate_group_key(comm_key) + if my_p2p_rank == 0: + nccl_uid = self._generate_nccl_uid(group_key) + else: + rendezvous = Rendezvous(group_key) + rendezvous.meet() + nccl_uid = rendezvous.get_nccl_id() + + # create the p2p communicators + with nccl_util.Device(my_gpu_idx): + comm = nccl_util.create_nccl_communicator(2, nccl_uid, my_p2p_rank) + stream = cupy.cuda.Stream.null + # Stream(non_blocking=True) + self._dev_comm_map[comm_key] = [comm] + self._dev_streams_map[comm_key] = [stream] + return [comm] + + def _generate_group_key(self, comm_key): + """Generate a unique key used to initialize the KV store. + + The group key is a concatenation of the communicator key and + the group name, following: [comm_key]@[group_name]. + """ + return comm_key + "@" + self.group_name @staticmethod - def _parse_p2p_group_key(key): - strs = key.split("_") - return int(strs[-2]), int(strs[-1]) + def _destroy_store(group_key): + """Destroy the KV store (Ray named actor). - @staticmethod - def _destroy_store(group_name): - store_name = get_nccl_store_name(group_name) + Args: + group_key (str): the unique key to retrieve the KV store. + + Returns: + None + """ + store_name = get_nccl_store_name(group_key) store = ray.get_actor(store_name) # ray.get([store.__ray_terminate__.remote()]) ray.kill(store) - def _generate_nccl_uid(self, name): - """Generate an NCCL UID by calling the NCCL API. + def _generate_nccl_uid(self, key): + """Generate an NCCL unique ID for initializing communicators. + + The method will also create a KV store using Ray named actor and store + the NCCLUniqueID in the store. The store needs to be garbage collected + when destroying the collective group. 
Args: - name: the name of the collective group. + key (str): the key of the . Returns: - str: NCCL uid. + NCCLUniqueID (str): NCCL unique ID. """ group_uid = nccl_util.get_nccl_unique_id() - store_name = get_nccl_store_name(name) + store_name = get_nccl_store_name(key) # Avoid a potential circular dependency in ray/actor.py from ray.util.collective.util import NCCLUniqueIDStore store = NCCLUniqueIDStore.options( name=store_name, lifetime="detached").remote(store_name) - ray.wait([store.set_id.remote(group_uid)]) + ray.get([store.set_id.remote(group_uid)]) return group_uid - @staticmethod - def _get_cuda_stream(): - """Obtain an idle stream from a stream pool for the collective task.""" - # TODO: implement a simple stream manager. - return cupy.cuda.Stream.null - def _collective(self, - input_tensor, - output_tensor, + input_tensors, + output_tensors, collective_fn, preprocess_fn=None, postprocess_fn=None): """A method to encapsulate all collective calls. Args: - input_tensor: the input tensor. - output_tensor: the output tensor. + input_tensors: the list of the input tensors. + output_tensors: the list of the output tensors. collective_fn: the collective function call. - preprocess_fn: preprocess function to call before collectives. - postprocess_fn: postprocess function to call after collectives. + preprocess_fn: preprocess procedures before collective calls. + postprocess_fn: postprocess procedures after collective calls. 
Returns: None """ - comm = self._get_nccl_collective_communicator() - stream = self._get_cuda_stream() + _check_gpu_tensors(input_tensors) + _check_gpu_tensors(output_tensors) + + devices = nccl_util.get_tensor_device_list(input_tensors) + key = _get_comm_key_from_devices(devices) + comms = self._get_nccl_collective_communicator(key, devices) + streams = self._dev_streams_map[key] + + # TODO(Hao): sync streams and events + self._sync_streams() # Make the collective call if preprocess_fn: - preprocess_fn(stream) - collective_fn(input_tensor, output_tensor, comm, stream) + preprocess_fn(streams) + nccl_util.groupStart() + for i, tensor in enumerate(input_tensors): + collective_fn(tensor, output_tensors[i], comms[i], streams[i]) + nccl_util.groupEnd() if postprocess_fn: - postprocess_fn(stream) + postprocess_fn(streams) - def _point2point(self, tensor, p2p_fn, peer_rank: int): - """A method to encapsulate all p2p calls. + def _point2point(self, tensors, p2p_fn, peer_rank: int, peer_gpu_idx: int): + """A method to encapsulate all peer-to-peer calls (i.e., send/recv). Args: - tensor: the tensor to be sent/received. + tensors: the tensor to send or receive. p2p_fn: the p2p function call. - peer_rank (int): the peer rank of the current process. + peer_rank (int): the rank of the peer process. + peer_gpu_idx (int): the index of the gpu on the peer process. Returns: None @@ -471,13 +588,24 @@ def _point2point(self, tensor, p2p_fn, peer_rank: int): raise RuntimeError("P2p send/recv requires NCCL >= 2.7.4. " "Got '{}'.".format( nccl_util.get_nccl_runtime_version())) + _check_gpu_tensors(tensors) + + # we currently only support single device to single device send/recv. 
+ assert len(tensors) == 1 + my_gpu_idx = nccl_util.get_tensor_device(tensors[0]) + comm_key = _get_comm_key_send_recv(self.rank, my_gpu_idx, peer_rank, + peer_gpu_idx) + comms = self._get_nccl_p2p_communicator(comm_key, my_gpu_idx, + peer_rank, peer_gpu_idx) + streams = self._dev_streams_map[comm_key] + + # TODO(Hao): sync streams and events + self._sync_streams() # We have made sure that self.rank != peer_rank during API check. peer_p2p_rank = 0 if self.rank > peer_rank else 1 - comm = self._get_nccl_p2p_communicator(self.rank, peer_rank) - stream = self._get_cuda_stream() - # Make the p2p call: - p2p_fn(tensor, comm, stream, peer_p2p_rank) + for i, tensor in enumerate(tensors): + p2p_fn(tensors[i], comms[i], streams[i], peer_p2p_rank) def _flatten_for_scatter_gather(tensor_list, copy=False): @@ -496,29 +624,130 @@ def _flatten_for_scatter_gather(tensor_list, copy=False): # note we need a cupy dtype here. dtype = nccl_util.get_cupy_tensor_dtype(t) buffer_shape = [len(tensor_list)] + nccl_util.get_tensor_shape(t) - buffer = cupy.empty(buffer_shape, dtype=dtype) + device = nccl_util.get_tensor_device(t) + with nccl_util.Device(device): + buffer = cupy.empty(buffer_shape, dtype=dtype) if copy: for i, tensor in enumerate(tensor_list): nccl_util.copy_tensor(buffer[i], tensor) return buffer -def _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list): - """Check the compatibility between tensor input and tensor list inputs.""" - if not tensor_list: - raise RuntimeError("Got empty list of tensors.") - dtype = nccl_util.get_nccl_tensor_dtype(tensor) - shape = nccl_util.get_tensor_shape(tensor) - for t in tensor_list: - # check dtype - dt = nccl_util.get_nccl_tensor_dtype(t) +def _check_inputs_compatibility_for_scatter_gather(tensors, tensor_lists): + """Check the compatibility between tensor input and tensor list input.""" + if not tensors or not isinstance(tensors, list): + raise RuntimeError( + "The first argument 'tensors' expects a list of tensors.") + if 
not tensor_lists or not isinstance(tensor_lists, list): + raise RuntimeError("The second argument 'tensor_lists' " + "expects a list of tensor list.") + dtype = nccl_util.get_nccl_tensor_dtype(tensors[0]) + shape = nccl_util.get_tensor_shape(tensors[0]) + for i, tensor_list in enumerate(tensor_lists): + # check all tensor in `tensors` match. + dt = nccl_util.get_nccl_tensor_dtype(tensors[i]) if dt != dtype: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same dtype. Got '{}' and '{}'" - "".format(dt, dtype)) + "have the same dtype. Got '{}' and '{}'." + .format(dt, dtype)) # Note: typically CCL libraries only requires they have the same - # number of elements; - # Here we make it more strict -- we require exact shape match. - if nccl_util.get_tensor_shape(t) != shape: + # number of elements; Here we make it more strict -- we require + # exact shape match. + s = nccl_util.get_tensor_shape(tensors[i]) + if s != shape: raise RuntimeError("All tensor operands to scatter/gather must " - "have the same shape.") + "have the same shape. Got '{}' and '{}'." + .format(s, shape)) + # check all tensors in `tensor_lists` match. + for t in tensor_lists[i]: + # check dtype + dt = nccl_util.get_nccl_tensor_dtype(t) + if dt != dtype: + raise RuntimeError( + "All tensor operands to scatter/gather must " + "have the same dtype. Got '{}' and '{}'.".format( + dt, dtype)) + s = nccl_util.get_tensor_shape(t) + if s != shape: + raise RuntimeError( + "All tensor operands to scatter/gather must " + "have the same shape. Got '{}' and '{}'.".format(s, shape)) + + +def _check_gpu_tensors(tensors): + """Check all tensors are distributed on different GPUs.""" + if not tensors or not isinstance(tensors, list): + raise RuntimeError("'tensors' must be a nonempty list.") + if len(tensors) > nccl_util.get_num_gpus(): + raise RuntimeError("Tensor list cannot be larger than the number" + "of available GPUs. 
Got {} > {}.".format( + len(tensors), nccl_util.get_num_gpus())) + t0 = tensors[0] + dt = nccl_util.get_nccl_tensor_dtype(t0) + s = nccl_util.get_tensor_shape(t0) + d = nccl_util.get_tensor_device(t0) + for i, t in enumerate(tensors): + if i == 0: + continue + # We need to check the following: + # (1) tensor is cuda (already checked during API) + # (2) tensor dtype + # (3) tensor shape match + # (4) each tensor is on a different GPU + dtype = nccl_util.get_nccl_tensor_dtype(t) + if dt != dtype: + raise RuntimeError("Tensors must have identical dtype. Got: '{}'." + .format(dtype)) + shape = nccl_util.get_tensor_shape(t) + if s != shape: + raise RuntimeError("Tensor must have identical shape. Got: '{}'." + .format(shape)) + device = nccl_util.get_tensor_device(t) + if device == d: + raise RuntimeError("Tensor must be on distinct GPUs.") + + +def _get_comm_key_from_devices(devices): + """Return a key from a list of devices for collective calls. + + For example, if the tensors are on gpus 0, 1, 2, 3, + then the key would be "0,1,2,3". + + Args: + devices(list): a list of GPU device indices + + Returns: + str: a string represents the key to query the communicator cache. + + """ + return ",".join([str(d) for d in devices]) + + +def _get_comm_key_send_recv(my_rank, my_gpu_idx, peer_rank, peer_gpu_idx): + """Return a key given source and destination ranks for p2p tasks. + + The p2p key is in the following form: + [min_rank]_[gpu_index]:[max_rank]_[gpu_index]. + + Args: + my_rank (int): the rank of the source process. + my_gpu_idx (int): the source gpu index on the process. + peer_rank (int): the rank of the destination process. + peer_gpu_idx (int): the destination gpu index on the process. + + Returns: + comm_key (str): a string key to query the communication cache. 
+ """ + if my_rank < peer_rank: + lower_key = str(my_rank) + "_" + str(my_gpu_idx) + higher_key = str(peer_rank) + "_" + str(peer_gpu_idx) + elif my_rank > peer_rank: + lower_key = str(peer_rank) + "_" + str(peer_gpu_idx) + higher_key = str(my_rank) + "_" + str(my_gpu_idx) + else: + raise RuntimeError( + "Send and recv happens on the same process. ray.util.collective " + "does not support this case as of now. Alternatively, consider " + "doing GPU to GPU memcpy?") + comm_key = lower_key + ":" + higher_key + return comm_key diff --git a/python/ray/util/collective/collective_group/nccl_util.py b/python/ray/util/collective/collective_group/nccl_util.py index 889c8c443f36..36895d79b884 100644 --- a/python/ray/util/collective/collective_group/nccl_util.py +++ b/python/ray/util/collective/collective_group/nccl_util.py @@ -3,9 +3,12 @@ try: import cupy from cupy.cuda import nccl + from cupy.cuda import Device # noqa: F401 from cupy.cuda.nccl import get_version from cupy.cuda.nccl import get_build_version from cupy.cuda.nccl import NcclCommunicator + from cupy.cuda.nccl import groupStart # noqa: F401 + from cupy.cuda.nccl import groupEnd # noqa: F401 except ImportError: raise ImportError("NCCL in Ray requires Cupy being available!") @@ -74,6 +77,11 @@ } +def get_num_gpus(): + """Returns the number of compute-capable GPUs.""" + return cupy.cuda.runtime.getDeviceCount() + + def get_nccl_build_version(): return get_build_version() @@ -90,14 +98,12 @@ def create_nccl_communicator(world_size, nccl_unique_id, rank): """Create an NCCL communicator using NCCL APIs. Args: - world_size (int): the number of processes of this communcator group. + world_size (int): the number of processes of this communicator group. nccl_unique_id (str): the NCCLUniqueID for this group. rank (int): the rank of this process. Returns: comm (nccl.ncclComm_t): an NCCL communicator. """ - # TODO(Hao): make this inside the NCCLComm class, - # and implement the abort method. Make it RAII. 
comm = NcclCommunicator(world_size, nccl_unique_id, rank) return comm @@ -149,7 +155,7 @@ def get_tensor_ptr(tensor): if torch_available(): if isinstance(tensor, torch.Tensor): if not tensor.is_cuda: - raise RuntimeError("torch tensor must be on gpu.") + raise RuntimeError("Torch tensor must be on GPU.") return tensor.data_ptr() raise ValueError("Unsupported tensor type. Got: {}. Supported " "GPU tensor types are: torch.Tensor, " @@ -194,6 +200,24 @@ def get_tensor_strides(tensor): "cupy.ndarray.".format(type(tensor))) +def get_tensor_device(tensor): + """Return the GPU index of a tensor.""" + if isinstance(tensor, cupy.ndarray): + try: + device = tensor.device.id + except AttributeError as exec: + raise RuntimeError("The tensor is not on a valid GPU.") \ + from exec + elif torch_available() and isinstance(tensor, torch.Tensor): + device = tensor.device.index + if not isinstance(device, int): + raise RuntimeError("The tensor is not on a valid GPU.") + else: + raise ValueError("Unsupported tensor type. " + "Got: {}.".format(type(tensor))) + return device + + def copy_tensor(dst_tensor, src_tensor): """Copy the content from src_tensor to dst_tensor. @@ -228,3 +252,21 @@ def copy_tensor(dst_tensor, src_tensor): raise ValueError("Unsupported tensor type. Got: {} and {}. Supported " "GPU tensor types are: torch.Tensor, cupy.ndarray." .format(type(dst_tensor), type(src_tensor))) + + +def get_tensor_device_list(tensors): + """Returns the gpu devices of the list of input tensors. + + Args: + tensors(list): a list of tensors, each locates on a GPU. + + Returns: + list: the list of GPU devices. + + """ + if not isinstance(tensors, list): + raise RuntimeError( + "Expect a list of tensors each locates on a GPU device. 
" + "Got: '{}'.".format(type(tensors))) + devices = [get_tensor_device(t) for t in tensors] + return devices diff --git a/python/ray/util/collective/examples/nccl_allreduce_example.py b/python/ray/util/collective/examples/nccl_allreduce_example.py index 7010d69249f2..797924621a52 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example.py @@ -11,12 +11,11 @@ def __init__(self): self.recv = cp.zeros((4, ), dtype=cp.float32) def setup(self, world_size, rank): - collective.init_collective_group("nccl", world_size, rank, "default") + collective.init_collective_group(world_size, rank, "nccl", "default") return True def compute(self): collective.allreduce(self.send, "default") - print(self.send) return self.send def destroy(self): @@ -24,11 +23,8 @@ def destroy(self): if __name__ == "__main__": - send = cp.ones((4, ), dtype=cp.float32) - ray.init(num_gpus=2) - num_workers = 2 workers = [] init_rets = [] @@ -38,5 +34,4 @@ def destroy(self): init_rets.append(w.setup.remote(num_workers, i)) _ = ray.get(init_rets) results = ray.get([w.compute.remote() for w in workers]) - # print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py index 9d0335dbab11..106ea31b2b7f 100644 --- a/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py +++ b/python/ray/util/collective/examples/nccl_allreduce_example_declare_collective_group.py @@ -30,5 +30,4 @@ def compute(self): } collective.declare_collective_group(workers, **_options) results = ray.get([w.compute.remote() for w in workers]) - print(results) ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py new file mode 100644 index 
000000000000..88b75802e880 --- /dev/null +++ b/python/ray/util/collective/examples/nccl_allreduce_multigpu_example.py @@ -0,0 +1,43 @@ +import ray +import cupy as cp + +import ray.util.collective as collective +from cupy.cuda import Device + + +@ray.remote(num_gpus=2) +class Worker: + def __init__(self): + with Device(0): + self.send1 = cp.ones((4, ), dtype=cp.float32) + with Device(1): + self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 + + self.recv = cp.zeros((4, ), dtype=cp.float32) + + def setup(self, world_size, rank): + collective.init_collective_group(world_size, rank, "nccl", "177") + return True + + def compute(self): + collective.allreduce_multigpu([self.send1, self.send2], "177") + return [self.send1, self.send2], self.send1.device, self.send2.device + + def destroy(self): + collective.destroy_collective_group("177") + + +if __name__ == "__main__": + ray.init(address="auto") + num_workers = 2 + workers = [] + init_rets = [] + for i in range(num_workers): + w = Worker.remote() + workers.append(w) + init_rets.append(w.setup.remote(num_workers, i)) + a = ray.get(init_rets) + results = ray.get([w.compute.remote() for w in workers]) + print(results) + ray.get([w.destroy.remote() for w in workers]) + ray.shutdown() diff --git a/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py new file mode 100644 index 000000000000..7ff637a5bd68 --- /dev/null +++ b/python/ray/util/collective/examples/nccl_p2p_example_multigpu.py @@ -0,0 +1,53 @@ +import ray +import cupy as cp + +import ray.util.collective as collective +from cupy.cuda import Device + + +@ray.remote(num_gpus=2) +class Worker: + def __init__(self): + with Device(0): + self.send1 = cp.ones((4, ), dtype=cp.float32) + with Device(1): + self.send2 = cp.ones((4, ), dtype=cp.float32) * 2 + + with Device(0): + self.recv1 = cp.zeros((4, ), dtype=cp.float32) + with Device(1): + self.recv2 = cp.zeros((4, ), dtype=cp.float32) + self.rank = 
-1 + + def setup(self, world_size, rank): + self.rank = rank + collective.init_collective_group(world_size, rank, "nccl", "8") + return True + + def compute(self): + if self.rank == 0: + with Device(0): + collective.send_multigpu(self.send1 * 2, 1, 1, "8") + else: + # with Device(1): + collective.recv_multigpu(self.recv2, 0, 0, "8") + return self.recv2 + + def destroy(self): + collective.destroy_collective_group("8") + + +if __name__ == "__main__": + ray.init(address="auto") + num_workers = 2 + workers = [] + init_rets = [] + for i in range(num_workers): + w = Worker.remote() + workers.append(w) + init_rets.append(w.setup.remote(num_workers, i)) + a = ray.get(init_rets) + results = ray.get([w.compute.remote() for w in workers]) + print(results) + ray.get([w.destroy.remote() for w in workers]) + ray.shutdown() diff --git a/python/ray/util/collective/tests/conftest.py b/python/ray/util/collective/tests/conftest.py index ab5b3765d166..341142ec050d 100644 --- a/python/ray/util/collective/tests/conftest.py +++ b/python/ray/util/collective/tests/conftest.py @@ -1,30 +1,41 @@ """Some fixtures for collective tests.""" -import pytest +import logging +import pytest import ray +from ray.util.collective.collective_group.nccl_collective_group \ + import _get_comm_key_from_devices, _get_comm_key_send_recv from ray.util.collective.const import get_nccl_store_name +logger = logging.getLogger(__name__) +logger.setLevel("INFO") + # TODO (Hao): remove this clean_up function as it sometimes crashes Ray. 
def clean_up(): group_names = ["default", "test", "123?34!", "default2", "random"] group_names.extend([str(i) for i in range(10)]) max_world_size = 4 - p2p_group_names = [] + all_keys = [] for name in group_names: + devices = [[0], [0, 1], [1, 0]] + for d in devices: + collective_communicator_key = _get_comm_key_from_devices(d) + all_keys.append(collective_communicator_key + "@" + name) for i in range(max_world_size): for j in range(max_world_size): - if i <= j: - p2p_group_name = name + "_" + str(i) + "_" + str(j) - p2p_group_names.append(p2p_group_name) - all_names = group_names + p2p_group_names - for group_name in all_names: - store_name = get_nccl_store_name(group_name) + if i < j: + p2p_communicator_key = _get_comm_key_send_recv(i, 0, j, 0) + all_keys.append(p2p_communicator_key + "@" + name) + for group_key in all_keys: + store_name = get_nccl_store_name(group_key) try: actor = ray.get_actor(store_name) except ValueError: actor = None if actor: + logger.debug("Killing actor with group_key: '{}' and store: '{}'." + .format(group_key, store_name)) ray.kill(actor) @@ -41,6 +52,18 @@ def ray_start_single_node_2_gpus(): # my own on-premise cluster before run this fixture. @pytest.fixture def ray_start_distributed_2_nodes_4_gpus(): + # The cluster has a setup of 2 nodes, each node with 2 + # GPUs. Each actor will be allocated 1 GPU. + ray.init("auto") + yield + clean_up() + ray.shutdown() + + +@pytest.fixture +def ray_start_distributed_multigpu_2_nodes_4_gpus(): + # The cluster has a setup of 2 nodes, each node with 2 + # GPUs. Each actor will be allocated 2 GPUs. 
ray.init("auto") yield clean_up() diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py b/python/ray/util/collective/tests/distributed_multigpu_tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py new file mode 100644 index 000000000000..c4cabcd45524 --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allgather.py @@ -0,0 +1,82 @@ +"""Test the allgather API on a distributed Ray cluster.""" +import pytest +import ray + +import cupy as cp +import torch + +from ray.util.collective.tests.util import \ + create_collective_multigpu_workers, \ + init_tensors_for_gather_scatter_multigpu + + +@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) +@pytest.mark.parametrize("array_size", + [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) +def test_allgather_different_array_size( + ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, + tensor_backend): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + init_tensors_for_gather_scatter_multigpu( + actors, array_size=array_size, tensor_backend=tensor_backend) + results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + for k in range(actual_world_size): + if tensor_backend == "cupy": + assert (results[i][j][k] == cp.ones( + array_size, dtype=cp.float32)).all() + else: + assert (results[i][j][k] == torch.ones( + array_size, dtype=torch.float32).cuda(j)).all() + + +def test_allgather_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * 
num_gpu_per_worker + shape = [10, 10] + actors, _ = create_collective_multigpu_workers(world_size) + + # tensor is pytorch, list is cupy + for i, a in enumerate(actors): + ray.get([ + a.set_buffer.remote( + shape, tensor_type0="torch", tensor_type1="torch") + ]) + ray.get([ + a.set_list_buffer.remote( + shape, tensor_type0="cupy", tensor_type1="cupy") + ]) + results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + for k in range(actual_world_size): + assert (results[i][j][k] == cp.ones(shape, + dtype=cp.float32)).all() + + # tensor is cupy, list is pytorch + for i, a in enumerate(actors): + ray.get([ + a.set_buffer.remote( + shape, tensor_type0="cupy", tensor_type1="cupy") + ]) + ray.get([ + a.set_list_buffer.remote( + shape, tensor_type0="torch", tensor_type1="torch") + ]) + results = ray.get([a.do_allgather_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + for k in range(actual_world_size): + assert (results[i][j][k] == torch.ones( + shape, dtype=torch.float32).cuda(j)).all() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py new file mode 100644 index 000000000000..b681a08490b0 --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_allreduce.py @@ -0,0 +1,160 @@ +"""Test the collective allreduice API on a distributed Ray cluster.""" +import pytest +import logging + +import cupy as cp + +import ray +from ray.util.collective.types import ReduceOp +from ray.util.collective.tests.util import create_collective_multigpu_workers + +logger = logging.getLogger(__name__) +logger.setLevel("DEBUG") + + 
+@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) +def test_allreduce_multigpu_different_name( + ray_start_distributed_multigpu_2_nodes_4_gpus, group_name): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers( + num_workers=world_size, group_name=group_name) + results = ray.get( + [a.do_allreduce_multigpu.remote(group_name) for a in actors]) + assert (results[0] == cp.ones( + (10, ), dtype=cp.float32) * actual_world_size).all() + assert (results[1] == cp.ones( + (10, ), dtype=cp.float32) * actual_world_size).all() + + +@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) +def test_allreduce_multigpu_different_array_size( + ray_start_distributed_multigpu_2_nodes_4_gpus, array_size): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + ray.get([a.set_buffer.remote(array_size) for a in actors]) + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones( + (array_size, ), dtype=cp.float32) * actual_world_size).all() + assert (results[1] == cp.ones( + (array_size, ), dtype=cp.float32) * actual_world_size).all() + + +def test_allreduce_multigpu_destroy( + ray_start_distributed_multigpu_2_nodes_4_gpus, + backend="nccl", + group_name="default"): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones( + (10, ), dtype=cp.float32) * actual_world_size).all() + assert (results[1] == cp.ones( + (10, ), dtype=cp.float32) * actual_world_size).all() + + # destroy the group and try do work, should fail + ray.get([a.destroy_group.remote() for a in actors]) + with pytest.raises(RuntimeError): + results 
= ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + + # reinit the same group and all reduce + ray.get([ + actor.init_group.remote(world_size, i, backend, group_name) + for i, actor in enumerate(actors) + ]) + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones((10, ), dtype=cp.float32) * actual_world_size + * actual_world_size).all() + assert (results[1] == cp.ones((10, ), dtype=cp.float32) * actual_world_size + * actual_world_size).all() + + +def test_allreduce_multigpu_multiple_group( + ray_start_distributed_multigpu_2_nodes_4_gpus, + backend="nccl", + num_groups=5): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + for group_name in range(1, num_groups): + ray.get([ + actor.init_group.remote(world_size, i, backend, str(group_name)) + for i, actor in enumerate(actors) + ]) + for i in range(num_groups): + group_name = "default" if i == 0 else str(i) + results = ray.get( + [a.do_allreduce_multigpu.remote(group_name) for a in actors]) + assert (results[0] == cp.ones( + (10, ), dtype=cp.float32) * (actual_world_size**(i + 1))).all() + + +def test_allreduce_multigpu_different_op( + ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + # check product + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get( + [a.do_allreduce_multigpu.remote(op=ReduceOp.PRODUCT) for a in actors]) + assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 120).all() + assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 120).all() + + # check min + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get( + [a.do_allreduce_multigpu.remote(op=ReduceOp.MIN) for 
a in actors]) + assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 2).all() + assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 2).all() + + # check max + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get( + [a.do_allreduce_multigpu.remote(op=ReduceOp.MAX) for a in actors]) + assert (results[0] == cp.ones((10, ), dtype=cp.float32) * 5).all() + assert (results[1] == cp.ones((10, ), dtype=cp.float32) * 5).all() + + +@pytest.mark.parametrize("dtype", + [cp.uint8, cp.float16, cp.float32, cp.float64]) +def test_allreduce_multigpu_different_dtype( + ray_start_distributed_multigpu_2_nodes_4_gpus, dtype): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + ray.get([a.set_buffer.remote([10], dtype=dtype) for a in actors]) + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones( + (10, ), dtype=dtype) * actual_world_size).all() + assert (results[1] == cp.ones( + (10, ), dtype=dtype) * actual_world_size).all() + + +def test_allreduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus): + # import torch + world_size = 2 + actual_world_size = 4 + actors, _ = create_collective_multigpu_workers(world_size) + ray.get(actors[0].set_buffer.remote([10])) + ray.get(actors[1].set_buffer.remote( + [10], tensor_type0="torch", tensor_type1="torch")) + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones((10, )) * actual_world_size).all() + + ray.get(actors[0].set_buffer.remote( + [10], tensor_type0="cupy", tensor_type1="torch")) + ray.get(actors[1].set_buffer.remote( + [10], tensor_type0="torch", tensor_type1="cupy")) + results = ray.get([a.do_allreduce_multigpu.remote() for a in actors]) + assert (results[0] == cp.ones((10, )) * actual_world_size).all() diff --git 
a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py new file mode 100644 index 000000000000..40be55dd2e0b --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_basic_apis.py @@ -0,0 +1,117 @@ +"""Test the collective group APIs.""" +import pytest +import ray +from random import shuffle + +from ray.util.collective.tests.util import create_collective_multigpu_workers + + +@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) +def test_init_two_actors(ray_start_distributed_multigpu_2_nodes_4_gpus, + group_name): + world_size = 2 + actors, results = create_collective_multigpu_workers( + world_size, group_name) + for i in range(world_size): + assert (results[i]) + + +def test_report_num_gpus(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, results = create_collective_multigpu_workers(world_size) + num_gpus = ray.get([actor.report_num_gpus.remote() for actor in actors]) + assert num_gpus == [2, 2] + + +def test_get_rank(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + actor0_rank = ray.get(actors[0].report_rank.remote()) + assert actor0_rank == 0 + actor1_rank = ray.get(actors[1].report_rank.remote()) + assert actor1_rank == 1 + + # create a second group with a different name, and different + # orders of ranks. 
+ new_group_name = "default2" + ranks = list(range(world_size)) + shuffle(ranks) + _ = ray.get([ + actor.init_group.remote( + world_size, ranks[i], group_name=new_group_name) + for i, actor in enumerate(actors) + ]) + actor0_rank = ray.get(actors[0].report_rank.remote(new_group_name)) + assert actor0_rank == ranks[0] + actor1_rank = ray.get(actors[1].report_rank.remote(new_group_name)) + assert actor1_rank == ranks[1] + + +def test_availability(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + actor0_nccl_availability = ray.get( + actors[0].report_nccl_availability.remote()) + assert actor0_nccl_availability + actor0_gloo_availability = ray.get( + actors[0].report_gloo_availability.remote()) + assert not actor0_gloo_availability + + +def test_is_group_initialized(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + # check group is_init + actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) + assert actor0_is_init + actor0_is_init = ray.get( + actors[0].report_is_group_initialized.remote("random")) + assert not actor0_is_init + actor0_is_init = ray.get( + actors[0].report_is_group_initialized.remote("123")) + assert not actor0_is_init + actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) + assert actor1_is_init + actor1_is_init = ray.get( + actors[0].report_is_group_initialized.remote("456")) + assert not actor1_is_init + + +def test_destroy_group(ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + # Now destroy the group at actor0 + ray.wait([actors[0].destroy_group.remote()]) + actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) + assert not actor0_is_init + + # should go well as the group `random` does not exist at all + ray.wait([actors[0].destroy_group.remote("random")]) + + 
actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) + assert actor1_is_init + ray.wait([actors[1].destroy_group.remote("random")]) + actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) + assert actor1_is_init + ray.wait([actors[1].destroy_group.remote("default")]) + actor1_is_init = ray.get(actors[1].report_is_group_initialized.remote()) + assert not actor1_is_init + + # Now reconstruct the group using the same name + init_results = ray.get([ + actor.init_group.remote(world_size, i) + for i, actor in enumerate(actors) + ]) + for i in range(world_size): + assert init_results[i] + actor0_is_init = ray.get(actors[0].report_is_group_initialized.remote()) + assert actor0_is_init + actor1_is_init = ray.get(actors[0].report_is_group_initialized.remote()) + assert actor1_is_init + + +if __name__ == "__main__": + import pytest + import sys + + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py new file mode 100644 index 000000000000..5ded5bce35e8 --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_broadcast.py @@ -0,0 +1,92 @@ +"""Test the broadcast API.""" +import pytest +import cupy as cp +import ray + +from ray.util.collective.tests.util import create_collective_multigpu_workers + + +@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) +@pytest.mark.parametrize("src_rank", [0, 1]) +@pytest.mark.parametrize("src_gpu_index", [0, 1]) +def test_broadcast_different_name( + ray_start_distributed_multigpu_2_nodes_4_gpus, group_name, src_rank, + src_gpu_index): + world_size = 2 + num_gpu_per_worker = 2 + actors, _ = create_collective_multigpu_workers( + num_workers=world_size, group_name=group_name) + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) 
+ ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + + results = ray.get([ + a.do_broadcast_multigpu.remote( + group_name=group_name, + src_rank=src_rank, + src_gpu_index=src_gpu_index) for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + val = (src_rank + 1) * 2 + src_gpu_index + assert ( + results[i][j] == cp.ones([10], dtype=cp.float32) * val).all() + + +@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) +@pytest.mark.parametrize("src_rank", [0, 1]) +@pytest.mark.parametrize("src_gpu_index", [0, 1]) +def test_broadcast_different_array_size( + ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, src_rank, + src_gpu_index): + world_size = 2 + num_gpu_per_worker = 2 + actors, _ = create_collective_multigpu_workers(world_size) + ray.get(actors[0].set_buffer.remote([array_size], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([array_size], value0=4, value1=5)) + results = ray.get([ + a.do_broadcast_multigpu.remote( + src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + val = (src_rank + 1) * 2 + src_gpu_index + assert (results[i][j] == cp.ones( + (array_size, ), dtype=cp.float32) * val).all() + + +@pytest.mark.parametrize("src_rank", [0, 1]) +@pytest.mark.parametrize("src_gpu_index", [0, 1]) +def test_broadcast_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, + src_rank, src_gpu_index): + import torch + world_size = 2 + num_gpu_per_worker = 2 + actors, _ = create_collective_multigpu_workers(world_size) + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote( + [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) + results = ray.get([ + a.do_broadcast_multigpu.remote( + src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + val = (src_rank + 
1) * 2 + src_gpu_index + if i == 0: + assert (results[i][j] == cp.ones([10], dtype=cp.float32) * + val).all() + else: + assert (results[i][j] == torch.ones([10]).cuda(j) * val).all() + + +@pytest.mark.parametrize("src_rank", [3, 4]) +@pytest.mark.parametrize("src_gpu_index", [2, 3]) +def test_broadcast_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, + src_rank, src_gpu_index): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + with pytest.raises(ValueError): + _ = ray.get([ + a.do_broadcast_multigpu.remote( + src_rank=src_rank, src_gpu_index=src_gpu_index) for a in actors + ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py new file mode 100644 index 000000000000..8ac5d54c1c12 --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reduce.py @@ -0,0 +1,173 @@ +"""Test the reduce API.""" +import pytest +import cupy as cp +import ray +from ray.util.collective.types import ReduceOp + +from ray.util.collective.tests.util import create_collective_multigpu_workers + + +@pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) +@pytest.mark.parametrize("dst_rank", [0, 1]) +@pytest.mark.parametrize("dst_gpu_index", [0, 1]) +def test_reduce_different_name(ray_start_distributed_multigpu_2_nodes_4_gpus, + group_name, dst_rank, dst_gpu_index): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers( + num_workers=world_size, group_name=group_name) + results = ray.get([ + a.do_reduce_multigpu.remote( + group_name, dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) + for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if i == dst_rank and j == dst_gpu_index: + assert (results[i][j] == cp.ones( + 
(10, ), dtype=cp.float32) * actual_world_size).all() + else: + assert (results[i][j] == cp.ones((10, ), + dtype=cp.float32)).all() + + +@pytest.mark.parametrize("array_size", [2, 2**5, 2**10, 2**15, 2**20]) +@pytest.mark.parametrize("dst_rank", [0, 1]) +@pytest.mark.parametrize("dst_gpu_index", [0, 1]) +def test_reduce_different_array_size( + ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, dst_rank, + dst_gpu_index): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(num_workers=world_size) + + ray.get(actors[0].set_buffer.remote(array_size)) + ray.get(actors[1].set_buffer.remote(array_size)) + results = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if i == dst_rank and j == dst_gpu_index: + assert (results[i][j] == cp.ones( + (array_size, ), dtype=cp.float32) * + actual_world_size).all() + else: + assert (results[i][j] == cp.ones( + (array_size, ), dtype=cp.float32)).all() + + +@pytest.mark.parametrize("dst_rank", [0, 1]) +@pytest.mark.parametrize("dst_gpu_index", [0, 1]) +def test_reduce_different_op(ray_start_distributed_multigpu_2_nodes_4_gpus, + dst_rank, dst_gpu_index): + world_size = 2 + num_gpu_per_worker = 2 + actors, _ = create_collective_multigpu_workers(world_size) + + # check product + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, + dst_gpu_index=dst_gpu_index, + op=ReduceOp.PRODUCT) for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if i == dst_rank and j == dst_gpu_index: + assert (results[i][j] == cp.ones( + (10, ), dtype=cp.float32) * 120).all() + else: + val = (i + 1) * 2 + j + assert (results[i][j] == cp.ones( + (10, 
), dtype=cp.float32) * val).all() + + # check min + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MIN) + for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if i == dst_rank and j == dst_gpu_index: + assert (results[i][j] == cp.ones( + (10, ), dtype=cp.float32) * 2).all() + else: + val = (i + 1) * 2 + j + assert (results[i][j] == cp.ones( + (10, ), dtype=cp.float32) * val).all() + + # check max + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote([10], value0=4, value1=5)) + results = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, dst_gpu_index=dst_gpu_index, op=ReduceOp.MAX) + for a in actors + ]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if i == dst_rank and j == dst_gpu_index: + assert (results[i][j] == cp.ones( + (10, ), dtype=cp.float32) * 5).all() + else: + val = (i + 1) * 2 + j + assert (results[i][j] == cp.ones( + (10, ), dtype=cp.float32) * val).all() + + +@pytest.mark.parametrize("dst_rank", [0, 1]) +@pytest.mark.parametrize("dst_gpu_index", [0, 1]) +def test_reduce_torch_cupy(ray_start_distributed_multigpu_2_nodes_4_gpus, + dst_rank, dst_gpu_index): + import torch + world_size = 2 + num_gpu_per_worker = 2 + actors, _ = create_collective_multigpu_workers(world_size) + ray.get(actors[0].set_buffer.remote([10], value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote( + [10], value0=4, value1=5, tensor_type0="torch", tensor_type1="torch")) + + results = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors + ]) + + for i in range(world_size): + for j in range(num_gpu_per_worker): + val = (i + 1) * 2 + j + if dst_rank == i and dst_gpu_index == j: + if i == 0: + assert 
(results[i][j] == cp.ones([10], dtype=cp.float32) * + 14).all() + else: + assert ( + results[i][j] == torch.ones([10]).cuda(j) * 14).all() + else: + if i == 0: + assert (results[i][j] == cp.ones([10], dtype=cp.float32) * + val).all() + else: + assert ( + results[i][j] == torch.ones([10]).cuda(j) * val).all() + + +@pytest.mark.parametrize("dst_rank", [3, 4]) +@pytest.mark.parametrize("dst_gpu_index", [2, 3]) +def test_reduce_invalid_rank(ray_start_distributed_multigpu_2_nodes_4_gpus, + dst_rank, dst_gpu_index): + world_size = 2 + actors, _ = create_collective_multigpu_workers(world_size) + with pytest.raises(ValueError): + _ = ray.get([ + a.do_reduce_multigpu.remote( + dst_rank=dst_rank, dst_gpu_index=dst_gpu_index) for a in actors + ]) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py new file mode 100644 index 000000000000..48f72389bf89 --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_reducescatter.py @@ -0,0 +1,82 @@ +"""Test the collective reducescatter API on a distributed Ray cluster.""" +import pytest +import ray + +import cupy as cp +import torch + +from ray.util.collective.tests.util import \ + create_collective_multigpu_workers, \ + init_tensors_for_gather_scatter_multigpu + + +@pytest.mark.parametrize("tensor_backend", ["cupy", "torch"]) +@pytest.mark.parametrize("array_size", + [2, 2**5, 2**10, 2**15, 2**20, [2, 2], [5, 5, 5]]) +def test_reducescatter_different_array_size( + ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, + tensor_backend): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + actors, _ = create_collective_multigpu_workers(world_size) + + init_tensors_for_gather_scatter_multigpu( + actors, array_size=array_size, tensor_backend=tensor_backend) + results = 
ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + if tensor_backend == "cupy": + assert (results[i][j] == cp.ones(array_size, dtype=cp.float32) + * actual_world_size).all() + else: + assert (results[i][j] == torch.ones( + array_size, dtype=torch.float32).cuda(j) * + actual_world_size).all() + + +def test_reducescatter_torch_cupy( + ray_start_distributed_multigpu_2_nodes_4_gpus): + world_size = 2 + num_gpu_per_worker = 2 + actual_world_size = world_size * num_gpu_per_worker + shape = [10, 10] + actors, _ = create_collective_multigpu_workers(world_size) + + # tensor is pytorch, list is cupy + for i, a in enumerate(actors): + ray.get([ + a.set_buffer.remote( + shape, tensor_type0="torch", tensor_type1="torch") + ]) + ray.get([ + a.set_list_buffer.remote( + shape, tensor_type0="cupy", tensor_type1="cupy") + ]) + results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + assert (results[i][j] == torch.ones( + shape, dtype=torch.float32).cuda(j) * actual_world_size).all() + + # tensor is cupy, list is pytorch + for i, a in enumerate(actors): + ray.get([ + a.set_buffer.remote( + shape, tensor_type0="cupy", tensor_type1="cupy") + ]) + ray.get([ + a.set_list_buffer.remote( + shape, tensor_type0="torch", tensor_type1="torch") + ]) + results = ray.get([a.do_reducescatter_multigpu.remote() for a in actors]) + for i in range(world_size): + for j in range(num_gpu_per_worker): + assert (results[i][j] == cp.ones(shape, dtype=cp.float32) * + actual_world_size).all() + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py new file mode 100644 index 
000000000000..a88fdb34ec8f --- /dev/null +++ b/python/ray/util/collective/tests/distributed_multigpu_tests/test_distributed_multigpu_sendrecv.py @@ -0,0 +1,47 @@ +"""Test the send/recv API.""" +import cupy as cp +import pytest +import ray + +from ray.util.collective.tests.util import create_collective_multigpu_workers + + +# @pytest.mark.parametrize("group_name", ["default", "test", "123?34!"]) +@pytest.mark.parametrize("dst_rank", [0, 1]) +@pytest.mark.parametrize("src_rank", [0, 1]) +@pytest.mark.parametrize("dst_gpu_index", [0, 1]) +@pytest.mark.parametrize("src_gpu_index", [0, 1]) +@pytest.mark.parametrize("array_size", + [2**10, 2**15, 2**20, [2, 2], [5, 9, 10, 85]]) +def test_sendrecv(ray_start_distributed_multigpu_2_nodes_4_gpus, array_size, + src_rank, dst_rank, src_gpu_index, dst_gpu_index): + if src_rank == dst_rank: + return + world_size = 2 + actors, _ = create_collective_multigpu_workers(num_workers=world_size) + + ray.get(actors[0].set_buffer.remote(array_size, value0=2, value1=3)) + ray.get(actors[1].set_buffer.remote(array_size, value0=4, value1=5)) + + refs = [] + for i in range(world_size): + refs.append(actors[i].get_buffer.remote()) + refs[src_rank][src_gpu_index] = actors[src_rank].do_send_multigpu.remote( + dst_rank=dst_rank, + dst_gpu_index=dst_gpu_index, + src_gpu_index=src_gpu_index) + refs[dst_rank][dst_gpu_index] = actors[dst_rank].do_recv_multigpu.remote( + src_rank=src_rank, + src_gpu_index=src_gpu_index, + dst_gpu_index=dst_gpu_index) + results = [] + results_flattend = ray.get(refs[0] + refs[1]) + results.append([results_flattend[0], results_flattend[1]]) + results.append([results_flattend[2], results_flattend[3]]) + assert (results[src_rank][src_gpu_index] == cp.ones( + array_size, dtype=cp.float32) * ( + (src_rank + 1) * 2 + src_gpu_index)).all() + assert (results[dst_rank][dst_gpu_index] == cp.ones( + array_size, dtype=cp.float32) * ( + (src_rank + 1) * 2 + src_gpu_index)).all() + ray.get([a.destroy_group.remote() for a in actors]) 
diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py index 0f17b79ba63e..a0dd4508001f 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_basic_apis.py @@ -69,9 +69,9 @@ def test_availability(ray_start_distributed_2_nodes_4_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_mpi_availability = ray.get( - actors[0].report_mpi_availability.remote()) - assert not actor0_mpi_availability + actor0_gloo_availability = ray.get( + actors[0].report_gloo_availability.remote()) + assert not actor0_gloo_availability def test_is_group_initialized(ray_start_distributed_2_nodes_4_gpus): diff --git a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py index 408ebce76b8a..5c1ecd7f14d8 100644 --- a/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py +++ b/python/ray/util/collective/tests/distributed_tests/test_distributed_broadcast.py @@ -60,7 +60,8 @@ def test_broadcast_torch_cupy(ray_start_distributed_2_nodes_4_gpus, src_rank): assert (results[1] == torch.ones((10, )).cuda() * world_size).all() -def test_broadcast_invalid_rank(ray_start_single_node_2_gpus, src_rank=3): +def test_broadcast_invalid_rank(ray_start_distributed_2_nodes_4_gpus, + src_rank=3): world_size = 2 actors, _ = create_collective_workers(world_size) with pytest.raises(ValueError): diff --git a/python/ray/util/collective/tests/sinlge_node_tests/__init__.py b/python/ray/util/collective/tests/sinlge_node_tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/util/collective/tests/test_allgather.py 
b/python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py similarity index 100% rename from python/ray/util/collective/tests/test_allgather.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_allgather.py diff --git a/python/ray/util/collective/tests/test_allreduce.py b/python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py similarity index 100% rename from python/ray/util/collective/tests/test_allreduce.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_allreduce.py diff --git a/python/ray/util/collective/tests/test_basic_apis.py b/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py similarity index 97% rename from python/ray/util/collective/tests/test_basic_apis.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py index 8c23442a3b4c..29a3ec3f4a15 100644 --- a/python/ray/util/collective/tests/test_basic_apis.py +++ b/python/ray/util/collective/tests/sinlge_node_tests/test_basic_apis.py @@ -64,9 +64,9 @@ def test_availability(ray_start_single_node_2_gpus): actor0_nccl_availability = ray.get( actors[0].report_nccl_availability.remote()) assert actor0_nccl_availability - actor0_mpi_availability = ray.get( - actors[0].report_mpi_availability.remote()) - assert not actor0_mpi_availability + actor0_gloo_availability = ray.get( + actors[0].report_gloo_availability.remote()) + assert not actor0_gloo_availability def test_is_group_initialized(ray_start_single_node_2_gpus): diff --git a/python/ray/util/collective/tests/test_broadcast.py b/python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py similarity index 100% rename from python/ray/util/collective/tests/test_broadcast.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_broadcast.py diff --git a/python/ray/util/collective/tests/test_reduce.py b/python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py similarity index 100% rename from 
python/ray/util/collective/tests/test_reduce.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_reduce.py diff --git a/python/ray/util/collective/tests/test_reducescatter.py b/python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py similarity index 100% rename from python/ray/util/collective/tests/test_reducescatter.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_reducescatter.py diff --git a/python/ray/util/collective/tests/test_sendrecv.py b/python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py similarity index 100% rename from python/ray/util/collective/tests/test_sendrecv.py rename to python/ray/util/collective/tests/sinlge_node_tests/test_sendrecv.py diff --git a/python/ray/util/collective/tests/util.py b/python/ray/util/collective/tests/util.py index 259ee24c9727..a5fb97a53ad5 100644 --- a/python/ray/util/collective/tests/util.py +++ b/python/ray/util/collective/tests/util.py @@ -1,20 +1,29 @@ import cupy as cp +import logging import ray import ray.util.collective as col from ray.util.collective.types import Backend, ReduceOp +from ray.util.collective.collective_group.nccl_util import get_num_gpus import torch +logger = logging.getLogger(__name__) + @ray.remote(num_gpus=1) class Worker: def __init__(self): + self.buffer = None + self.list_buffer = None + + def init_tensors(self): self.buffer = cp.ones((10, ), dtype=cp.float32) self.list_buffer = [ - cp.ones((10, ), dtype=cp.float32), - cp.ones((10, ), dtype=cp.float32) + cp.ones((10, ), dtype=cp.float32) for _ in range(2) ] + cp.cuda.Stream.null.synchronize() + return True def init_group(self, world_size, @@ -79,8 +88,8 @@ def report_nccl_availability(self): avail = col.nccl_available() return avail - def report_mpi_availability(self): - avail = col.mpi_available() + def report_gloo_availability(self): + avail = col.gloo_available() return avail def report_is_group_initialized(self, group_name="default"): @@ -91,7 +100,11 @@ def 
report_is_group_initialized(self, group_name="default"): def create_collective_workers(num_workers=2, group_name="default", backend="nccl"): - actors = [Worker.remote() for _ in range(num_workers)] + actors = [None] * num_workers + for i in range(num_workers): + actor = Worker.remote() + ray.get([actor.init_tensors.remote()]) + actors[i] = actor world_size = num_workers init_results = ray.get([ actor.init_group.remote(world_size, i, backend, group_name) @@ -112,7 +125,7 @@ def init_tensors_for_gather_scatter(actors, t = torch.ones(array_size, dtype=torch.float32).cuda() * (i + 1) else: raise RuntimeError("Unsupported tensor backend.") - ray.wait([a.set_buffer.remote(t)]) + ray.get([a.set_buffer.remote(t)]) if tensor_backend == "cupy": list_buffer = [ cp.ones(array_size, dtype=dtype) for _ in range(world_size) @@ -125,3 +138,250 @@ def init_tensors_for_gather_scatter(actors, else: raise RuntimeError("Unsupported tensor backend.") ray.get([a.set_list_buffer.remote(list_buffer) for a in actors]) + + +@ray.remote(num_gpus=2) +class MultiGPUWorker: + def __init__(self): + self.buffer0 = None + self.buffer1 = None + self.list_buffer0 = None + self.list_buffer1 = None + + def __del__(self): + self.buffer0 = None + self.buffer1 = None + self.list_buffer0 = None + self.list_buffer1 = None + + def init_tensors(self): + with cp.cuda.Device(0): + self.buffer0 = cp.ones((10, ), dtype=cp.float32) + self.list_buffer0 = [ + cp.ones((10, ), dtype=cp.float32) for _ in range(4) + ] + with cp.cuda.Device(1): + self.buffer1 = cp.ones((10, ), dtype=cp.float32) + self.list_buffer1 = [ + cp.ones((10, ), dtype=cp.float32) for _ in range(4) + ] + cp.cuda.Stream.null.synchronize() + return True + + def init_group(self, + world_size, + rank, + backend=Backend.NCCL, + group_name="default"): + col.init_collective_group(world_size, rank, backend, group_name) + return True + + def set_buffer(self, + size, + value0=1.0, + value1=1.0, + dtype=cp.float32, + tensor_type0="cupy", + 
tensor_type1="cupy"): + if tensor_type0 == "cupy": + with cp.cuda.Device(0): + self.buffer0 = cp.ones(size, dtype=dtype) * value0 + elif tensor_type0 == "torch": + self.buffer0 = torch.ones( + size, dtype=torch.float32).cuda(0) * value0 + else: + raise RuntimeError() + + if tensor_type1 == "cupy": + with cp.cuda.Device(1): + self.buffer1 = cp.ones(size, dtype=dtype) * value1 + elif tensor_type1 == "torch": + self.buffer1 = torch.ones( + size, dtype=torch.float32).cuda(1) * value1 + else: + raise RuntimeError() + cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + # cp.cuda.Stream.null.synchronize() + return True + + def set_list_buffer(self, + size, + value0=1.0, + value1=1.0, + dtype=cp.float32, + tensor_type0="cupy", + tensor_type1="cupy"): + if tensor_type0 == "cupy": + with cp.cuda.Device(0): + self.list_buffer0 = [ + cp.ones(size, dtype=dtype) * value0 for _ in range(4) + ] + elif tensor_type0 == "torch": + self.list_buffer0 = [ + torch.ones(size, dtype=torch.float32).cuda(0) * value0 + for _ in range(4) + ] + else: + raise RuntimeError() + + if tensor_type1 == "cupy": + with cp.cuda.Device(1): + self.list_buffer1 = [ + cp.ones(size, dtype=dtype) * value1 for _ in range(4) + ] + elif tensor_type1 == "torch": + self.list_buffer1 = [ + torch.ones(size, dtype=torch.float32).cuda(1) * value1 + for _ in range(4) + ] + else: + raise RuntimeError() + cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + return True + + @ray.method(num_returns=2) + def get_buffer(self): + return self.buffer0, self.buffer1 + + def do_allreduce_multigpu(self, group_name="default", op=ReduceOp.SUM): + col.allreduce_multigpu([self.buffer0, self.buffer1], group_name, op) + cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + return self.buffer0 + + def do_reduce_multigpu(self, + group_name="default", + dst_rank=0, + dst_gpu_index=0, + op=ReduceOp.SUM): + col.reduce_multigpu([self.buffer0, self.buffer1], dst_rank, + dst_gpu_index, group_name, op) 
+ cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + return self.buffer0, self.buffer1 + + def do_broadcast_multigpu(self, + group_name="default", + src_rank=0, + src_gpu_index=0): + col.broadcast_multigpu([self.buffer0, self.buffer1], src_rank, + src_gpu_index, group_name) + return self.buffer0, self.buffer1 + + def do_allgather_multigpu(self, group_name="default"): + col.allgather_multigpu([self.list_buffer0, self.list_buffer1], + [self.buffer0, self.buffer1], group_name) + cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + return self.list_buffer0, self.list_buffer1 + + def do_reducescatter_multigpu(self, group_name="default", op=ReduceOp.SUM): + col.reducescatter_multigpu([self.buffer0, self.buffer1], + [self.list_buffer0, self.list_buffer1], + group_name, op) + cp.cuda.Device(0).synchronize() + cp.cuda.Device(1).synchronize() + return self.buffer0, self.buffer1 + + def do_send_multigpu(self, + group_name="default", + dst_rank=0, + dst_gpu_index=0, + src_gpu_index=0): + if src_gpu_index == 0: + col.send_multigpu(self.buffer0, dst_rank, dst_gpu_index, + group_name) + cp.cuda.Device(0).synchronize() + return self.buffer0 + elif src_gpu_index == 1: + col.send_multigpu(self.buffer1, dst_rank, dst_gpu_index, + group_name) + cp.cuda.Device(1).synchronize() + return self.buffer1 + else: + raise RuntimeError() + + def do_recv_multigpu(self, + group_name="default", + src_rank=0, + src_gpu_index=0, + dst_gpu_index=0): + if dst_gpu_index == 0: + col.recv_multigpu(self.buffer0, src_rank, src_gpu_index, + group_name) + cp.cuda.Device(0).synchronize() + return self.buffer0 + elif dst_gpu_index == 1: + col.recv_multigpu(self.buffer1, src_rank, src_gpu_index, + group_name) + cp.cuda.Device(1).synchronize() + return self.buffer1 + else: + raise RuntimeError() + + def destroy_group(self, group_name="default"): + col.destroy_collective_group(group_name) + return True + + def report_rank(self, group_name="default"): + rank = 
col.get_rank(group_name) + return rank + + def report_world_size(self, group_name="default"): + ws = col.get_world_size(group_name) + return ws + + def report_nccl_availability(self): + avail = col.nccl_available() + return avail + + def report_gloo_availability(self): + avail = col.gloo_available() + return avail + + def report_is_group_initialized(self, group_name="default"): + is_init = col.is_group_initialized(group_name) + return is_init + + def report_num_gpus(self): + n_gpus = get_num_gpus() + return n_gpus + + +def create_collective_multigpu_workers(num_workers=2, + group_name="default", + backend="nccl"): + actors = [None] * num_workers + for i in range(num_workers): + actor = MultiGPUWorker.remote() + ray.get([actor.set_buffer.remote([10])], timeout=10) + ray.get([actor.set_list_buffer.remote([10])], timeout=10) + actors[i] = actor + world_size = num_workers + init_results = ray.get([ + actor.init_group.remote(world_size, i, backend, group_name) + for i, actor in enumerate(actors) + ]) + return actors, init_results + + +def init_tensors_for_gather_scatter_multigpu(actors, + array_size=10, + tensor_backend="cupy"): + for i, a in enumerate(actors): + if tensor_backend == "cupy": + ray.get([a.set_buffer.remote(array_size)]) + ray.get([a.set_list_buffer.remote(array_size)]) + elif tensor_backend == "torch": + ray.get([ + a.set_buffer.remote( + array_size, tensor_type0="torch", tensor_type1="torch") + ]) + ray.get([ + a.set_list_buffer.remote( + array_size, tensor_type0="torch", tensor_type1="torch") + ]) + else: + raise RuntimeError("Unsupported tensor backend.") diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index c12dde84cb6a..d3e964486f77 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -30,6 +30,7 @@ class Backend(object): """A class to represent different backends.""" NCCL = "nccl" MPI = "mpi" + GLOO = "gloo" UNRECOGNIZED = "unrecognized" def __new__(cls, name: str): @@ 
-38,6 +39,8 @@ def __new__(cls, name: str): raise ValueError("Unrecognized backend: '{}'. " "Only NCCL is supported".format(name)) if backend == Backend.MPI: + raise RuntimeError("Ray does not support MPI backend.") + if backend == Backend.GLOO: raise NotImplementedError() return backend @@ -67,6 +70,7 @@ class BarrierOptions: class ReduceOptions: reduceOp = ReduceOp.SUM root_rank = 0 + root_tensor = 0 # index for multi-gpu reduce operations timeout_ms = unset_timeout_ms @@ -85,6 +89,7 @@ class AllGatherOptions: @dataclass class BroadcastOptions: root_rank = 0 + root_tensor = 0 timeout_ms = unset_timeout_ms @@ -92,3 +97,17 @@ class BroadcastOptions: class ReduceScatterOptions: reduceOp = ReduceOp.SUM timeout_ms = unset_timeout_ms + + +@dataclass +class SendOptions: + dst_rank = 0 + dst_gpu_index = 0 + timeout_ms = unset_timeout_ms + + +@dataclass +class RecvOptions: + src_rank = 0 + src_gpu_index = 0 + unset_timeout_ms = unset_timeout_ms From ef1f7e4d4215164ae17dbbd7f344939194970b40 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Jan 2021 11:32:17 +0100 Subject: [PATCH 048/245] [tune](deps): Bump smart-open[s3] in /python/requirements (#13699) Bumps [smart-open[s3]](https://github.com/piskvorky/smart_open) from 4.0.1 to 4.1.2. 
- [Release notes](https://github.com/piskvorky/smart_open/releases) - [Changelog](https://github.com/RaRe-Technologies/smart_open/blob/develop/CHANGELOG.md) - [Commits](https://github.com/piskvorky/smart_open/compare/4.0.1...v4.1.2) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index bae7f20ae363..eb72499c1ed9 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -735,7 +735,7 @@ six==1.15.0 # traitlets # wandb # websocket-client -smart_open[s3]==4.0.1 +smart_open[s3]==4.1.2 # via # -c ../requirements.txt # -r requirements_tune.in diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index bb10df777068..99e7fe1a9b53 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -725,7 +725,7 @@ six==1.15.0 # tensorflow-probability # wandb # websocket-client -smart_open[s3]==4.0.1 +smart_open[s3]==4.1.2 # via # -c ../requirements.txt # -r requirements_tune.in From 148b1022d622a552951874340e000448db92dddb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Jan 2021 11:32:56 +0100 Subject: [PATCH 049/245] [tune](deps): Bump autogluon-core in /python/requirements (#13698) Bumps [autogluon-core](https://github.com/awslabs/autogluon) from 0.0.16b20210122 to 0.0.16b20210125. 
- [Release notes](https://github.com/awslabs/autogluon/releases) - [Changelog](https://github.com/awslabs/autogluon/blob/master/docs/ReleaseInstructions.md) - [Commits](https://github.com/awslabs/autogluon/commits) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/requirements/linux-py3.6-requirements_tune.txt | 2 +- python/requirements/linux-py3.7-requirements_tune.txt | 2 +- python/requirements/linux-py3.8-requirements_tune.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/requirements/linux-py3.6-requirements_tune.txt b/python/requirements/linux-py3.6-requirements_tune.txt index eb72499c1ed9..1bafdac84b67 100644 --- a/python/requirements/linux-py3.6-requirements_tune.txt +++ b/python/requirements/linux-py3.6-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 +autogluon.core==0.0.16b20210125 # via gluoncv autograd==1.3 # via autogluon.core diff --git a/python/requirements/linux-py3.7-requirements_tune.txt b/python/requirements/linux-py3.7-requirements_tune.txt index 99e7fe1a9b53..920222b459ef 100644 --- a/python/requirements/linux-py3.7-requirements_tune.txt +++ b/python/requirements/linux-py3.7-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 +autogluon.core==0.0.16b20210125 # via gluoncv autograd==1.3 # via autogluon.core diff --git a/python/requirements/linux-py3.8-requirements_tune.txt b/python/requirements/linux-py3.8-requirements_tune.txt index 8ef61bd51b63..14aade6549ee 100644 --- a/python/requirements/linux-py3.8-requirements_tune.txt +++ b/python/requirements/linux-py3.8-requirements_tune.txt @@ -27,7 +27,7 @@ attrs==20.3.0 # pytest autocfg==0.0.6 # via gluoncv -autogluon.core==0.0.16b20210122 +autogluon.core==0.0.16b20210125 # via gluoncv autograd==1.3 # via autogluon.core From 
5d882b062d3d7ae75475615d4147269a99b2db9c Mon Sep 17 00:00:00 2001 From: Edward Oakes Date: Tue, 26 Jan 2021 12:09:13 -0600 Subject: [PATCH 050/245] [Serve] fix k8s doc (#13713) --- doc/source/serve/deployment.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/deployment.rst b/doc/source/serve/deployment.rst index 5ab65a7a35c1..1ab190595796 100644 --- a/doc/source/serve/deployment.rst +++ b/doc/source/serve/deployment.rst @@ -225,7 +225,7 @@ With the cluster now running, we can run a simple script to start Ray Serve and # Connect to the running Ray cluster. ray.init(address="auto") # Bind on 0.0.0.0 to expose the HTTP server on external IPs. - client = serve.start(http_options={"host": "0.0.0.0"}) + client = serve.start(detached=True, http_options={"host": "0.0.0.0"}) def hello(): return "hello world" From 4aff86bfa709aa90c1a014d1322ee023a1f5457b Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Tue, 26 Jan 2021 10:17:58 -0800 Subject: [PATCH 051/245] [CI] skip failing java tests (#13702) --- java/test/src/main/java/io/ray/test/ActorRestartTest.java | 4 +++- java/test/src/main/java/io/ray/test/ExitActorTest.java | 4 +++- java/test/src/main/java/io/ray/test/MultiDriverTest.java | 4 +++- java/test/src/main/java/io/ray/test/PlacementGroupTest.java | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/java/test/src/main/java/io/ray/test/ActorRestartTest.java b/java/test/src/main/java/io/ray/test/ActorRestartTest.java index fe70e086764d..26326073c634 100644 --- a/java/test/src/main/java/io/ray/test/ActorRestartTest.java +++ b/java/test/src/main/java/io/ray/test/ActorRestartTest.java @@ -9,7 +9,9 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test(groups = {"cluster"}) +@Test( + groups = {"cluster"}, + enabled = false) public class ActorRestartTest extends BaseTest { public static class Counter { diff --git a/java/test/src/main/java/io/ray/test/ExitActorTest.java 
b/java/test/src/main/java/io/ray/test/ExitActorTest.java index 279af55c05e5..a1c40e2ac8a1 100644 --- a/java/test/src/main/java/io/ray/test/ExitActorTest.java +++ b/java/test/src/main/java/io/ray/test/ExitActorTest.java @@ -15,7 +15,9 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test(groups = {"cluster"}) +@Test( + groups = {"cluster"}, + enabled = false) public class ExitActorTest extends BaseTest { private static class ExitingActor { diff --git a/java/test/src/main/java/io/ray/test/MultiDriverTest.java b/java/test/src/main/java/io/ray/test/MultiDriverTest.java index 9c781f56283f..3feb981927c0 100644 --- a/java/test/src/main/java/io/ray/test/MultiDriverTest.java +++ b/java/test/src/main/java/io/ray/test/MultiDriverTest.java @@ -17,7 +17,9 @@ import org.testng.Assert; import org.testng.annotations.Test; -@Test(groups = {"cluster"}) +@Test( + groups = {"cluster"}, + enabled = false) public class MultiDriverTest extends BaseTest { private static final int DRIVER_COUNT = 10; diff --git a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java index edbd2c30e4d6..89d1fab69452 100644 --- a/java/test/src/main/java/io/ray/test/PlacementGroupTest.java +++ b/java/test/src/main/java/io/ray/test/PlacementGroupTest.java @@ -83,7 +83,9 @@ public void testGetPlacementGroup() { Assert.assertEquals(placementGroupRes.getStrategy(), expectPlacementGroup.getStrategy()); } - @Test(groups = {"cluster"}) + @Test( + groups = {"cluster"}, + enabled = false) public void testRemovePlacementGroup() { PlacementGroupTestUtils.createNameSpecifiedSimpleGroup( "CPU", 1, PlacementStrategy.PACK, 1.0, "first_placement_group"); From ddcbd229ba68e502ced116445dce2808af454331 Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Tue, 26 Jan 2021 10:29:07 -0800 Subject: [PATCH 052/245] Rename the ray.operator module to ray.ray_operator (#13705) * Rename ray.operator 
module * mypy --- ci/travis/format.sh | 4 ++-- python/ray/{operator => ray_operator}/__init__.py | 0 python/ray/{operator => ray_operator}/operator.py | 2 +- python/ray/{operator => ray_operator}/operator_utils.py | 0 python/ray/setup-dev.py | 2 +- python/setup.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename python/ray/{operator => ray_operator}/__init__.py (100%) rename python/ray/{operator => ray_operator}/operator.py (99%) rename python/ray/{operator => ray_operator}/operator_utils.py (100%) diff --git a/ci/travis/format.sh b/ci/travis/format.sh index 3f4b753f4d12..bb916869cca2 100755 --- a/ci/travis/format.sh +++ b/ci/travis/format.sh @@ -107,8 +107,8 @@ MYPY_FILES=( 'autoscaler/node_provider.py' 'autoscaler/sdk.py' 'autoscaler/_private/commands.py' - 'operator/operator.py' - 'operator/operator_utils.py' + 'ray_operator/operator.py' + 'ray_operator/operator_utils.py' ) YAPF_EXCLUDES=( diff --git a/python/ray/operator/__init__.py b/python/ray/ray_operator/__init__.py similarity index 100% rename from python/ray/operator/__init__.py rename to python/ray/ray_operator/__init__.py diff --git a/python/ray/operator/operator.py b/python/ray/ray_operator/operator.py similarity index 99% rename from python/ray/operator/operator.py rename to python/ray/ray_operator/operator.py index cf83eaa240d5..cc03c2fefc8f 100644 --- a/python/ray/operator/operator.py +++ b/python/ray/ray_operator/operator.py @@ -9,7 +9,7 @@ from ray._private import services from ray.autoscaler._private import commands from ray import monitor -from ray.operator import operator_utils +from ray.ray_operator import operator_utils from ray import ray_constants diff --git a/python/ray/operator/operator_utils.py b/python/ray/ray_operator/operator_utils.py similarity index 100% rename from python/ray/operator/operator_utils.py rename to python/ray/ray_operator/operator_utils.py diff --git a/python/ray/setup-dev.py b/python/ray/setup-dev.py index 285c0028e159..dcbb622ad16d 100755 --- 
a/python/ray/setup-dev.py +++ b/python/ray/setup-dev.py @@ -66,7 +66,7 @@ def do_link(package, force=False, local_path=None): do_link("rllib", force=args.yes, local_path="../../../rllib") do_link("tune", force=args.yes) do_link("autoscaler", force=args.yes) - do_link("operator", force=args.yes) + do_link("ray_operator", force=args.yes) do_link("cloudpickle", force=args.yes) do_link("scripts", force=args.yes) do_link("internal", force=args.yes) diff --git a/python/setup.py b/python/setup.py index a1542a7a292c..e00fcc0820bb 100644 --- a/python/setup.py +++ b/python/setup.py @@ -449,7 +449,7 @@ def has_ext_modules(self): "ray=ray.scripts.scripts:main", "rllib=ray.rllib.scripts:cli [rllib]", "tune=ray.tune.scripts:cli", - "ray-operator=ray.operator.operator:main", + "ray-operator=ray.ray_operator.operator:main", "serve=ray.serve.scripts:cli", ] }, From 5d82654022307a8da7bdcfd8ebf211e7c29f5bc8 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Tue, 26 Jan 2021 10:29:42 -0800 Subject: [PATCH 053/245] [CLI] Fix Ray Status with ENV Variable set (#13707) --- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 19 +++++++++++++++++++ .../test_cli_patterns/test_ray_status.txt | 12 ++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index c9ea996f9c0c..435c16d4eebc 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return "auto" # Avoid conflict with RAY_ADDRESS env var + return os.environ.get("RAY_ADDRESS") return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index 57bf61419690..a6f1b1989ae9 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -415,5 
+415,24 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) +def test_ray_status(): + import ray + address = ray.init().get("redis_address") + runner = CliRunner() + result = runner.invoke(scripts.status, []) + _check_output_via_pattern("test_ray_status.txt", result) + + result_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_arg) + + # Try to check status with RAY_ADDRESS set + os.environ["RAY_ADDRESS"] = address + result_env = runner.invoke(scripts.status) + _check_output_via_pattern("test_ray_status.txt", result_env) + + result_env_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_env_arg) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt new file mode 100644 index 000000000000..7169c5f0f096 --- /dev/null +++ b/python/ray/tests/test_cli_patterns/test_ray_status.txt @@ -0,0 +1,12 @@ +======== Cluster status: .+ +Node status +------------------------------------------------------------ + + +Resources +------------------------------------------------------------ +Usage: + + +Demands: + \(no resource demands\) From 0c46d09940724d82b7cb1d838f0a10553c2bc5ac Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Tue, 26 Jan 2021 10:56:56 -0800 Subject: [PATCH 054/245] [ray_client]: Monitor client stream errors (#13386) --- python/ray/tests/test_client.py | 27 +++++++++++++++++++++++++++ python/ray/util/client/__init__.py | 4 +++- python/ray/util/client/dataclient.py | 19 +++++++++++++++++-- python/ray/util/client/logsclient.py | 14 ++++++++++++-- python/ray/util/client/worker.py | 10 ++++++++++ 5 files changed, 69 insertions(+), 5 deletions(-) diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 
dc5de2470e6e..30d6faccbad9 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -364,5 +364,32 @@ def run_client(): ray_client._inside_client_test = False +def test_dataclient_server_drop(ray_start_regular_shared): + from ray.util.client import ray as ray_client + ray_client._inside_client_test = True + + @ray_client.remote + def f(x): + time.sleep(4) + return x + + def stop_server(server): + time.sleep(2) + server.stop(0) + + server = ray_client_server.serve("localhost:50051") + ray_client.connect("localhost:50051") + thread = threading.Thread(target=stop_server, args=(server, )) + thread.start() + x = f.remote(2) + with pytest.raises(ConnectionError): + _ = ray_client.get(x) + thread.join() + ray_client.disconnect() + ray_client._inside_client_test = False + # Wait for f(x) to finish before ray.shutdown() in the fixture + time.sleep(3) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/client/__init__.py b/python/ray/util/client/__init__.py index 02aab93ff5ae..1c28dc53c64a 100644 --- a/python/ray/util/client/__init__.py +++ b/python/ray/util/client/__init__.py @@ -89,7 +89,9 @@ def __getattr__(self, key: str): return getattr(self.api, key) def is_connected(self) -> bool: - return self.client_worker is not None + if self.client_worker is None: + return False + return self.client_worker.is_connected() def init(self, *args, **kwargs): if self._server is not None: diff --git a/python/ray/util/client/dataclient.py b/python/ray/util/client/dataclient.py index 6e29ea927b83..a0750b790bb6 100644 --- a/python/ray/util/client/dataclient.py +++ b/python/ray/util/client/dataclient.py @@ -37,6 +37,7 @@ def __init__(self, channel: "grpc._channel.Channel", client_id: str, self._req_id = 0 self._client_id = client_id self._metadata = metadata + self._in_shutdown = False self.data_thread.start() def _next_id(self) -> int: @@ -67,9 +68,19 @@ def _data_main(self) -> None: 
self.ready_data[response.req_id] = response self.cv.notify_all() except grpc.RpcError as e: - if grpc.StatusCode.CANCELLED == e.code(): + with self.cv: + self._in_shutdown = True + self.cv.notify_all() + if e.code() == grpc.StatusCode.CANCELLED: # Gracefully shutting down logger.info("Cancelling data channel") + elif e.code() == grpc.StatusCode.UNAVAILABLE: + # TODO(barakmich): The server may have + # dropped. In theory, we can retry, as per + # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but + # in practice we may need to think about the correct semantics + # here. + logger.info("Server disconnected from data channel") else: logger.error( f"Got Error from data channel -- shutting down: {e}") @@ -88,7 +99,11 @@ def _blocking_send(self, req: ray_client_pb2.DataRequest self.request_queue.put(req) data = None with self.cv: - self.cv.wait_for(lambda: req_id in self.ready_data) + self.cv.wait_for( + lambda: req_id in self.ready_data or self._in_shutdown) + if self._in_shutdown: + raise ConnectionError( + f"cannot send request {req}: data channel shutting down") data = self.ready_data[req_id] del self.ready_data[req_id] return data diff --git a/python/ray/util/client/logsclient.py b/python/ray/util/client/logsclient.py index 0e4d02846a37..f7902024d256 100644 --- a/python/ray/util/client/logsclient.py +++ b/python/ray/util/client/logsclient.py @@ -44,8 +44,18 @@ def _log_main(self) -> None: self.stdstream(level=record.level, msg=record.msg) self.log(level=record.level, msg=record.msg) except grpc.RpcError as e: - if grpc.StatusCode.CANCELLED != e.code(): - # Not just shutting down normally + if e.code() == grpc.StatusCode.CANCELLED: + # Graceful shutdown. We've cancelled our own connection. + logger.info("Cancelling logs channel") + elif e.code() == grpc.StatusCode.UNAVAILABLE: + # TODO(barakmich): The server may have + # dropped. 
In theory, we can retry, as per + # https://grpc.github.io/grpc/core/md_doc_statuscodes.html but + # in practice we may need to think about the correct semantics + # here. + logger.info("Server disconnected from logs channel") + else: + # Some other, unhandled, gRPC error logger.error( f"Got Error from logger channel -- shutting down: {e}") raise e diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index d62173be745f..9f2f189c6ae2 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -60,6 +60,7 @@ def __init__(self, """ self.metadata = metadata if metadata else [] self.channel = None + self._conn_state = grpc.ChannelConnectivity.IDLE self._client_id = make_client_id() if secure: credentials = grpc.ssl_channel_credentials() @@ -67,6 +68,8 @@ def __init__(self, else: self.channel = grpc.insecure_channel(conn_str) + self.channel.subscribe(self._on_channel_state_change) + # Retry the connection until the channel responds to something # looking like a gRPC connection, though it may be a proxy. 
conn_attempts = 0 @@ -128,6 +131,10 @@ def __init__(self, self.log_client.set_logstream_level(logging.INFO) self.closed = False + def _on_channel_state_change(self, conn_state: grpc.ChannelConnectivity): + logger.debug(f"client gRPC channel state change: {conn_state}") + self._conn_state = conn_state + def connection_info(self): try: data = self.data_client.ConnectionInfo() @@ -357,6 +364,9 @@ def is_initialized(self) -> bool: ray_client_pb2.ClusterInfoType.IS_INITIALIZED) return False + def is_connected(self) -> bool: + return self._conn_state == grpc.ChannelConnectivity.READY + def make_client_id() -> str: id = uuid.uuid4() From 6b477dd37affb5c216dcd0053f45c431f65012c3 Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Tue, 26 Jan 2021 12:06:19 -0800 Subject: [PATCH 055/245] [CI] Split test_multi_node to avoid timeouts (#13712) --- python/ray/tests/BUILD | 1 + python/ray/tests/test_multi_node.py | 389 +------------------------ python/ray/tests/test_multi_node_3.py | 397 ++++++++++++++++++++++++++ 3 files changed, 402 insertions(+), 385 deletions(-) create mode 100644 python/ray/tests/test_multi_node_3.py diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 2ccdb4be2644..97980a641a4a 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -48,6 +48,7 @@ py_test_module_list( "test_metrics.py", "test_multi_node.py", "test_multi_node_2.py", + "test_multi_node_3.py", "test_multi_tenancy.py", "test_multinode_failures.py", "test_multinode_failures_2.py", diff --git a/python/ray/tests/test_multi_node.py b/python/ray/tests/test_multi_node.py index fbce475c12af..ae9ae1c1e981 100644 --- a/python/ray/tests/test_multi_node.py +++ b/python/ray/tests/test_multi_node.py @@ -1,15 +1,13 @@ import os import pytest -import subprocess import sys import time import ray -from ray.test_utils import ( - RayTestTimeoutException, check_call_ray, run_string_as_driver, - run_string_as_driver_nonblocking, wait_for_children_of_pid, - wait_for_children_of_pid_to_exit, 
wait_for_condition, kill_process_by_name, - Semaphore, init_error_pubsub, get_error_message) +from ray.test_utils import (RayTestTimeoutException, run_string_as_driver, + run_string_as_driver_nonblocking, + wait_for_condition, init_error_pubsub, + get_error_message) def test_remote_raylet_cleanup(ray_start_cluster): @@ -368,385 +366,6 @@ def wait_for_success_output(process_handle, timeout=10): process_handle.kill() -def test_calling_start_ray_head(call_ray_stop_only): - - # Test that we can call ray start with various command line - # parameters. TODO(rkn): This test only tests the --head code path. We - # should also test the non-head node code path. - - # Test starting Ray with a redis port specified. - check_call_ray(["start", "--head", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a node IP address specified. - check_call_ray( - ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with a system config parameter set. - check_call_ray([ - "start", "--head", "--system-config", - "{\"metrics_report_interval_ms\":100}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the object manager and node manager ports - # specified. - check_call_ray([ - "start", "--head", "--object-manager-port", "12345", - "--node-manager-port", "54321", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with the worker port range specified. - check_call_ray([ - "start", "--head", "--min-worker-port", "50000", "--max-worker-port", - "51000", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with a worker port list. - check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) - check_call_ray(["stop"]) - - # Test starting Ray with a non-int in the worker port list. 
- with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) - check_call_ray(["stop"]) - - # Test starting Ray with an invalid port in the worker port list. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray(["start", "--head", "--worker-port-list", "100"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of CPUs specified. - check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with the number of GPUs specified. - check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) - check_call_ray(["stop"]) - - # Test starting Ray with redis shard ports specified. - check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", - "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with all arguments specified. - check_call_ray([ - "start", "--head", "--redis-shard-ports", "6380,6381,6382", - "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", - "--resources", "{\"Custom\": 1}", "--port", "0" - ]) - check_call_ray(["stop"]) - - # Test starting Ray with invalid arguments. - with pytest.raises(subprocess.CalledProcessError): - check_call_ray( - ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) - check_call_ray(["stop"]) - - # Test --block. Killing a child process should cause the command to exit. - blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.poll() - assert blocked.returncode is None - - kill_process_by_name("raylet") - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - # Test --block. Killing the command should clean up all child processes. 
- blocked = subprocess.Popen( - ["ray", "start", "--head", "--block", "--port", "0"]) - blocked.poll() - assert blocked.returncode is None - - wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) - - blocked.terminate() - wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) - blocked.wait() - assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" - - -@pytest.mark.parametrize( - "call_ray_start", - ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], - indirect=True) -def test_using_hostnames(call_ray_start): - ray.init(_node_ip_address="localhost", address="localhost:6379") - - @ray.remote - def f(): - return 1 - - assert ray.get(f.remote()) == 1 - - -def test_connecting_in_local_case(ray_start_regular): - address_info = ray_start_regular - - # Define a driver that just connects to Redis. - driver_script = """ -import ray -ray.init(address="{}") -print("success") -""".format(address_info["redis_address"]) - - out = run_string_as_driver(driver_script) - # Make sure the other driver succeeded. - assert "success" in out - - -def test_run_driver_twice(ray_start_regular): - # We used to have issue 2165 and 2288: - # https://github.com/ray-project/ray/issues/2165 - # https://github.com/ray-project/ray/issues/2288 - # both complain that driver will hang when run for the second time. - # This test is used to verify the fix for above issue, it will run the - # same driver for twice and verify whether both of them succeed. 
- address_info = ray_start_regular - driver_script = """ -import ray -import ray.tune as tune -import os -import time - -def train_func(config, reporter): # add a reporter arg - for i in range(2): - time.sleep(0.1) - reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics - -os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" -ray.init(address="{}") -ray.tune.register_trainable("train_func", train_func) - -tune.run_experiments({{ - "my_experiment": {{ - "run": "train_func", - "stop": {{"mean_accuracy": 99}}, - "config": {{ - "layer1": {{ - "class_name": tune.grid_search(["a"]), - "config": {{"lr": tune.grid_search([1, 2])}} - }}, - }}, - "local_dir": os.path.expanduser("~/tmp") - }} -}}) -print("success") -""".format(address_info["redis_address"]) - - for i in range(2): - out = run_string_as_driver(driver_script) - assert "success" in out - - -@pytest.mark.skip(reason="fate sharing not implemented yet") -def test_driver_exiting_when_worker_blocked(call_ray_start): - # This test will create some drivers that submit some tasks and then - # exit without waiting for the tasks to complete. - address = call_ray_start - - ray.init(address=address) - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.get`. - driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.get(f.remote()) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates two tasks, one that runs forever and the - # other blocked on the first in a `ray.wait`. 
- driver_script = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def f(): - time.sleep(10**6) -@ray.remote -def g(): - ray.wait([f.remote()]) -g.remote() -time.sleep(1) -print("success") -""".format(address) - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - out = run_string_as_driver(driver_script) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that creates one task that depends on a nonexistent - # object. This task will be queued as waiting to execute. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(x): - return -g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. - ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - # Define a driver that calls `ray.wait` on a nonexistent object. - driver_script_template = """ -import time -import ray -ray.init(address="{}") -@ray.remote -def g(): - ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) -g.remote() -time.sleep(1) -print("success") -""" - - # Create some drivers and let them exit and make sure everything is - # still alive. - for _ in range(3): - nonexistent_id = ray.ObjectRef.from_random() - driver_script = driver_script_template.format(address, - nonexistent_id.hex()) - out = run_string_as_driver(driver_script) - # Simulate the nonexistent dependency becoming available. 
- ray.worker.global_worker.put_object(None, nonexistent_id) - # Make sure the first driver ran to completion. - assert "success" in out - - @ray.remote - def f(): - return 1 - - # Make sure we can still talk with the raylet. - ray.get(f.remote()) - - -def test_multi_driver_logging(ray_start_regular): - address_info = ray_start_regular - address = address_info["redis_address"] - - # ray.init(address=address) - driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) - driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) - main_wait = Semaphore.options(name="main_wait").remote(value=0) - - # The creation of an actor is asynchronous. - # We need to wait for the completion of the actor creation, - # otherwise we can't get the actor by name. - ray.get(driver1_wait.locked.remote()) - ray.get(driver2_wait.locked.remote()) - ray.get(main_wait.locked.remote()) - - # Params are address, semaphore name, output1, output2 - driver_script_template = """ -import ray -import sys -from ray.test_utils import Semaphore - -@ray.remote(num_cpus=0) -def remote_print(s, file=None): - print(s, file=file) - -ray.init(address="{}") - -driver_wait = ray.get_actor("{}") -main_wait = ray.get_actor("main_wait") - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s1 = "{}" -ray.get(remote_print.remote(s1)) - -ray.get(main_wait.release.remote()) -ray.get(driver_wait.acquire.remote()) - -s2 = "{}" -ray.get(remote_print.remote(s2)) - -ray.get(main_wait.release.remote()) - """ - - p1 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver1_wait", "1", "2")) - p2 = run_string_as_driver_nonblocking( - driver_script_template.format(address, "driver2_wait", "3", "4")) - - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - # At this point both of the other drivers are fully initialized. 
- - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '1' and driver2 '3' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - ray.get(driver1_wait.release.remote()) - ray.get(driver2_wait.release.remote()) - - # At this point driver1 should receive '2' and driver2 '4' - ray.get(main_wait.acquire.remote()) - ray.get(main_wait.acquire.remote()) - - driver1_out = p1.stdout.read().decode("ascii") - driver2_out = p2.stdout.read().decode("ascii") - if sys.platform == "win32": - driver1_out = driver1_out.replace("\r", "") - driver2_out = driver2_out.replace("\r", "") - driver1_out_split = driver1_out.split("\n") - driver2_out_split = driver2_out.split("\n") - - assert driver1_out_split[0][-1] == "1", driver1_out_split - assert driver1_out_split[1][-1] == "2", driver1_out_split - assert driver2_out_split[0][-1] == "3", driver2_out_split - assert driver2_out_split[1][-1] == "4", driver2_out_split - - if __name__ == "__main__": import pytest # Make subprocess happy in bazel. diff --git a/python/ray/tests/test_multi_node_3.py b/python/ray/tests/test_multi_node_3.py new file mode 100644 index 000000000000..9c270b64da55 --- /dev/null +++ b/python/ray/tests/test_multi_node_3.py @@ -0,0 +1,397 @@ +import os +import pytest +import subprocess +import sys + +import ray +from ray.test_utils import ( + check_call_ray, run_string_as_driver, run_string_as_driver_nonblocking, + wait_for_children_of_pid, wait_for_children_of_pid_to_exit, + kill_process_by_name, Semaphore) + + +def test_calling_start_ray_head(call_ray_stop_only): + + # Test that we can call ray start with various command line + # parameters. TODO(rkn): This test only tests the --head code path. We + # should also test the non-head node code path. + + # Test starting Ray with a redis port specified. 
+ check_call_ray(["start", "--head", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a node IP address specified. + check_call_ray( + ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with a system config parameter set. + check_call_ray([ + "start", "--head", "--system-config", + "{\"metrics_report_interval_ms\":100}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the object manager and node manager ports + # specified. + check_call_ray([ + "start", "--head", "--object-manager-port", "12345", + "--node-manager-port", "54321", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with the worker port range specified. + check_call_ray([ + "start", "--head", "--min-worker-port", "50000", "--max-worker-port", + "51000", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with a worker port list. + check_call_ray(["start", "--head", "--worker-port-list", "10000,10001"]) + check_call_ray(["stop"]) + + # Test starting Ray with a non-int in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "10000,a"]) + check_call_ray(["stop"]) + + # Test starting Ray with an invalid port in the worker port list. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray(["start", "--head", "--worker-port-list", "100"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of CPUs specified. + check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with the number of GPUs specified. + check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"]) + check_call_ray(["stop"]) + + # Test starting Ray with redis shard ports specified. 
+ check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port", + "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with all arguments specified. + check_call_ray([ + "start", "--head", "--redis-shard-ports", "6380,6381,6382", + "--object-manager-port", "12345", "--num-cpus", "2", "--num-gpus", "0", + "--resources", "{\"Custom\": 1}", "--port", "0" + ]) + check_call_ray(["stop"]) + + # Test starting Ray with invalid arguments. + with pytest.raises(subprocess.CalledProcessError): + check_call_ray( + ["start", "--head", "--address", "127.0.0.1:6379", "--port", "0"]) + check_call_ray(["stop"]) + + # Test --block. Killing a child process should cause the command to exit. + blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.poll() + assert blocked.returncode is None + + kill_process_by_name("raylet") + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + # Test --block. Killing the command should clean up all child processes. 
+ blocked = subprocess.Popen( + ["ray", "start", "--head", "--block", "--port", "0"]) + blocked.poll() + assert blocked.returncode is None + + wait_for_children_of_pid(blocked.pid, num_children=7, timeout=30) + + blocked.terminate() + wait_for_children_of_pid_to_exit(blocked.pid, timeout=30) + blocked.wait() + assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit" + + +@pytest.mark.parametrize( + "call_ray_start", + ["ray start --head --num-cpus=1 " + "--node-ip-address=localhost"], + indirect=True) +def test_using_hostnames(call_ray_start): + ray.init(_node_ip_address="localhost", address="localhost:6379") + + @ray.remote + def f(): + return 1 + + assert ray.get(f.remote()) == 1 + + +def test_connecting_in_local_case(ray_start_regular): + address_info = ray_start_regular + + # Define a driver that just connects to Redis. + driver_script = """ +import ray +ray.init(address="{}") +print("success") +""".format(address_info["redis_address"]) + + out = run_string_as_driver(driver_script) + # Make sure the other driver succeeded. + assert "success" in out + + +def test_run_driver_twice(ray_start_regular): + # We used to have issue 2165 and 2288: + # https://github.com/ray-project/ray/issues/2165 + # https://github.com/ray-project/ray/issues/2288 + # both complain that driver will hang when run for the second time. + # This test is used to verify the fix for above issue, it will run the + # same driver for twice and verify whether both of them succeed. 
+ address_info = ray_start_regular + driver_script = """ +import ray +import ray.tune as tune +import os +import time + +def train_func(config, reporter): # add a reporter arg + for i in range(2): + time.sleep(0.1) + reporter(timesteps_total=i, mean_accuracy=i+97) # report metrics + +os.environ["TUNE_RESUME_PROMPT_OFF"] = "True" +ray.init(address="{}") +ray.tune.register_trainable("train_func", train_func) + +tune.run_experiments({{ + "my_experiment": {{ + "run": "train_func", + "stop": {{"mean_accuracy": 99}}, + "config": {{ + "layer1": {{ + "class_name": tune.grid_search(["a"]), + "config": {{"lr": tune.grid_search([1, 2])}} + }}, + }}, + "local_dir": os.path.expanduser("~/tmp") + }} +}}) +print("success") +""".format(address_info["redis_address"]) + + for i in range(2): + out = run_string_as_driver(driver_script) + assert "success" in out + + +@pytest.mark.skip(reason="fate sharing not implemented yet") +def test_driver_exiting_when_worker_blocked(call_ray_start): + # This test will create some drivers that submit some tasks and then + # exit without waiting for the tasks to complete. + address = call_ray_start + + ray.init(address=address) + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.get`. + driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.get(f.remote()) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates two tasks, one that runs forever and the + # other blocked on the first in a `ray.wait`. 
+ driver_script = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def f(): + time.sleep(10**6) +@ray.remote +def g(): + ray.wait([f.remote()]) +g.remote() +time.sleep(1) +print("success") +""".format(address) + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + out = run_string_as_driver(driver_script) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that creates one task that depends on a nonexistent + # object. This task will be queued as waiting to execute. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(x): + return +g.remote(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. + ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + # Define a driver that calls `ray.wait` on a nonexistent object. + driver_script_template = """ +import time +import ray +ray.init(address="{}") +@ray.remote +def g(): + ray.wait(ray.ObjectRef(ray.utils.hex_to_binary("{}"))) +g.remote() +time.sleep(1) +print("success") +""" + + # Create some drivers and let them exit and make sure everything is + # still alive. + for _ in range(3): + nonexistent_id = ray.ObjectRef.from_random() + driver_script = driver_script_template.format(address, + nonexistent_id.hex()) + out = run_string_as_driver(driver_script) + # Simulate the nonexistent dependency becoming available. 
+ ray.worker.global_worker.put_object(None, nonexistent_id) + # Make sure the first driver ran to completion. + assert "success" in out + + @ray.remote + def f(): + return 1 + + # Make sure we can still talk with the raylet. + ray.get(f.remote()) + + +def test_multi_driver_logging(ray_start_regular): + address_info = ray_start_regular + address = address_info["redis_address"] + + # ray.init(address=address) + driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0) + driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0) + main_wait = Semaphore.options(name="main_wait").remote(value=0) + + # The creation of an actor is asynchronous. + # We need to wait for the completion of the actor creation, + # otherwise we can't get the actor by name. + ray.get(driver1_wait.locked.remote()) + ray.get(driver2_wait.locked.remote()) + ray.get(main_wait.locked.remote()) + + # Params are address, semaphore name, output1, output2 + driver_script_template = """ +import ray +import sys +from ray.test_utils import Semaphore + +@ray.remote(num_cpus=0) +def remote_print(s, file=None): + print(s, file=file) + +ray.init(address="{}") + +driver_wait = ray.get_actor("{}") +main_wait = ray.get_actor("main_wait") + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s1 = "{}" +ray.get(remote_print.remote(s1)) + +ray.get(main_wait.release.remote()) +ray.get(driver_wait.acquire.remote()) + +s2 = "{}" +ray.get(remote_print.remote(s2)) + +ray.get(main_wait.release.remote()) + """ + + p1 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver1_wait", "1", "2")) + p2 = run_string_as_driver_nonblocking( + driver_script_template.format(address, "driver2_wait", "3", "4")) + + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + # At this point both of the other drivers are fully initialized. 
+ + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '1' and driver2 '3' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + ray.get(driver1_wait.release.remote()) + ray.get(driver2_wait.release.remote()) + + # At this point driver1 should receive '2' and driver2 '4' + ray.get(main_wait.acquire.remote()) + ray.get(main_wait.acquire.remote()) + + driver1_out = p1.stdout.read().decode("ascii") + driver2_out = p2.stdout.read().decode("ascii") + if sys.platform == "win32": + driver1_out = driver1_out.replace("\r", "") + driver2_out = driver2_out.replace("\r", "") + driver1_out_split = driver1_out.split("\n") + driver2_out_split = driver2_out.split("\n") + + assert driver1_out_split[0][-1] == "1", driver1_out_split + assert driver1_out_split[1][-1] == "2", driver1_out_split + assert driver2_out_split[0][-1] == "3", driver2_out_split + assert driver2_out_split[1][-1] == "4", driver2_out_split + + +if __name__ == "__main__": + import pytest + # Make subprocess happy in bazel. 
+ os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) From f490e2be43fdb0275d4a713fb03954b643d38edf Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Tue, 26 Jan 2021 13:19:51 -0800 Subject: [PATCH 056/245] [ray_client] Fix and extend get_actor test to detached actors (#13016) --- python/ray/tests/test_client.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/python/ray/tests/test_client.py b/python/ray/tests/test_client.py index 30d6faccbad9..73b19a2f2ab9 100644 --- a/python/ray/tests/test_client.py +++ b/python/ray/tests/test_client.py @@ -322,12 +322,25 @@ def get(self): actor.inc.remote() actor.inc.remote() - del actor + # Make sure the get_actor call works new_actor = ray.get_actor("test_acc") new_actor.inc.remote() assert ray.get(new_actor.get.remote()) == 3 + del actor + + actor = Accumulator.options( + name="test_acc2", lifetime="detached").remote() + actor.inc.remote() + del actor + + detatched_actor = ray.get_actor("test_acc2") + for i in range(5): + detatched_actor.inc.remote() + + assert ray.get(detatched_actor.get.remote()) == 6 + @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_internal_kv(ray_start_regular_shared): From ab6a634a9492dd079278a30e9f8b0c2e960e8c16 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Tue, 26 Jan 2021 13:31:01 -0800 Subject: [PATCH 057/245] [Serve] Revert "Revert "[Serve] Refactor BackendState" (#13626) (#13697) --- python/ray/serve/backend_state.py | 533 +++++++++++++++---------- python/ray/serve/config.py | 4 +- python/ray/serve/controller.py | 4 +- python/ray/serve/tests/test_api.py | 3 + python/ray/serve/tests/test_failure.py | 3 + 5 files changed, 330 insertions(+), 217 deletions(-) diff --git a/python/ray/serve/backend_state.py b/python/ray/serve/backend_state.py index 673c4b2cfbc8..4aad2671ea4e 100644 --- a/python/ray/serve/backend_state.py +++ b/python/ray/serve/backend_state.py @@ -1,7 
+1,8 @@ import asyncio -from asyncio.futures import Future from collections import defaultdict -from typing import Dict, Any, List, Optional, Set, Tuple +from enum import Enum +import time +from typing import Dict, List, Optional, Tuple import ray import ray.cloudpickle as pickle @@ -17,7 +18,6 @@ ) from ray.serve.config import BackendConfig, ReplicaConfig from ray.serve.constants import LongPollKey -from ray.serve.exceptions import RayServeException from ray.serve.kv_store import RayInternalKVStore from ray.serve.long_poll import LongPollHost from ray.serve.utils import (format_actor_name, get_random_letters, logger, @@ -30,6 +30,150 @@ _RESOURCE_CHECK_ENABLED = True +class ReplicaState(Enum): + SHOULD_START = 1 + STARTING = 2 + RUNNING = 3 + SHOULD_STOP = 4 + STOPPING = 5 + STOPPED = 6 + + +class BackendReplica: + def __init__(self, controller_name: str, detached: bool, + replica_tag: ReplicaTag, backend_tag: BackendTag): + self._actor_name = format_actor_name(replica_tag, controller_name) + self._controller_name = controller_name + self._detached = detached + self._replica_tag = replica_tag + self._backend_tag = backend_tag + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._state = ReplicaState.SHOULD_START + + def __get_state__(self): + clean_dict = self.__dict__.copy() + del clean_dict["_actor_handle"] + del clean_dict["_startup_obj_ref"] + del clean_dict["_drain_obj_ref"] + return clean_dict + + def __set_state__(self, d): + self.__dict__ = d + self._actor_handle = None + self._startup_obj_ref = None + self._drain_obj_ref = None + self._recover_from_checkpoint() + + def _recover_from_checkpoint(self): + if self._state == ReplicaState.STARTING: + # We do not need to pass in the class here because the actor + # creation has already been started if this class was checkpointed + # in the STARTING state. 
+ self.start() + elif self._state == ReplicaState.RUNNING: + # Fetch actor handles for all backend replicas in the system. + # The actors must exist if this class was checkpointed in the + # RUNNING state. + self._actor_handle = ray.get_actor(self._actor_name) + elif self._state == ReplicaState.STOPPING: + self.stop() + + def start(self, backend_info: Optional[BackendInfo]): + assert self._state in { + ReplicaState.SHOULD_START, ReplicaState.STARTING + }, (f"State must be {ReplicaState.SHOULD_START} or " + f"{ReplicaState.STARTING}, *not* {self._state}") + try: + self._actor_handle = ray.get_actor(self._actor_name) + except ValueError: + logger.debug("Starting replica '{}' for backend '{}'.".format( + self._replica_tag, self._backend_tag)) + self._actor_handle = ray.remote(backend_info.worker_class).options( + name=self._actor_name, + lifetime="detached" if self._detached else None, + max_restarts=-1, + max_task_retries=-1, + **backend_info.replica_config.ray_actor_options).remote( + self._backend_tag, self._replica_tag, + backend_info.replica_config.actor_init_args, + backend_info.backend_config, self._controller_name) + self._startup_obj_ref = self._actor_handle.ready.remote() + self._state = ReplicaState.STARTING + + def check_started(self): + if self._state == ReplicaState.RUNNING: + return True + assert self._state == ReplicaState.STARTING, ( + f"State must be {ReplicaState.STARTING}, *not* {self._state}") + ready, _ = ray.wait([self._startup_obj_ref], timeout=0) + if len(ready) == 1: + self._state = ReplicaState.RUNNING + return True + return False + + def set_should_stop(self, graceful_shutdown_timeout_s: Duration): + self._state = ReplicaState.SHOULD_STOP + self._graceful_shutdown_timeout_s = graceful_shutdown_timeout_s + + def stop(self): + # We need to handle transitions from: + # SHOULD_START -> SHOULD_STOP -> STOPPING + # This means that the replica_handle may not have been created. 
+ + assert self._state in { + ReplicaState.SHOULD_STOP, ReplicaState.STOPPING + }, (f"State must be {ReplicaState.SHOULD_STOP} or " + f"{ReplicaState.STOPPING}, *not* {self._state}") + + def drain_actor(actor_name): + # NOTE: the replicas may already be stopped if we failed + # after stopping them but before writing a checkpoint. + try: + replica = ray.get_actor(actor_name) + except ValueError: + return None + return replica.drain_pending_queries.remote() + + self._state = ReplicaState.STOPPING + self._drain_obj_ref = drain_actor(self._actor_name) + self._shutdown_deadline = time.time( + ) + self._graceful_shutdown_timeout_s + + def check_stopped(self): + if self._state == ReplicaState.STOPPED: + return True + assert self._state == ReplicaState.STOPPING, ( + f"State must be {ReplicaState.STOPPING}, *not* {self._state}") + + try: + replica = ray.get_actor(self._actor_name) + except ValueError: + self._state = ReplicaState.STOPPED + return True + + ready, _ = ray.wait([self._drain_obj_ref], timeout=0) + timeout_passed = time.time() > self._shutdown_deadline + + if len(ready) == 1 or timeout_passed: + if timeout_passed: + # Graceful period passed, kill it forcefully. + logger.debug( + f"{self._actor_name} did not shutdown after " + f"{self._graceful_shutdown_timeout_s}s, force-killing.") + + ray.kill(replica, no_restart=True) + self._state = ReplicaState.STOPPED + return True + return False + + def get_actor_handle(self): + assert self._state == ReplicaState.RUNNING, ( + f"State must be {ReplicaState.RUNNING}, *not* {self._state}") + return self._actor_handle + + class BackendState: """Manages all state for backends in the system. @@ -46,79 +190,65 @@ def __init__(self, controller_name: str, detached: bool, self._long_poll_host = long_poll_host self._goal_manager = goal_manager - # Non-checkpointed state. 
- self.currently_starting_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag, ActorHandle]] = dict() - self.currently_stopping_replicas: Dict[asyncio.Future, Tuple[ - BackendTag, ReplicaTag]] = dict() - - # Checkpointed state. - self.backends: Dict[BackendTag, BackendInfo] = dict() - self.backend_replicas: Dict[BackendTag, Dict[ - ReplicaTag, ActorHandle]] = defaultdict(dict) + self._replicas: Dict[BackendTag, Dict[ReplicaState, List[ + BackendReplica]]] = defaultdict(lambda: defaultdict(list)) + self._backend_metadata: Dict[BackendTag, BackendInfo] = dict() + self._target_replicas: Dict[BackendTag, int] = defaultdict(int) self.backend_goals: Dict[BackendTag, GoalId] = dict() - self.backend_replicas_to_start: Dict[BackendTag, List[ - ReplicaTag]] = defaultdict(list) - self.backend_replicas_to_stop: Dict[BackendTag, List[Tuple[ - ReplicaTag, Duration]]] = defaultdict(list) - self.backends_to_remove: List[BackendTag] = list() + + # Un-Checkpointed state. + self.pending_goals: Dict[GoalId, asyncio.Event] = dict() checkpoint = self._kv_store.get(CHECKPOINT_KEY) if checkpoint is not None: - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backend_to_remove, - pending_goal_ids) = pickle.loads(checkpoint) + (self._replicas, self._backend_metadata, self._target_replicas, + self.backend_goals, pending_goal_ids) = pickle.loads(checkpoint) for goal_id in pending_goal_ids: self._goal_manager.create_goal(goal_id) - # Fetch actor handles for all backend replicas in the system. - # All of these backend_replicas are guaranteed to already exist - # because they would not be written to a checkpoint in - # self.backend_replicas until they were created. 
- for backend_tag, replica_dict in self.backend_replicas.items(): - for replica_tag in replica_dict.keys(): - replica_name = format_actor_name(replica_tag, - self._controller_name) - self.backend_replicas[backend_tag][ - replica_tag] = ray.get_actor(replica_name) - self._notify_backend_configs_changed() self._notify_replica_handles_changed() def _checkpoint(self) -> None: self._kv_store.put( CHECKPOINT_KEY, - pickle.dumps( - (self.backends, self.backend_replicas, self.backend_goals, - self.backend_replicas_to_start, self.backend_replicas_to_stop, - self.backends_to_remove, - self._goal_manager.get_pending_goal_ids()))) + pickle.dumps((self._replicas, self._backend_metadata, + self._target_replicas, self.backend_goals, + self._goal_manager.get_pending_goal_ids()))) def _notify_backend_configs_changed(self) -> None: self._long_poll_host.notify_changed(LongPollKey.BACKEND_CONFIGS, self.get_backend_configs()) + def get_running_replica_handles( + self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: + return { + backend_tag: { + backend_replica._replica_tag: + backend_replica.get_actor_handle() + for backend_replica in state_to_replica_dict[ + ReplicaState.RUNNING] + } + for backend_tag, state_to_replica_dict in self._replicas.items() + } + def _notify_replica_handles_changed(self) -> None: self._long_poll_host.notify_changed( LongPollKey.REPLICA_HANDLES, { backend_tag: list(replica_dict.values()) - for backend_tag, replica_dict in self.backend_replicas.items() + for backend_tag, replica_dict in + self.get_running_replica_handles().items() }) def get_backend_configs(self) -> Dict[BackendTag, BackendConfig]: return { tag: info.backend_config - for tag, info in self.backends.items() + for tag, info in self._backend_metadata.items() } - def get_replica_handles( - self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: - return self.backend_replicas - def get_backend(self, backend_tag: BackendTag) -> Optional[BackendInfo]: - return self.backends.get(backend_tag) + 
return self._backend_metadata.get(backend_tag) def _set_backend_goal(self, backend_tag: BackendTag, backend_info: BackendInfo) -> None: @@ -126,7 +256,11 @@ def _set_backend_goal(self, backend_tag: BackendTag, new_goal_id = self._goal_manager.create_goal() if backend_info is not None: - self.backends[backend_tag] = backend_info + self._backend_metadata[backend_tag] = backend_info + self._target_replicas[ + backend_tag] = backend_info.backend_config.num_replicas + else: + self._target_replicas[backend_tag] = 0 self.backend_goals[backend_tag] = new_goal_id @@ -136,31 +270,25 @@ def create_backend(self, backend_tag: BackendTag, backend_config: BackendConfig, replica_config: ReplicaConfig) -> Optional[GoalId]: # Ensures this method is idempotent. - backend_info = self.backends.get(backend_tag) + backend_info = self._backend_metadata.get(backend_tag) if backend_info is not None: if (backend_info.backend_config == backend_config and backend_info.replica_config == replica_config): return None - backend_replica = create_backend_replica(replica_config.func_or_class) + backend_replica_class = create_backend_replica( + replica_config.func_or_class) # Save creator that starts replicas, the arguments to be passed in, # and the configuration for the backends. backend_info = BackendInfo( - worker_class=backend_replica, + worker_class=backend_replica_class, backend_config=backend_config, replica_config=replica_config) new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, backend_info) - try: - self.scale_backend_replicas(backend_tag, - backend_config.num_replicas) - except RayServeException as e: - del self.backends[backend_tag] - raise e - # NOTE(edoakes): we must write a checkpoint before starting new # or pushing the updated config to avoid inconsistent state if we # crash while making the change. @@ -175,20 +303,15 @@ def delete_backend(self, backend_tag: BackendTag, force_kill: bool = False) -> Optional[GoalId]: # This method must be idempotent. 
We should validate that the # specified backend exists on the client. - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: return None - # Scale its replicas down to 0. - self.scale_backend_replicas(backend_tag, 0, force_kill) - - # Remove the backend's metadata. - del self.backends[backend_tag] - - # Add the intention to remove the backend from the routers. - self.backends_to_remove.append(backend_tag) - new_goal_id, existing_goal_id = self._set_backend_goal( backend_tag, None) + if force_kill: + self._backend_metadata[ + backend_tag].backend_config.\ + experimental_graceful_shutdown_timeout_s = 0 self._checkpoint() if existing_goal_id is not None: @@ -197,20 +320,18 @@ def delete_backend(self, backend_tag: BackendTag, def update_backend_config(self, backend_tag: BackendTag, config_options: BackendConfig): - if backend_tag not in self.backends: + if backend_tag not in self._backend_metadata: raise ValueError(f"Backend {backend_tag} is not registered") - stored_backend_config = self.backends[backend_tag].backend_config + stored_backend_config = self._backend_metadata[ + backend_tag].backend_config updated_config = stored_backend_config.copy( update=config_options.dict(exclude_unset=True)) updated_config._validate_complete() - self.backends[backend_tag].backend_config = updated_config + self._backend_metadata[backend_tag].backend_config = updated_config new_goal_id, existing_goal_id = self._set_backend_goal( - backend_tag, self.backends[backend_tag]) - - # Scale the replicas with the new configuration. 
- self.scale_backend_replicas(backend_tag, updated_config.num_replicas) + backend_tag, self._backend_metadata[backend_tag]) # NOTE(edoakes): we must write a checkpoint before pushing the # update to avoid inconsistent state if we crash after pushing the @@ -260,31 +381,38 @@ def _start_backend_replica(self, backend_tag: BackendTag, def scale_backend_replicas( self, backend_tag: BackendTag, - num_replicas: int, - force_kill: bool = False, - ) -> None: + ) -> bool: """Scale the given backend to the number of replicas. NOTE: this does not actually start or stop the replicas, but instead - adds the intention to start/stop them to self.backend_replicas_to_start - and self.backend_replicas_to_stop. The caller is responsible for then - first writing a checkpoint and then actually starting/stopping the - intended replicas. This avoids inconsistencies with starting/stopping a - replica and then crashing before writing a checkpoint. + adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP. + The caller is responsible for then first writing a checkpoint and then + actually starting/stopping the intended replicas. This avoids + inconsistencies with starting/stopping a replica and then crashing + before writing a checkpoint. 
""" + num_replicas = self._target_replicas.get(backend_tag, 0) logger.debug("Scaling backend '{}' to {} replicas".format( backend_tag, num_replicas)) - assert (backend_tag in self.backends + assert (backend_tag in self._backend_metadata ), "Backend {} is not registered.".format(backend_tag) assert num_replicas >= 0, ("Number of replicas must be" " greater than or equal to 0.") - current_num_replicas = len(self.backend_replicas[backend_tag]) + current_num_replicas = sum([ + len(self._replicas[backend_tag][ReplicaState.SHOULD_START]), + len(self._replicas[backend_tag][ReplicaState.STARTING]), + len(self._replicas[backend_tag][ReplicaState.RUNNING]), + ]) + delta_num_replicas = num_replicas - current_num_replicas - backend_info: BackendInfo = self.backends[backend_tag] - if delta_num_replicas > 0: + backend_info: BackendInfo = self._backend_metadata[backend_tag] + if delta_num_replicas == 0: + return False + + elif delta_num_replicas > 0: can_schedule = try_schedule_resources_on_nodes(requirements=[ backend_info.replica_config.resource_dict for _ in range(delta_num_replicas) @@ -292,10 +420,11 @@ def scale_backend_replicas( if _RESOURCE_CHECK_ENABLED and not all(can_schedule): num_possible = sum(can_schedule) - raise RayServeException( + logger.error( "Cannot scale backend {} to {} replicas. Ray Serve tried " "to add {} replicas but the resources only allows {} " - "to be added. To fix this, consider scaling to replica to " + "to be added. This is not a problem if the cluster is " + "autoscaling. To fix this, consider scaling to replica to " "{} or add more resources to the cluster. 
You can check " "avaiable resources with ray.nodes().".format( backend_tag, num_replicas, delta_num_replicas, @@ -305,154 +434,132 @@ def scale_backend_replicas( delta_num_replicas, backend_tag)) for _ in range(delta_num_replicas): replica_tag = "{}#{}".format(backend_tag, get_random_letters()) - self.backend_replicas_to_start[backend_tag].append(replica_tag) + self._replicas[backend_tag][ReplicaState.SHOULD_START].append( + BackendReplica(self._controller_name, self._detached, + replica_tag, backend_tag)) elif delta_num_replicas < 0: logger.debug("Removing {} replicas from backend '{}'".format( -delta_num_replicas, backend_tag)) - assert len( - self.backend_replicas[backend_tag]) >= delta_num_replicas - replicas_copy = self.backend_replicas.copy() + assert self._target_replicas[backend_tag] >= delta_num_replicas + for _ in range(-delta_num_replicas): - replica_tag, _ = replicas_copy[backend_tag].popitem() + replica_state_dict = self._replicas[backend_tag] + list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \ + or replica_state_dict[ReplicaState.STARTING] \ + or replica_state_dict[ReplicaState.RUNNING] + + assert len(list_to_use), replica_state_dict + replica_to_stop = list_to_use.pop() graceful_timeout_s = (backend_info.backend_config. 
experimental_graceful_shutdown_timeout_s) - if force_kill: - graceful_timeout_s = 0 - self.backend_replicas_to_stop[backend_tag].append(( - replica_tag, - graceful_timeout_s, - )) - - def _start_pending_replicas(self): - for backend_tag, replicas_to_create in self.backend_replicas_to_start.\ - items(): - for replica_tag in replicas_to_create: - replica_handle = self._start_backend_replica( - backend_tag, replica_tag) - ready_future = replica_handle.ready.remote().as_future() - self.currently_starting_replicas[ready_future] = ( - backend_tag, replica_tag, replica_handle) - - def _stop_pending_replicas(self): - for backend_tag, replicas_to_stop in ( - self.backend_replicas_to_stop.items()): - for replica_tag, shutdown_timeout in replicas_to_stop: - replica_name = format_actor_name(replica_tag, - self._controller_name) - - async def kill_actor(replica_name_to_use): - # NOTE: the replicas may already be stopped if we failed - # after stopping them but before writing a checkpoint. - try: - replica = ray.get_actor(replica_name_to_use) - except ValueError: - return - - try: - await asyncio.wait_for( - replica.drain_pending_queries.remote(), - timeout=shutdown_timeout) - except asyncio.TimeoutError: - # Graceful period passed, kill it forcefully. 
- logger.debug( - f"{replica_name_to_use} did not shutdown after " - f"{shutdown_timeout}s, killing.") - finally: - ray.kill(replica, no_restart=True) - - self.currently_stopping_replicas[asyncio.ensure_future( - kill_actor(replica_name))] = (backend_tag, replica_tag) - - async def _check_currently_starting_replicas(self) -> int: - """Returns the number of pending replicas waiting to start""" - in_flight: Set[Future[Any]] = set() - - if self.currently_starting_replicas: - done, in_flight = await asyncio.wait( - list(self.currently_starting_replicas.keys()), timeout=0) - for fut in done: - (backend_tag, replica_tag, - replica_handle) = self.currently_starting_replicas.pop(fut) - self.backend_replicas[backend_tag][ - replica_tag] = replica_handle - - backend = self.backend_replicas_to_start.get(backend_tag) - if backend: - try: - backend.remove(replica_tag) - except ValueError: - pass - if len(backend) == 0: - del self.backend_replicas_to_start[backend_tag] - - async def _check_currently_stopping_replicas(self) -> int: - """Returns the number of replicas waiting to stop""" - in_flight: Set[Future[Any]] = set() - - if self.currently_stopping_replicas: - done_stopping, in_flight = await asyncio.wait( - list(self.currently_stopping_replicas.keys()), timeout=0) - for fut in done_stopping: - (backend_tag, - replica_tag) = self.currently_stopping_replicas.pop(fut) - - backend_to_stop = self.backend_replicas_to_stop.get( - backend_tag) - - if backend_to_stop: - try: - backend_to_stop.remove(replica_tag) - except ValueError: - pass - if len(backend_to_stop) == 0: - del self.backend_replicas_to_stop[backend_tag] - - backend = self.backend_replicas.get(backend_tag) - if backend: - try: - del backend[replica_tag] - except KeyError: - pass - - if len(self.backend_replicas[backend_tag]) == 0: - del self.backend_replicas[backend_tag] + + replica_to_stop.set_should_stop(graceful_timeout_s) + self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append( + replica_to_stop) + + return 
True + + def scale_all_backends(self): + checkpoint_needed = False + for backend_tag, num_replicas in list(self._target_replicas.items()): + checkpoint_needed = (checkpoint_needed + or self.scale_backend_replicas(backend_tag)) + if num_replicas == 0: + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if checkpoint_needed: + self._checkpoint() + + def _pop_replicas_of_state(self, state: ReplicaState + ) -> List[Tuple[ReplicaState, BackendTag]]: + replicas = [] + for backend_tag, state_to_replica_dict in self._replicas.items(): + if state in state_to_replica_dict: + replicas.extend( + (replica, backend_tag) + for replica in state_to_replica_dict.pop(state)) + + return replicas def _completed_goals(self) -> List[GoalId]: completed_goals = [] - all_tags = set(self.backend_replicas.keys()).union( - set(self.backends.keys())) + all_tags = set(self._replicas.keys()).union( + set(self._backend_metadata.keys())) for backend_tag in all_tags: - desired_info = self.backends.get(backend_tag) - existing_info = self.backend_replicas.get(backend_tag) + desired_num_replicas = self._target_replicas.get(backend_tag) + state_dict = self._replicas.get(backend_tag, {}) + existing_info = state_dict.get(ReplicaState.RUNNING, []) + + # If we have pending ops, the current goal is *not* ready + if (state_dict.get(ReplicaState.SHOULD_START) + or state_dict.get(ReplicaState.STARTING) + or state_dict.get(ReplicaState.SHOULD_STOP) + or state_dict.get(ReplicaState.STOPPING)): + continue + + # TODO(ilr): FIX # Check for deleting - if (not desired_info or - desired_info.backend_config.num_replicas == 0) and \ + if (not desired_num_replicas or + desired_num_replicas == 0) and \ (not existing_info or len(existing_info) == 0): - completed_goals.append(self.backend_goals.get(backend_tag)) + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) # Check for a non-zero number of backends - if desired_info and existing_info and 
desired_info.backend_config.\ - num_replicas == len(existing_info): - completed_goals.append(self.backend_goals.get(backend_tag)) + if (desired_num_replicas and existing_info) \ + and desired_num_replicas == len(existing_info): + completed_goals.append( + self.backend_goals.pop(backend_tag, None)) return [goal for goal in completed_goals if goal] async def update(self) -> bool: + self.scale_all_backends() + for goal_id in self._completed_goals(): self._goal_manager.complete_goal(goal_id) - self._start_pending_replicas() - self._stop_pending_replicas() - - num_starting = len(self.currently_starting_replicas) - num_stopping = len(self.currently_stopping_replicas) - - await self._check_currently_starting_replicas() - await self._check_currently_stopping_replicas() - - if (len(self.currently_starting_replicas) != num_starting) or \ - (len(self.currently_stopping_replicas) != num_stopping): + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_START): + replica_state.start(self._backend_metadata[backend_tag]) + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.SHOULD_STOP): + replica_state.stop() + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + transition_triggered = False + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STARTING): + if replica_state.check_started(): + self._replicas[backend_tag][ReplicaState.RUNNING].append( + replica_state) + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STARTING].append( + replica_state) + + for replica_state, backend_tag in self._pop_replicas_of_state( + ReplicaState.STOPPING): + if replica_state.check_stopped(): + transition_triggered = True + else: + self._replicas[backend_tag][ReplicaState.STOPPING].append( + replica_state) + + for backend_tag in list(self._replicas.keys()): + if not 
any(self._replicas[backend_tag]): + del self._replicas[backend_tag] + del self._backend_metadata[backend_tag] + del self._target_replicas[backend_tag] + + if transition_triggered: self._checkpoint() self._notify_replica_handles_changed() diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index 205af81b065a..41a1eca08ae8 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional import pydantic -from pydantic import BaseModel, PositiveFloat, PositiveInt, validator +from pydantic import BaseModel, confloat, PositiveFloat, PositiveInt, validator from ray.serve.constants import (ASYNC_CONCURRENCY, DEFAULT_HTTP_HOST, DEFAULT_HTTP_PORT) @@ -64,7 +64,7 @@ class BackendConfig(BaseModel): user_config: Any = None experimental_graceful_shutdown_wait_loop_s: PositiveFloat = 2.0 - experimental_graceful_shutdown_timeout_s: PositiveFloat = 20.0 + experimental_graceful_shutdown_timeout_s: confloat(ge=0) = 20.0 class Config: validate_assignment = True diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index a3c75c711878..b5c65111a8f9 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -118,7 +118,7 @@ async def run_control_loop(self) -> None: def _all_replica_handles( self) -> Dict[BackendTag, Dict[ReplicaTag, ActorHandle]]: """Used for testing.""" - return self.backend_state.get_replica_handles() + return self.backend_state.get_running_replica_handles() def get_all_backends(self) -> Dict[BackendTag, BackendConfig]: """Returns a dictionary of backend tag to backend config.""" @@ -235,7 +235,7 @@ async def shutdown(self) -> None: async with self.write_lock: for proxy in self.http_state.get_http_proxy_handles().values(): ray.kill(proxy, no_restart=True) - for replica_dict in self.backend_state.get_replica_handles( + for replica_dict in self.backend_state.get_running_replica_handles( ).values(): for replica in replica_dict.values(): 
ray.kill(replica, no_restart=True) diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index 202b01386059..a35f7e54b361 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -683,6 +683,9 @@ def f(): client.create_endpoint("endpoint", backend="backend") +# This error is only printed because creation is run in the control loop, not +# in the API path. +@pytest.mark.skip() def test_create_infeasible_error(serve_instance): client = serve_instance diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index 7ecba4d51735..de7003c39f8f 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -1,8 +1,10 @@ import os import requests +import sys import tempfile import time +import pytest import ray from ray.test_utils import wait_for_condition from ray import serve @@ -154,6 +156,7 @@ def __call__(self, *args): # Test that if there are multiple replicas for a worker and one dies # unexpectedly, the others continue to serve requests. +@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_worker_replica_failure(serve_instance): client = serve_instance From 2f482193b9f0c1146f66a629eb216968746c9b1e Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 26 Jan 2021 14:14:51 -0800 Subject: [PATCH 058/245] Revert "[CLI] Fix Ray Status with ENV Variable set (#13707)" (#13719) This reverts commit 5d82654022307a8da7bdcfd8ebf211e7c29f5bc8. 
--- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 19 ------------------- .../test_cli_patterns/test_ray_status.txt | 12 ------------ 3 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 435c16d4eebc..c9ea996f9c0c 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return os.environ.get("RAY_ADDRESS") + return "auto" # Avoid conflict with RAY_ADDRESS env var return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index a6f1b1989ae9..57bf61419690 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -415,24 +415,5 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) -def test_ray_status(): - import ray - address = ray.init().get("redis_address") - runner = CliRunner() - result = runner.invoke(scripts.status, []) - _check_output_via_pattern("test_ray_status.txt", result) - - result_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_arg) - - # Try to check status with RAY_ADDRESS set - os.environ["RAY_ADDRESS"] = address - result_env = runner.invoke(scripts.status) - _check_output_via_pattern("test_ray_status.txt", result_env) - - result_env_arg = runner.invoke(scripts.status, ["--address", address]) - _check_output_via_pattern("test_ray_status.txt", result_env_arg) - - if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt deleted file mode 100644 index 7169c5f0f096..000000000000 --- 
a/python/ray/tests/test_cli_patterns/test_ray_status.txt +++ /dev/null @@ -1,12 +0,0 @@ -======== Cluster status: .+ -Node status ------------------------------------------------------------- - - -Resources ------------------------------------------------------------- -Usage: - - -Demands: - \(no resource demands\) From 4f4e1b664bc46d329ff67f29ce380b71c1af36dd Mon Sep 17 00:00:00 2001 From: Rand Xie Date: Tue, 26 Jan 2021 14:15:35 -0800 Subject: [PATCH 059/245] Fix multiprocessing starmap to allow passing in zip (#13664) --- python/ray/tests/test_multiprocessing.py | 1 + python/ray/util/multiprocessing/pool.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/tests/test_multiprocessing.py b/python/ray/tests/test_multiprocessing.py index 3f63b72db19a..8ec3cb43c7df 100644 --- a/python/ray/tests/test_multiprocessing.py +++ b/python/ray/tests/test_multiprocessing.py @@ -340,6 +340,7 @@ def f(*args): args = [tuple(range(i)) for i in range(100)] assert pool.starmap(f, args) == args + assert pool.starmap(lambda x, y: x + y, zip([1, 2], [3, 4])) == [4, 6] def test_callbacks(pool_4_processes): diff --git a/python/ray/util/multiprocessing/pool.py b/python/ray/util/multiprocessing/pool.py index 2d8f3d5fb911..9910bc3a46a9 100644 --- a/python/ray/util/multiprocessing/pool.py +++ b/python/ray/util/multiprocessing/pool.py @@ -494,7 +494,7 @@ def _submit_chunk(self, def _chunk_and_run(self, func, iterable, chunksize=None, unpack_args=False): if not hasattr(iterable, "__len__"): - iterable = [iterable] + iterable = list(iterable) if chunksize is None: chunksize = self._calculate_chunksize(iterable) From 4db0a31130832e1e8dde4d903635000c11f8b29a Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Tue, 26 Jan 2021 15:26:45 -0800 Subject: [PATCH 060/245] [Core] Better error if /dev/shm is too small (#13624) --- python/ray/_private/services.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/ray/_private/services.py 
b/python/ray/_private/services.py index c9ea996f9c0c..688babad6ac9 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1622,10 +1622,11 @@ def determine_plasma_store_config(object_store_memory, "This will harm performance! You may be able to free up " "space by deleting files in /dev/shm. If you are inside a " "Docker container, you can increase /dev/shm size by " - "passing '--shm-size=Xgb' to 'docker run' (or add it to " - "the run_options list in a Ray cluster config). Make sure " - "to set this to more than 2gb.".format( - ray.utils.get_user_temp_dir(), shm_avail)) + "passing '--shm-size={:.2f}gb' to 'docker run' (or add it " + "to the run_options list in a Ray cluster config). Make " + "sure to set this to more than 30% of available RAM.". + format(ray.utils.get_user_temp_dir(), shm_avail, + object_store_memory * (1.1) / (2**30))) else: plasma_directory = ray.utils.get_user_temp_dir() From 9cf0c49015732d6f7cb0a8ff92ff95b12ff1965a Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 26 Jan 2021 16:12:13 -0800 Subject: [PATCH 061/245] [CI] Skip test_multi_node_3 on Windows (#13723) test_multi_node_3 was recently split from test_multi_node, but we forgot to skip it on Windows --- ci/travis/ci.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index d9c679bc7218..82286c8c211c 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -156,6 +156,7 @@ test_python() { -python/ray/tests:test_metrics_agent # timeout -python/ray/tests:test_multi_node -python/ray/tests:test_multi_node_2 + -python/ray/tests:test_multi_node_3 -python/ray/tests:test_multiprocessing # test_connect_to_ray() fails to connect to raylet -python/ray/tests:test_node_manager -python/ray/tests:test_object_manager From 8baafacb1eed91ea399dbf4c43221424d9e7ac6a Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Tue, 26 Jan 2021 20:15:55 -0800 Subject: [PATCH 062/245] [Logging] Log rotation config (#13375) * In Progress. * formatting. 
* in progress. * linting. * Done. * Fix typo. * Fixed the issue. --- python/ray/_private/services.py | 67 ++++++++++------ python/ray/node.py | 32 +++++++- python/ray/ray_constants.py | 11 ++- python/ray/tests/test_logging.py | 112 +++++++++++++++++++++++++++ python/ray/workers/default_worker.py | 15 ++++ src/ray/common/ray_config_def.h | 9 +++ src/ray/util/logging.cc | 16 +++- 7 files changed, 228 insertions(+), 34 deletions(-) create mode 100644 python/ray/tests/test_logging.py diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index 688babad6ac9..d0eafc9693c6 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -1045,7 +1045,9 @@ def start_log_monitor(redis_address, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None): + fate_share=None, + max_bytes=0, + backup_count=0): """Start a log monitor process. Args: @@ -1056,17 +1058,20 @@ def start_log_monitor(redis_address, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. + max_bytes (int): Log rotation parameter. Corresponding to + RotatingFileHandler's maxBytes. + backup_count (int): Log rotation parameter. Corresponding to + RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" log_monitor_filepath = os.path.join(RAY_PATH, "log_monitor.py") command = [ - sys.executable, - "-u", - log_monitor_filepath, - f"--redis-address={redis_address}", - f"--logs-dir={logs_dir}", + sys.executable, "-u", log_monitor_filepath, + f"--redis-address={redis_address}", f"--logs-dir={logs_dir}", + f"--logging-rotate-bytes={max_bytes}", + f"--logging-rotate-backup-count={backup_count}" ] if redis_password: command += ["--redis-password", redis_password] @@ -1088,7 +1093,9 @@ def start_dashboard(require_dashboard, stdout_file=None, stderr_file=None, redis_password=None, - fate_share=None): + fate_share=None, + max_bytes=0, + backup_count=0): """Start a dashboard process. Args: @@ -1107,6 +1114,10 @@ def start_dashboard(require_dashboard, stderr_file: A file handle opened for writing to redirect stderr to. If no redirection should happen, then this should be None. redis_password (str): The password of the redis server. + max_bytes (int): Log rotation parameter. Corresponding to + RotatingFileHandler's maxBytes. + backup_count (int): Log rotation parameter. Corresponding to + RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
@@ -1132,14 +1143,11 @@ def start_dashboard(require_dashboard, dashboard_dir = "new_dashboard" dashboard_filepath = os.path.join(RAY_PATH, dashboard_dir, "dashboard.py") command = [ - sys.executable, - "-u", - dashboard_filepath, - f"--host={host}", - f"--port={port}", - f"--redis-address={redis_address}", - f"--temp-dir={temp_dir}", - f"--log-dir={logdir}", + sys.executable, "-u", dashboard_filepath, f"--host={host}", + f"--port={port}", f"--redis-address={redis_address}", + f"--temp-dir={temp_dir}", f"--log-dir={logdir}", + f"--logging-rotate-bytes={max_bytes}", + f"--logging-rotate-backup-count={backup_count}" ] if redis_password: @@ -1258,7 +1266,9 @@ def start_raylet(redis_address, fate_share=None, socket_to_use=None, head_node=False, - start_initial_python_workers_for_first_job=False): + start_initial_python_workers_for_first_job=False, + max_bytes=0, + backup_count=0): """Start a raylet, which is a combined local scheduler and object manager. Args: @@ -1295,6 +1305,10 @@ def start_raylet(redis_address, config (dict|None): Optional Raylet configuration that will override defaults in RayConfig. java_worker_options (list): The command options for Java worker. + max_bytes (int): Log rotation parameter. Corresponding to + RotatingFileHandler's maxBytes. + backup_count (int): Log rotation parameter. Corresponding to + RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" @@ -1372,6 +1386,8 @@ def start_raylet(redis_address, f"--config-list={config_str}", f"--temp-dir={temp_dir}", f"--metrics-agent-port={metrics_agent_port}", + f"--logging-rotate-bytes={max_bytes}", + f"--logging-rotate-backup-count={backup_count}", "RAY_WORKER_DYNAMIC_OPTION_PLACEHOLDER", ] if redis_password: @@ -1402,6 +1418,8 @@ def start_raylet(redis_address, f"--raylet-name={raylet_name}", f"--temp-dir={temp_dir}", f"--log-dir={log_dir}", + f"--logging-rotate-bytes={max_bytes}", + f"--logging-rotate-backup-count={backup_count}", ] if redis_password is not None and len(redis_password) != 0: @@ -1780,7 +1798,9 @@ def start_monitor(redis_address, stderr_file=None, autoscaling_config=None, redis_password=None, - fate_share=None): + fate_share=None, + max_bytes=0, + backup_count=0): """Run a process to monitor the other processes. Args: @@ -1792,17 +1812,20 @@ def start_monitor(redis_address, no redirection should happen, then this should be None. autoscaling_config: path to autoscaling config file. redis_password (str): The password of the redis server. + max_bytes (int): Log rotation parameter. Corresponding to + RotatingFileHandler's maxBytes. + backup_count (int): Log rotation parameter. Corresponding to + RotatingFileHandler's backupCount. Returns: ProcessInfo for the process that was started. 
""" monitor_path = os.path.join(RAY_PATH, "monitor.py") command = [ - sys.executable, - "-u", - monitor_path, - f"--logs-dir={logs_dir}", - "--redis-address=" + str(redis_address), + sys.executable, "-u", monitor_path, f"--logs-dir={logs_dir}", + f"--redis-address={redis_address}", + f"--logging-rotate-bytes={max_bytes}", + f"--logging-rotate-backup-count={backup_count}" ] if autoscaling_config: command.append("--autoscaling-config=" + str(autoscaling_config)) diff --git a/python/ray/node.py b/python/ray/node.py index 086865023e54..9130b39fbe86 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -142,6 +142,18 @@ def __init__(self, if "plasma_store_as_thread" not in self._config: self._config["plasma_store_as_thread"] = True + # Configure log rotation parameters. + self.max_bytes = int( + os.getenv("RAY_ROTATION_MAX_BYTES", + ray_constants.LOGGING_ROTATE_BYTES)) + self.backup_count = int( + os.getenv("RAY_ROTATION_BACKUP_COUNT", + ray_constants.LOGGING_ROTATE_BACKUP_COUNT)) + + assert self.max_bytes >= 0 + assert self.backup_count >= 0 + + # Register the temp dir. 
if head: redis_client = None # date including microsecond @@ -387,6 +399,14 @@ def socket(self): except AttributeError: return None + @property + def logging_config(self): + """Get the logging config of the current node.""" + return { + "log_rotation_max_bytes": self.max_bytes, + "log_rotation_backup_count": self.backup_count + } + @property def address_info(self): """Get a dictionary of addresses.""" @@ -653,7 +673,9 @@ def start_log_monitor(self): stdout_file=subprocess.DEVNULL, stderr_file=subprocess.DEVNULL, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share) + fate_share=self.kernel_fate_share, + max_bytes=self.max_bytes, + backup_count=self.backup_count) assert ray_constants.PROCESS_TYPE_LOG_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_LOG_MONITOR] = [ process_info, @@ -677,6 +699,8 @@ def start_dashboard(self, require_dashboard): stderr_file=subprocess.DEVNULL, # Avoid hang(fd inherit) redis_password=self._ray_params.redis_password, fate_share=self.kernel_fate_share, + max_bytes=self.max_bytes, + backup_count=self.backup_count, port=self._ray_params.dashboard_port) assert ray_constants.PROCESS_TYPE_DASHBOARD not in self.all_processes if process_info is not None: @@ -772,6 +796,8 @@ def start_raylet(self, fate_share=self.kernel_fate_share, socket_to_use=self.socket, head_node=self.head, + max_bytes=self.max_bytes, + backup_count=self.backup_count, start_initial_python_workers_for_first_job=self._ray_params. 
start_initial_python_workers_for_first_job) assert ray_constants.PROCESS_TYPE_RAYLET not in self.all_processes @@ -797,7 +823,9 @@ def start_monitor(self): stderr_file=stderr_file, autoscaling_config=self._ray_params.autoscaling_config, redis_password=self._ray_params.redis_password, - fate_share=self.kernel_fate_share) + fate_share=self.kernel_fate_share, + max_bytes=self.max_bytes, + backup_count=self.backup_count) assert ray_constants.PROCESS_TYPE_MONITOR not in self.all_processes self.all_processes[ray_constants.PROCESS_TYPE_MONITOR] = [process_info] diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index a5459b8637ba..04dfd8f173b7 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -150,12 +150,9 @@ def to_memory_units(memory_bytes, round_up): LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"] LOGGER_LEVEL_HELP = ("The logging level threshold, choices=['debug', 'info'," " 'warning', 'error', 'critical'], default='info'") -# Default param for RotatingFileHandler -# maxBytes. 10G by default. We intentionally set the default value high -# so that users who won't care don't know about the existence of this. -LOGGING_ROTATE_BYTES = 10 * 1000 * 1000 * 1000 -# The default will grow logs up until 500GB without log loss. -LOGGING_ROTATE_BACKUP_COUNT = 50 # backupCount + +LOGGING_ROTATE_BYTES = 512 * 1024 * 1024 # 512MB. +LOGGING_ROTATE_BACKUP_COUNT = 5 # 5 Backup files at max. # Constants used to define the different process types. 
PROCESS_TYPE_REAPER = "reaper" @@ -172,6 +169,8 @@ def to_memory_units(memory_bytes, round_up): PROCESS_TYPE_REDIS_SERVER = "redis_server" PROCESS_TYPE_WEB_UI = "web_ui" PROCESS_TYPE_GCS_SERVER = "gcs_server" +PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER = "python-core-driver" +PROCESS_TYPE_PYTHON_CORE_WORKER = "python-core-worker" # Log file names MONITOR_LOG_FILE_NAME = f"{PROCESS_TYPE_MONITOR}.log" diff --git a/python/ray/tests/test_logging.py b/python/ray/tests/test_logging.py new file mode 100644 index 000000000000..6796ac4f7187 --- /dev/null +++ b/python/ray/tests/test_logging.py @@ -0,0 +1,112 @@ +import os +from collections import defaultdict +from pathlib import Path + +import ray +from ray import ray_constants + + +def set_logging_config(max_bytes, backup_count): + os.environ["RAY_ROTATION_MAX_BYTES"] = str(max_bytes) + os.environ["RAY_ROTATION_BACKUP_COUNT"] = str(backup_count) + + +def test_log_rotation_config(ray_start_cluster): + cluster = ray_start_cluster + max_bytes = 100 + backup_count = 3 + + # Create a cluster. + set_logging_config(max_bytes, backup_count) + head_node = cluster.add_node(num_cpus=0) + # Set a different env var for a worker node. 
+ set_logging_config(0, 0) + worker_node = cluster.add_node(num_cpus=0) + cluster.wait_for_nodes() + + config = head_node.logging_config + assert config["log_rotation_max_bytes"] == max_bytes + assert config["log_rotation_backup_count"] == backup_count + config = worker_node.logging_config + assert config["log_rotation_max_bytes"] == 0 + assert config["log_rotation_backup_count"] == 0 + + +def test_log_rotation(shutdown_only): + max_bytes = 1 + backup_count = 3 + set_logging_config(max_bytes, backup_count) + ray.init(num_cpus=1) + session_dir = ray.worker.global_worker.node.address_info["session_dir"] + session_path = Path(session_dir) + log_dir_path = session_path / "logs" + + log_rotating_component = [ + ray_constants.PROCESS_TYPE_DASHBOARD, + ray_constants.PROCESS_TYPE_DASHBOARD_AGENT, + ray_constants.PROCESS_TYPE_LOG_MONITOR, + ray_constants.PROCESS_TYPE_MONITOR, + ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER, + ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER, + # Below components are not log rotating now. + # ray_constants.PROCESS_TYPE_RAYLET, + # ray_constants.PROCESS_TYPE_GCS_SERVER, + # ray_constants.PROCESS_TYPE_WORKER, + ] + + # Run the basic workload. + @ray.remote + def f(): + for i in range(10): + print(f"test {i}") + + ray.get(f.remote()) + + paths = list(log_dir_path.iterdir()) + + def component_exist(component, paths): + for path in paths: + filename = path.stem + if component in filename: + return True + return False + + def component_file_size_small_enough(component): + """Although max_bytes is 1, the file can have size that is big. + For example, if the logger prints the traceback, it can be + much bigger. So, we shouldn't make the assertion too tight. + """ + small_enough_bytes = 512 # 512 bytes. 
+ for path in paths: + if not component_exist(component, [path]): + continue + + if path.stat().st_size > small_enough_bytes: + return False + return True + + for component in log_rotating_component: + assert component_exist(component, paths) + assert component_file_size_small_enough(component) + + # Check if the backup count is respected. + file_cnts = defaultdict(int) + for path in paths: + filename = path.stem + filename_without_suffix = filename.split(".")[0] + file_cnts[filename_without_suffix] += 1 + for filename, file_cnt in file_cnts.items(): + # There could be backup_count + 1 files. + # EX) *.log, *.log.* (as many as backup count). + assert file_cnt <= backup_count + 1, ( + f"{filename} has files that are more than " + f"backup count {backup_count}, file count: {file_cnt}") + + +if __name__ == "__main__": + import pytest + import sys + # Make subprocess happy in bazel. + os.environ["LC_ALL"] = "en_US.UTF-8" + os.environ["LANG"] = "en_US.UTF-8" + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/workers/default_worker.py b/python/ray/workers/default_worker.py index d9f7837ff2ce..7b9c2677bd0b 100644 --- a/python/ray/workers/default_worker.py +++ b/python/ray/workers/default_worker.py @@ -109,6 +109,21 @@ help="A list of directories or jar files separated by colon that specify " "the search path for user code. 
This will be used as `CLASSPATH` in " "Java and `PYTHONPATH` in Python.") +parser.add_argument( + "--logging-rotate-bytes", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BYTES, + help="Specify the max bytes for rotating " + "log file, default is " + f"{ray_constants.LOGGING_ROTATE_BYTES} bytes.") +parser.add_argument( + "--logging-rotate-backup-count", + required=False, + type=int, + default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT, + help="Specify the backup count of rotated log file, default is " + f"{ray_constants.LOGGING_ROTATE_BACKUP_COUNT}.") if __name__ == "__main__": # NOTE(sang): For some reason, if we move the code below # to a separate function, tensorflow will capture that method diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index d06a1c358196..cd6bd84cee9c 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -369,3 +369,12 @@ RAY_CONFIG(bool, is_external_storage_type_fs, true) /// Whether to enable locality-aware leasing. If enabled, then Ray will consider task /// dependency locality when choosing a worker for leasing. RAY_CONFIG(bool, locality_aware_leasing_enabled, true) + +/* Configuration parameters for logging */ +/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's +/// maxBytes argument. +RAY_CONFIG(int64_t, log_rotation_max_bytes, 100 * 1024 * 1024) + +/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's +/// backupCount argument. +RAY_CONFIG(int64_t, log_rotation_backup_count, 5) diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 1640c5cfc657..b06d64441087 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -307,11 +307,19 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres #endif // Reset log pattern and level and we assume a log file can be rotated with // 10 files in max size 512M by default. 
- if (getenv("RAY_ROTATION_MAX_SIZE")) { - log_rotation_max_size_ = std::atol(getenv("RAY_RAOTATION_MAX_SIZE")); + if (getenv("RAY_ROTATION_MAX_BYTES")) { + long max_size = std::atol(getenv("RAY_ROTATION_MAX_BYTES")); + // 0 means no log rotation in python, but not in spdlog. We just use the default + // value here. + if (max_size != 0) { + log_rotation_max_size_ = max_size; + } } - if (getenv("RAY_ROTATION_FILE_NUM")) { - log_rotation_file_num_ = std::atol(getenv("RAY_ROTATION_FILE_NUM")); + if (getenv("RAY_ROTATION_BACKUP_COUNT")) { + long file_num = std::atol(getenv("RAY_ROTATION_BACKUP_COUNT")); + if (file_num != 0) { + log_rotation_file_num_ = file_num; + } } spdlog::set_pattern(log_format_pattern_); spdlog::set_level(static_cast(severity_threshold_)); From d2963f4ee13c8c32f83fb2c6dcb91ff812d37990 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Tue, 26 Jan 2021 23:10:29 -0800 Subject: [PATCH 063/245] [Object Spilling] Clean up FS storage upon sigint for ray.init(). (#13649) * Initial iteration done. * Remove unnecessary messages. * Addressed code review. * Addressed code review. * fix issues. * addressed code review. * Addressed the last code review. 
--- python/ray/external_storage.py | 43 +++++++- python/ray/node.py | 12 +++ python/ray/tests/test_object_spilling.py | 129 ++++++++++++++++------- python/ray/worker.py | 2 + 4 files changed, 146 insertions(+), 40 deletions(-) diff --git a/python/ray/external_storage.py b/python/ray/external_storage.py index 6e16351482cd..f764e9c0fc5e 100644 --- a/python/ray/external_storage.py +++ b/python/ray/external_storage.py @@ -1,5 +1,7 @@ import abc +import logging import os +import shutil import urllib from collections import namedtuple from typing import List, IO, Tuple @@ -9,6 +11,7 @@ from ray._raylet import ObjectRef ParsedURL = namedtuple("ParsedURL", "base_url, offset, size") +logger = logging.getLogger(__name__) def create_url_with_offset(*, url: str, offset: int, size: int) -> str: @@ -176,6 +179,14 @@ def delete_spilled_objects(self, urls: List[str]): urls: URLs that store spilled object files. """ + @abc.abstractmethod + def destroy_external_storage(self): + """Destroy external storage when a head node is down. + + NOTE: This is currently working when the cluster is + started by ray.init + """ + class NullStorage(ExternalStorage): """The class that represents an uninitialized external storage.""" @@ -189,6 +200,9 @@ def restore_spilled_objects(self, object_refs, url_with_offset_list): def delete_spilled_objects(self, urls: List[str]): raise NotImplementedError("External storage is not initialized") + def destroy_external_storage(self): + raise NotImplementedError("External storage is not initialized") + class FileSystemStorage(ExternalStorage): """The class for filesystem-like external storage. 
@@ -199,8 +213,8 @@ class FileSystemStorage(ExternalStorage): """ def __init__(self, directory_path): - self.directory_path = directory_path - self.prefix = DEFAULT_OBJECT_PREFIX + self.spill_dir_name = DEFAULT_OBJECT_PREFIX + self.directory_path = os.path.join(directory_path, self.spill_dir_name) os.makedirs(self.directory_path, exist_ok=True) if not os.path.exists(self.directory_path): raise ValueError("The given directory path to store objects, " @@ -211,7 +225,7 @@ def spill_objects(self, object_refs) -> List[str]: return [] # Always use the first object ref as a key when fusioning objects. first_ref = object_refs[0] - filename = f"{self.prefix}-{first_ref.hex()}-multi-{len(object_refs)}" + filename = f"{first_ref.hex()}-multi-{len(object_refs)}" url = f"{os.path.join(self.directory_path, filename)}" with open(url, "wb") as f: return self._write_multiple_objects(f, object_refs, url) @@ -243,6 +257,25 @@ def delete_spilled_objects(self, urls: List[str]): filename = parse_url_with_offset(url.decode()).base_url os.remove(os.path.join(self.directory_path, filename)) + def destroy_external_storage(self): + # Q: Should we add stdout here to + # indicate we are deleting a directory? + + # There's a race condition where IO workers are still + # deleting each objects while we try deleting the + # whole directory. So we should keep trying it until + # The directory is actually deleted. + while os.path.isdir(self.directory_path): + try: + shutil.rmtree(self.directory_path) + except FileNotFoundError: + # If excpetion occurs when other IO workers are + # deleting the file at the same time. + pass + except Exception: + logger.exception("Error cleaning up spill files") + break + class ExternalStorageSmartOpenImpl(ExternalStorage): """The external storage class implemented by smart_open. 
@@ -331,6 +364,9 @@ def restore_spilled_objects(self, object_refs: List[ObjectRef], def delete_spilled_objects(self, urls: List[str]): pass + def destroy_external_storage(self): + pass + _external_storage = NullStorage() @@ -353,6 +389,7 @@ def setup_external_storage(config): raise ValueError(f"Unknown external storage type: {storage_type}") else: _external_storage = NullStorage() + return _external_storage def reset_external_storage(): diff --git a/python/ray/node.py b/python/ray/node.py index 9130b39fbe86..2668d9aa0735 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -421,6 +421,9 @@ def address_info(self): "metrics_export_port": self._metrics_export_port } + def is_head(self): + return self.head + def create_redis_client(self): """Create a redis client.""" return ray._private.services.create_redis_client( @@ -1152,3 +1155,12 @@ def remaining_processes_alive(self): True if any process that wasn't explicitly killed is still alive. """ return not any(self.dead_processes()) + + def destroy_external_storage(self): + object_spilling_config = self._config.get("object_spilling_config", {}) + if object_spilling_config: + object_spilling_config = json.loads(object_spilling_config) + from ray import external_storage + storage = external_storage.setup_external_storage( + object_spilling_config) + storage.destroy_external_storage() diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index a80a91580c6f..3f5b5f7ae885 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ -3,6 +3,7 @@ import os import random import platform +import subprocess import sys import numpy as np @@ -10,7 +11,7 @@ import ray from ray.external_storage import (create_url_with_offset, parse_url_with_offset) -from ray.test_utils import wait_for_condition +from ray.test_utils import wait_for_condition, run_string_as_driver from ray.internal.internal_api import memory_summary bucket_name = 
"object-spilling-test" @@ -68,6 +69,17 @@ def multi_node_object_spilling_config(request, tmp_path): yield create_object_spilling_config(request, tmp_path) +def is_dir_empty(temp_folder, + append_path=ray.ray_constants.DEFAULT_OBJECT_PREFIX): + # append_path is used because the file based spilling will append + # new directory path. + num_files = 0 + temp_folder = temp_folder / append_path + for path in temp_folder.iterdir(): + num_files += 1 + return num_files == 0 + + def test_invalid_config_raises_exception(shutdown_only): # Make sure ray.init raises an exception before # it starts processes when invalid object spilling @@ -120,13 +132,7 @@ def test_spilling_not_done_for_pinned_object(object_spilling_config, with pytest.raises(ray.exceptions.ObjectStoreFullError): ref2 = ray.put(arr) # noqa - def is_dir_empty(): - num_files = 0 - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - - wait_for_condition(is_dir_empty) + wait_for_condition(lambda: is_dir_empty(temp_folder)) @pytest.mark.skipif( @@ -203,7 +209,7 @@ def test_spill_objects_automatically(object_spilling_config, shutdown_only): ref = ray.put(arr) replay_buffer.append(ref) solution_buffer.append(arr) - + print("spill done.") # randomly sample objects for _ in range(1000): index = random.choice(list(range(buffer_length))) @@ -317,6 +323,7 @@ def test_spill_deadlock(object_spilling_config, shutdown_only): def test_delete_objects(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. 
object_spilling_config, temp_folder = object_spilling_config + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -337,15 +344,9 @@ def test_delete_objects(object_spilling_config, shutdown_only): print("-----------------------------------") - def is_dir_empty(): - num_files = 0 - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - del replay_buffer del ref - wait_for_condition(is_dir_empty) + wait_for_condition(lambda: is_dir_empty(temp_folder)) @pytest.mark.skipif( @@ -354,6 +355,7 @@ def test_delete_objects_delete_while_creating(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -381,16 +383,10 @@ def test_delete_objects_delete_while_creating(object_spilling_config, sample = ray.get(ref, timeout=0) assert np.array_equal(sample, arr) - def is_dir_empty(): - num_files = 0 - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - # After all, make sure all objects are killed without race condition. del replay_buffer del ref - wait_for_condition(is_dir_empty, timeout=1000) + wait_for_condition(lambda: is_dir_empty(temp_folder)) @pytest.mark.skipif( @@ -399,6 +395,7 @@ def test_delete_objects_on_worker_failure(object_spilling_config, shutdown_only): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = object_spilling_config + ray.init( object_store_memory=75 * 1024 * 1024, _system_config={ @@ -449,14 +446,8 @@ def wait_until_actor_dead(): wait_for_condition(wait_until_actor_dead) - def is_dir_empty(): - num_files = 0 - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - # After all, make sure all objects are deleted upon worker failures. 
- wait_for_condition(is_dir_empty, timeout=1000) + wait_for_condition(lambda: is_dir_empty(temp_folder)) @pytest.mark.skipif( @@ -465,6 +456,7 @@ def test_delete_objects_multi_node(multi_node_object_spilling_config, ray_start_cluster): # Limit our object store to 75 MiB of memory. object_spilling_config, temp_folder = multi_node_object_spilling_config + cluster = ray_start_cluster # Head node. cluster.add_node( @@ -518,18 +510,12 @@ def wait_until_actor_dead(actor): return True return False - def is_dir_empty(): - num_files = 0 - for path in temp_folder.iterdir(): - num_files += 1 - return num_files == 0 - # Kill actors to remove all references. for actor in actors: ray.kill(actor) wait_for_condition(lambda: wait_until_actor_dead(actor)) # The multi node deletion should work. - wait_for_condition(is_dir_empty) + wait_for_condition(lambda: is_dir_empty(temp_folder)) @pytest.mark.skipif(platform.system() == "Windows", reason="Flaky on Windows.") @@ -570,6 +556,9 @@ def test_fusion_objects(object_spilling_config, shutdown_only): assert np.array_equal(sample, solution) is_test_passing = False + # Since we'd like to see the temp directory that stores the files, + # we need to append this directory. + temp_folder = temp_folder / ray.ray_constants.DEFAULT_OBJECT_PREFIX for path in temp_folder.iterdir(): file_size = path.stat().st_size # Make sure there are at least one @@ -691,5 +680,71 @@ def allocate(*args): ray.get(tasks) +@pytest.mark.skipif( + platform.system() in ["Windows"], reason="Failing on " + "Windows and Mac.") +def test_file_deleted_when_driver_exits(tmp_path, shutdown_only): + # Limit our object store to 75 MiB of memory. 
+ temp_folder = tmp_path / "spill" + temp_folder.mkdir() + + driver = """ +import json +import os +import signal +import numpy as np + +import ray + +ray.init( + object_store_memory=75 * 1024 * 1024, + _system_config={{ + "max_io_workers": 2, + "min_spilling_size": 0, + "automatic_object_spilling_enabled": True, + "object_store_full_delay_ms": 100, + "object_spilling_config": json.dumps({{ + "type": "filesystem", + "params": {{ + "directory_path": "{temp_dir}" + }} + }}), + }}) +arr = np.random.rand(1024 * 1024) # 8 MB data +replay_buffer = [] + +# Spill lots of objects +for _ in range(30): + ref = None + while ref is None: + ref = ray.put(arr) + replay_buffer.append(ref) +# Send sigterm to itself. +signum = {signum} +sig = None +if signum == 2: + sig = signal.SIGINT +elif signum == 15: + sig = signal.SIGTERM +os.kill(os.getpid(), sig) +""" + + # Run a driver with sigint. + print("Sending sigint...") + with pytest.raises(subprocess.CalledProcessError): + print( + run_string_as_driver( + driver.format(temp_dir=str(temp_folder), signum=2))) + wait_for_condition(lambda: is_dir_empty(temp_folder, append_path="")) + + # Q: Looks like Sigterm doesn't work with Ray? + # print("Sending sigterm...") + # # Run a driver with sigterm. + # with pytest.raises(subprocess.CalledProcessError): + # print(run_string_as_driver( + # driver.format(temp_dir=str(temp_folder), signum=15))) + # wait_for_condition(is_dir_empty, timeout=1000) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/worker.py b/python/ray/worker.py index 350bbc6491e5..337b4ffc95fe 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -818,6 +818,8 @@ def shutdown(_exiting_interpreter=False): # Shut down the Ray processes. 
global _global_node if _global_node is not None: + if _global_node.is_head(): + _global_node.destroy_external_storage() _global_node.kill_all_processes(check_alive=False, allow_graceful=True) _global_node = None From 7f6d326ad843d698f29ba2e214ee50249fa5034c Mon Sep 17 00:00:00 2001 From: "DK.Pino" Date: Wed, 27 Jan 2021 18:51:26 +0800 Subject: [PATCH 064/245] [Placement Group]Add detached support for placement group. (#13582) --- doc/source/placement-group.rst | 36 ++++++ python/ray/_raylet.pyx | 6 +- python/ray/actor.py | 4 +- python/ray/includes/common.pxd | 3 +- python/ray/tests/test_placement_group.py | 113 ++++++++++++++++++ python/ray/util/placement_group.py | 17 ++- src/ray/common/placement_group.h | 6 +- src/ray/core_worker/common.h | 9 +- src/ray/core_worker/core_worker.cc | 4 +- ...io_ray_runtime_task_NativeTaskSubmitter.cc | 3 +- .../gcs_server/gcs_placement_group_manager.cc | 12 +- .../gcs_server/gcs_placement_group_manager.h | 8 +- src/ray/gcs/test/gcs_test_util.h | 5 +- src/ray/protobuf/common.proto | 2 + src/ray/protobuf/gcs.proto | 2 + 15 files changed, 209 insertions(+), 21 deletions(-) diff --git a/doc/source/placement-group.rst b/doc/source/placement-group.rst index 6fe8bc3a894d..1424b850c9c8 100644 --- a/doc/source/placement-group.rst +++ b/doc/source/placement-group.rst @@ -252,6 +252,42 @@ Note that you can anytime remove the placement group to clean up resources. ray.shutdown() +Placement Group Lifetimes +------------------------- + +.. tabs:: + .. group-tab:: Python + + By default, the lifetimes of placement groups are not detached and will be destroyed + when the driver is terminated (but, if it is created from a detached actor, it is + killed when the detached actor is killed). If you'd like to keep the placement group + alive regardless of its job or detached actor, you should specify + `lifetime="detached"`. For example: + + .. 
code-block:: python + + # first_driver.py + pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="STRICT_SPREAD", lifetime="detached") + ray.get(pg.ready()) + + The placement group's lifetime will be independent of the driver now. This means it + is possible to retrieve the placement group from other drivers regardless of when + the current driver exits. Let's see an example: + + .. code-block:: python + + # second_driver.py + table = ray.util.placement_group_table() + print(len(table)) + + Note that the lifetime option is decoupled from the name. If we only specified + the name without specifying ``lifetime="detached"``, then the placement group can + only be retrieved as long as the original driver is still running. + + .. group-tab:: Java + + The lifetime argument is not implemented for Java APIs yet. + Tips for Using Placement Groups ------------------------------- - Learn the :ref:`lifecycle ` of placement groups. diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 8ba80852fb40..0fc3f4bf25da 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -1184,7 +1184,8 @@ cdef class CoreWorker: self, c_string name, c_vector[unordered_map[c_string, double]] bundles, - c_string strategy): + c_string strategy, + c_bool is_detached): cdef: CPlacementGroupID c_placement_group_id CPlacementStrategy c_strategy @@ -1208,7 +1209,8 @@ cdef class CoreWorker: CPlacementGroupCreationOptions( name, c_strategy, - bundles + bundles, + is_detached ), &c_placement_group_id)) diff --git a/python/ray/actor.py b/python/ray/actor.py index 499cd1eacd36..547a2929db15 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -584,7 +584,9 @@ def _remote(self, elif lifetime == "detached": detached = True else: - raise ValueError("lifetime must be either `None` or 'detached'") + raise ValueError( + "actor `lifetime` argument must be either `None` or 'detached'" + ) if placement_group_capture_child_tasks is None: placement_group_capture_child_tasks = ( diff 
--git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index a7ba4b23b8b2..679ff6f0aa3b 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -270,7 +270,8 @@ cdef extern from "ray/core_worker/common.h" nogil: CPlacementGroupCreationOptions( const c_string &name, CPlacementStrategy strategy, - const c_vector[unordered_map[c_string, double]] &bundles + const c_vector[unordered_map[c_string, double]] &bundles, + c_bool is_detached ) cdef extern from "ray/gcs/gcs_client.h" nogil: diff --git a/python/ray/tests/test_placement_group.py b/python/ray/tests/test_placement_group.py index 7c5963f9e8a1..87273a4998c9 100644 --- a/python/ray/tests/test_placement_group.py +++ b/python/ray/tests/test_placement_group.py @@ -1309,6 +1309,119 @@ def is_all_placement_group_removed(): wait_for_condition(is_all_placement_group_removed) + ray.shutdown() + + +def test_detached_placement_group(ray_start_cluster): + cluster = ray_start_cluster + for _ in range(2): + cluster.add_node(num_cpus=3) + cluster.wait_for_nodes() + info = ray.init(address=cluster.address) + + # Make sure detached placement group will alive when job dead. + driver_code = f""" +import ray + +ray.init(address="{info["redis_address"]}") + +pg = ray.util.placement_group( + [{{"CPU": 1}} for _ in range(2)], + strategy="STRICT_SPREAD", lifetime="detached") +ray.get(pg.ready()) + +@ray.remote(num_cpus=1) +class Actor: + def ready(self): + return True + +for bundle_index in range(2): + actor = Actor.options(lifetime="detached", placement_group=pg, + placement_group_bundle_index=bundle_index).remote() + ray.get(actor.ready.remote()) + +ray.shutdown() + """ + + run_string_as_driver(driver_code) + + # Wait until the driver is reported as dead by GCS. 
+ def is_job_done(): + jobs = ray.jobs() + for job in jobs: + if "StopTime" in job: + return True + return False + + def assert_alive_num_pg(expected_num_pg): + alive_num_pg = 0 + for _, placement_group_info in ray.util.placement_group_table().items( + ): + if placement_group_info["state"] == "CREATED": + alive_num_pg += 1 + return alive_num_pg == expected_num_pg + + def assert_alive_num_actor(expected_num_actor): + alive_num_actor = 0 + for actor_info in ray.actors().values(): + if actor_info["State"] == ray.gcs_utils.ActorTableData.ALIVE: + alive_num_actor += 1 + return alive_num_actor == expected_num_actor + + wait_for_condition(is_job_done) + + assert assert_alive_num_pg(1) + assert assert_alive_num_actor(2) + + # Make sure detached placement group will alive when its creator which + # is detached actor dead. + # Test actors first. + @ray.remote(num_cpus=1) + class NestedActor: + def ready(self): + return True + + @ray.remote(num_cpus=1) + class Actor: + def __init__(self): + self.actors = [] + + def ready(self): + return True + + def schedule_nested_actor_with_detached_pg(self): + # Create placement group which is detached. + pg = ray.util.placement_group( + [{ + "CPU": 1 + } for _ in range(2)], + strategy="STRICT_SPREAD", + lifetime="detached", + name="detached_pg") + ray.get(pg.ready()) + # Schedule nested actor with the placement group. + for bundle_index in range(2): + actor = NestedActor.options( + placement_group=pg, + placement_group_bundle_index=bundle_index, + lifetime="detached").remote() + ray.get(actor.ready.remote()) + self.actors.append(actor) + + a = Actor.options(lifetime="detached").remote() + ray.get(a.ready.remote()) + # 1 parent actor and 2 children actor. + ray.get(a.schedule_nested_actor_with_detached_pg.remote()) + + # Kill an actor and wait until it is killed. + ray.kill(a) + with pytest.raises(ray.exceptions.RayActorError): + ray.get(a.ready.remote()) + + # We should have 2 alive pgs and 4 alive actors. 
+ assert assert_alive_num_pg(2) + assert assert_alive_num_actor(4) + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/util/placement_group.py b/python/ray/util/placement_group.py index be24772ab518..6d15f607f22c 100644 --- a/python/ray/util/placement_group.py +++ b/python/ray/util/placement_group.py @@ -145,7 +145,8 @@ def _fill_bundle_cache_if_needed(self): def placement_group(bundles: List[Dict[str, float]], strategy: str = "PACK", - name: str = "unnamed_group") -> PlacementGroup: + name: str = "unnamed_group", + lifetime=None) -> PlacementGroup: """Asynchronously creates a PlacementGroup. Args: @@ -160,6 +161,10 @@ def placement_group(bundles: List[Dict[str, float]], - "STRICT_SPREAD": Packs Bundles across distinct nodes. name(str): The name of the placement group. + lifetime(str): Either `None`, which defaults to the placement group + will fate share with its creator and will be deleted once its + creator is dead, or "detached", which means the placement group + will live as a global object independent of the creator. Return: PlacementGroup: Placement group object. @@ -179,8 +184,16 @@ def placement_group(bundles: List[Dict[str, float]], "Bundles cannot be an empty dictionary or " f"resources with only 0 values. 
Bundles: {bundles}") + if lifetime is None: + detached = False + elif lifetime == "detached": + detached = True + else: + raise ValueError("placement group `lifetime` argument must be either" + " `None` or 'detached'") + placement_group_id = worker.core_worker.create_placement_group( - name, bundles, strategy) + name, bundles, strategy, detached) return PlacementGroup(placement_group_id) diff --git a/src/ray/common/placement_group.h b/src/ray/common/placement_group.h index a068ce4a1e51..532f69d74ef9 100644 --- a/src/ray/common/placement_group.h +++ b/src/ray/common/placement_group.h @@ -67,8 +67,9 @@ class PlacementGroupSpecBuilder { PlacementGroupSpecBuilder &SetPlacementGroupSpec( const PlacementGroupID &placement_group_id, std::string name, const std::vector> &bundles, - const rpc::PlacementStrategy strategy, const JobID &creator_job_id, - const ActorID &creator_actor_id, bool is_creator_detached_actor) { + const rpc::PlacementStrategy strategy, const bool is_detached, + const JobID &creator_job_id, const ActorID &creator_actor_id, + bool is_creator_detached_actor) { message_->set_placement_group_id(placement_group_id.Binary()); message_->set_name(name); message_->set_strategy(strategy); @@ -82,6 +83,7 @@ class PlacementGroupSpecBuilder { message_->set_creator_job_dead(is_creator_detached_actor); message_->set_creator_actor_id(creator_actor_id.Binary()); message_->set_creator_actor_dead(creator_actor_id.IsNil()); + message_->set_is_detached(is_detached); for (size_t i = 0; i < bundles.size(); i++) { auto resources = bundles[i]; diff --git a/src/ray/core_worker/common.h b/src/ray/core_worker/common.h index 1716fe606de9..bb10aff958ad 100644 --- a/src/ray/core_worker/common.h +++ b/src/ray/core_worker/common.h @@ -144,8 +144,11 @@ using PlacementStrategy = rpc::PlacementStrategy; struct PlacementGroupCreationOptions { PlacementGroupCreationOptions( std::string name, PlacementStrategy strategy, - std::vector> bundles) - : name(std::move(name)), strategy(strategy), 
bundles(std::move(bundles)) {} + std::vector> bundles, bool is_detached) + : name(std::move(name)), + strategy(strategy), + bundles(std::move(bundles)), + is_detached(is_detached) {} /// The name of the placement group. const std::string name; @@ -153,6 +156,8 @@ struct PlacementGroupCreationOptions { const PlacementStrategy strategy = rpc::PACK; /// The resource bundles in this placement group. const std::vector> bundles; + /// Whether to keep the placement group persistent after its creator dead. + const bool is_detached = false; }; } // namespace ray diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index f7e473eca5a2..2f5dcc57efc1 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -1463,8 +1463,8 @@ Status CoreWorker::CreatePlacementGroup( builder.SetPlacementGroupSpec( placement_group_id, placement_group_creation_options.name, placement_group_creation_options.bundles, placement_group_creation_options.strategy, - worker_context_.GetCurrentJobID(), worker_context_.GetCurrentActorID(), - worker_context_.CurrentActorDetached()); + placement_group_creation_options.is_detached, worker_context_.GetCurrentJobID(), + worker_context_.GetCurrentActorID(), worker_context_.CurrentActorDetached()); PlacementGroupSpecification placement_group_spec = builder.Build(); *return_placement_group_id = placement_group_id; RAY_LOG(INFO) << "Submitting Placement Group creation to GCS: " << placement_group_id; diff --git a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc index 5470f70fb395..cd374b76a272 100644 --- a/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc +++ b/src/ray/core_worker/lib/java/io_ray_runtime_task_NativeTaskSubmitter.cc @@ -201,7 +201,8 @@ inline ray::PlacementGroupCreationOptions ToPlacementGroupCreationOptions( }); }); return 
ray::PlacementGroupCreationOptions(JavaStringToNativeString(env, name), - ConvertStrategy(java_strategy), bundles); + ConvertStrategy(java_strategy), bundles, + /*is_detached=*/false); } #ifdef __cplusplus diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc index b56f6b1d3b81..a856002b6465 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.cc @@ -96,11 +96,15 @@ void GcsPlacementGroup::MarkCreatorActorDead() { placement_group_table_data_.set_creator_actor_dead(true); } -bool GcsPlacementGroup::IsPlacementGroupRemovable() const { - return placement_group_table_data_.creator_job_dead() && +bool GcsPlacementGroup::IsPlacementGroupLifetimeDone() const { + return !IsDetached() && placement_group_table_data_.creator_job_dead() && placement_group_table_data_.creator_actor_dead(); } +bool GcsPlacementGroup::IsDetached() const { + return placement_group_table_data_.is_detached(); +} + ///////////////////////////////////////////////////////////////////////////////////////// GcsPlacementGroupManager::GcsPlacementGroupManager( @@ -495,7 +499,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenJobDead( continue; } placement_group->MarkCreatorJobDead(); - if (placement_group->IsPlacementGroupRemovable()) { + if (placement_group->IsPlacementGroupLifetimeDone()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } @@ -509,7 +513,7 @@ void GcsPlacementGroupManager::CleanPlacementGroupIfNeededWhenActorDead( continue; } placement_group->MarkCreatorActorDead(); - if (placement_group->IsPlacementGroupRemovable()) { + if (placement_group->IsPlacementGroupLifetimeDone()) { RemovePlacementGroup(placement_group->GetPlacementGroupID(), [](Status status) {}); } } diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h index 
c76849108990..28ce82090077 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_manager.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_manager.h @@ -61,6 +61,7 @@ class GcsPlacementGroup { placement_group_spec.creator_job_dead()); placement_group_table_data_.set_creator_actor_dead( placement_group_spec.creator_actor_dead()); + placement_group_table_data_.set_is_detached(placement_group_spec.is_detached()); } /// Get the immutable PlacementGroupTableData of this placement group. @@ -107,8 +108,11 @@ class GcsPlacementGroup { /// Mark that the creator actor of this placement group is dead. void MarkCreatorActorDead(); - /// Return True if the placement group is removable. False otherwise. - bool IsPlacementGroupRemovable() const; + /// Return True if the placement group lifetime is done. False otherwise. + bool IsPlacementGroupLifetimeDone() const; + + /// Returns whether or not this is a detached placement group. + bool IsDetached() const; private: /// The placement_group meta data which contains the task specification as well as the diff --git a/src/ray/gcs/test/gcs_test_util.h b/src/ray/gcs/test/gcs_test_util.h index bf908c3a278f..4d51fdd866f6 100644 --- a/src/ray/gcs/test/gcs_test_util.h +++ b/src/ray/gcs/test/gcs_test_util.h @@ -101,8 +101,9 @@ struct Mocker { PlacementGroupSpecBuilder builder; auto placement_group_id = PlacementGroupID::FromRandom(); - builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, job_id, - actor_id, /* is_creator_detached */ false); + builder.SetPlacementGroupSpec(placement_group_id, name, bundles, strategy, + /* is_detached */ false, job_id, actor_id, + /* is_creator_detached */ false); return builder.Build(); } diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index cc3149e84f46..844f44bea723 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -233,6 +233,8 @@ message PlacementGroupSpec { bool creator_job_dead = 7; // Whether or not if the creator 
actor is dead. bool creator_actor_dead = 8; + // Whether the placement group is persistent. + bool is_detached = 9; } message ObjectReference { diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index 1e59ae8123ca..902c29cb7f58 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -191,6 +191,8 @@ message PlacementGroupTableData { bool creator_job_dead = 8; // Whether or not if the creator actor is dead. bool creator_actor_dead = 9; + // Whether the placement group is persistent. + bool is_detached = 10; } message ScheduleData { From 2664a2a8f699fcf53c13239cc7f5bc1db1fc9351 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Wed, 27 Jan 2021 16:42:44 +0100 Subject: [PATCH 065/245] [tune] fix non-deterministic category sampling by switching back to `np.random.choice` (#13710) * Enable zoopt tests again, but wait for next release * Add test and preserve state in trial executor * Add baseline check with integers * [tune] fix non-deterministic category sampling, re-enable zoopt tests * Remove random import * Disable zoopt tests --- python/ray/tune/ray_trial_executor.py | 1 + python/ray/tune/sample.py | 5 ++-- python/ray/tune/suggest/zoopt.py | 6 ++--- python/ray/tune/tests/test_sample.py | 34 ++++++++++++++++++++++++--- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py index a1fd4a8f3d06..26480118c2b0 100644 --- a/python/ray/tune/ray_trial_executor.py +++ b/python/ray/tune/ray_trial_executor.py @@ -573,6 +573,7 @@ def get_next_available_trial(self, timeout: Optional[float] = None): return None shuffled_results = list(self._running.keys()) random.shuffle(shuffled_results) + # Note: We shuffle the results because `ray.wait` by default returns # the first available result, and we want to guarantee that slower # trials (i.e. trials that run remotely) also get fairly reported. 
diff --git a/python/ray/tune/sample.py b/python/ray/tune/sample.py index e4d349ee9db1..3be1b61e0c68 100644 --- a/python/ray/tune/sample.py +++ b/python/ray/tune/sample.py @@ -1,5 +1,4 @@ import logging -import random from copy import copy from inspect import signature from math import isclose @@ -295,7 +294,7 @@ def sample(self, spec: Optional[Union[List[Dict], Dict]] = None, size: int = 1): - items = random.choices(domain.categories, k=size) + items = np.random.choice(domain.categories, size=size).tolist() return items if len(items) > 1 else domain.cast(items[0]) default_sampler_cls = _Uniform @@ -471,7 +470,7 @@ def choice(categories: List): """Sample a categorical value. Sampling from ``tune.choice([1, 2])`` is equivalent to sampling from - ``random.choice([1, 2])`` + ``np.random.choice([1, 2])`` """ return Categorical(categories).uniform() diff --git a/python/ray/tune/suggest/zoopt.py b/python/ray/tune/suggest/zoopt.py index c0c0ddb18562..71cedffd5500 100644 --- a/python/ray/tune/suggest/zoopt.py +++ b/python/ray/tune/suggest/zoopt.py @@ -198,8 +198,8 @@ def _setup_zoopt(self): init_samples = None if self._points_to_evaluate: - logger.warning( - "`points_to_evaluate` seems to be ignored by ZOOpt.") + logger.warning("`points_to_evaluate` is ignored by ZOOpt in " + "versions <= 0.4.1.") init_samples = [ Solution(x=tuple(point[dim] for dim in self._dim_keys)) for point in self._points_to_evaluate @@ -213,8 +213,6 @@ def _setup_zoopt(self): parameter=par, parallel_num=self.parallel_num, **self.kwargs) - if init_samples: - self.optimizer.init_attribute() def set_search_properties(self, metric: Optional[str], mode: Optional[str], config: Dict) -> bool: diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index 378a2c1ef565..0b752e1be207 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -193,6 +193,32 @@ def testQuantized(self): samples = tune.sample.Float(0, 
33).quantized(3).sample(size=1000) self.assertTrue(all(0 <= s <= 33 for s in samples)) + def testCategoricalSeedInTrainingLoop(self): + def train(config): + return 0 + + config = { + "integer": tune.randint(0, 100_000), + "choice": tune.choice(list(range(100_000))) + } + + np.random.seed(1000) + + out_1 = tune.run(train, config=config, num_samples=8, verbose=0) + + integers_1 = [t.config["integer"] for t in out_1.trials] + choices_1 = [t.config["choice"] for t in out_1.trials] + + np.random.seed(1000) + + out_2 = tune.run(train, config=config, num_samples=8, verbose=0) + + integers_2 = [t.config["integer"] for t in out_2.trials] + choices_2 = [t.config["choice"] for t in out_2.trials] + + self.assertSequenceEqual(integers_1, integers_2) + self.assertSequenceEqual(choices_1, choices_2) + def testConvertAx(self): from ray.tune.suggest.ax import AxSearch from ax.service.ax_client import AxClient @@ -952,9 +978,11 @@ def testPointsToEvaluateSkOpt(self): return self._testPointsToEvaluate(SkOptSearch, config) def testPointsToEvaluateZoOpt(self): - # https://github.com/polixir/ZOOpt/issues/5 - self.skipTest("ZoOpt currently ignores initial points. This test " - "will be enabled after this has been fixed.") + self.skipTest( + "ZOOpt's latest release (0.4.1) does not support sampling " + "initial points. 
Please re-enable this test after the next " + "release.") + config = { "metric": tune.sample.Categorical([1, 2, 3, 4]).uniform(), "a": tune.sample.Categorical(["t1", "t2", "t3", "t4"]).uniform(), From c5b645e3da9939197d68a7ad4332d2851c023e82 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Wed, 27 Jan 2021 16:43:50 +0100 Subject: [PATCH 066/245] [tune] add type hints to tune.run(), fix abstract methods of ProgressReporter (#13684) --- python/ray/tune/progress_reporter.py | 7 ++ python/ray/tune/tune.py | 145 +++++++++++++++------------ 2 files changed, 86 insertions(+), 66 deletions(-) diff --git a/python/ray/tune/progress_reporter.py b/python/ray/tune/progress_reporter.py index a71a2da546a8..a462f8e51ef3 100644 --- a/python/ray/tune/progress_reporter.py +++ b/python/ray/tune/progress_reporter.py @@ -57,6 +57,13 @@ def report(self, trials: List[Trial], done: bool, *sys_info: Dict): """ raise NotImplementedError + def set_search_properties(self, metric: Optional[str], + mode: Optional[str]): + return True + + def set_total_samples(self, total_samples: int): + pass + class TuneReporterBase(ProgressReporter): """Abstract base class for the default Tune reporters. 
diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index fab7b79bf5e5..009335c6073f 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -1,25 +1,35 @@ +from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Type, \ + Union + +import datetime import logging import sys import time -from ray.tune.error import TuneError -from ray.tune.experiment import convert_to_experiment_list, Experiment from ray.tune.analysis import ExperimentAnalysis -from ray.tune.suggest import BasicVariantGenerator, SearchGenerator +from ray.tune.callback import Callback +from ray.tune.error import TuneError +from ray.tune.experiment import Experiment, convert_to_experiment_list +from ray.tune.logger import Logger +from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter, \ + ProgressReporter +from ray.tune.ray_trial_executor import RayTrialExecutor +from ray.tune.registry import get_trainable_cls +from ray.tune.stopper import Stopper +from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm, \ + SearchGenerator from ray.tune.suggest.suggestion import Searcher from ray.tune.suggest.variant_generator import has_unresolved_values -from ray.tune.trial import Trial +from ray.tune.syncer import SyncConfig, set_sync_periods, wait_for_sync from ray.tune.trainable import Trainable -from ray.tune.ray_trial_executor import RayTrialExecutor -from ray.tune.utils.callback import create_default_callbacks -from ray.tune.registry import get_trainable_cls -from ray.tune.syncer import wait_for_sync, set_sync_periods, \ - SyncConfig +from ray.tune.trial import Trial from ray.tune.trial_runner import TrialRunner -from ray.tune.progress_reporter import CLIReporter, JupyterNotebookReporter -from ray.tune.schedulers import FIFOScheduler +from ray.tune.utils.callback import create_default_callbacks from ray.tune.utils.log import Verbosity, has_verbosity, set_verbosity +# Must come last to avoid circular imports +from ray.tune.schedulers import 
FIFOScheduler, TrialScheduler + logger = logging.getLogger(__name__) try: @@ -55,50 +65,51 @@ def _report_progress(runner, reporter, done=False): def run( - run_or_experiment, - name=None, - metric=None, - mode=None, - stop=None, - time_budget_s=None, - config=None, - resources_per_trial=None, - num_samples=1, - local_dir=None, - search_alg=None, - scheduler=None, - keep_checkpoints_num=None, - checkpoint_score_attr=None, - checkpoint_freq=0, - checkpoint_at_end=False, - verbose=Verbosity.V3_TRIAL_DETAILS, - progress_reporter=None, - log_to_file=False, - trial_name_creator=None, - trial_dirname_creator=None, - sync_config=None, - export_formats=None, - max_failures=0, - fail_fast=False, - restore=None, - server_port=None, - resume=False, - queue_trials=False, - reuse_actors=False, - trial_executor=None, - raise_on_failed_trial=True, - callbacks=None, + run_or_experiment: Union[str, Callable, Type], + name: Optional[str] = None, + metric: Optional[str] = None, + mode: Optional[str] = None, + stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], + bool]] = None, + time_budget_s: Union[None, int, float, datetime.timedelta] = None, + config: Optional[Dict[str, Any]] = None, + resources_per_trial: Optional[Mapping[str, Union[float, int]]] = None, + num_samples: int = 1, + local_dir: Optional[str] = None, + search_alg: Optional[Union[Searcher, SearchAlgorithm]] = None, + scheduler: Optional[TrialScheduler] = None, + keep_checkpoints_num: Optional[int] = None, + checkpoint_score_attr: Optional[str] = None, + checkpoint_freq: int = 0, + checkpoint_at_end: bool = False, + verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, + progress_reporter: Optional[ProgressReporter] = None, + log_to_file: bool = False, + trial_name_creator: Optional[Callable[[Trial], str]] = None, + trial_dirname_creator: Optional[Callable[[Trial], str]] = None, + sync_config: Optional[SyncConfig] = None, + export_formats: Optional[Sequence] = None, + max_failures: int = 0, + fail_fast: 
bool = False, + restore: Optional[str] = None, + server_port: Optional[int] = None, + resume: bool = False, + queue_trials: bool = False, + reuse_actors: bool = False, + trial_executor: Optional[RayTrialExecutor] = None, + raise_on_failed_trial: bool = True, + callbacks: Optional[Sequence[Callback]] = None, # Deprecated args - loggers=None, - ray_auto_init=None, - run_errored_only=None, - global_checkpoint_period=None, - with_server=None, - upload_dir=None, - sync_to_cloud=None, - sync_to_driver=None, - sync_on_checkpoint=None, -): + loggers: Optional[Sequence[Type[Logger]]] = None, + ray_auto_init: Optional = None, + run_errored_only: Optional = None, + global_checkpoint_period: Optional = None, + with_server: Optional = None, + upload_dir: Optional = None, + sync_to_cloud: Optional = None, + sync_to_driver: Optional = None, + sync_on_checkpoint: Optional = None, +) -> ExperimentAnalysis: """Executes training. Examples: @@ -458,18 +469,20 @@ def run( default_mode=mode) -def run_experiments(experiments, - scheduler=None, - server_port=None, - verbose=Verbosity.V3_TRIAL_DETAILS, - progress_reporter=None, - resume=False, - queue_trials=False, - reuse_actors=False, - trial_executor=None, - raise_on_failed_trial=True, - concurrent=True, - callbacks=None): +def run_experiments( + experiments: Union[Experiment, Mapping, Sequence[Union[Experiment, + Mapping]]], + scheduler: Optional[TrialScheduler] = None, + server_port: Optional[int] = None, + verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS, + progress_reporter: Optional[ProgressReporter] = None, + resume: bool = False, + queue_trials: bool = False, + reuse_actors: bool = False, + trial_executor: Optional[RayTrialExecutor] = None, + raise_on_failed_trial: bool = True, + concurrent: bool = True, + callbacks: Optional[Sequence[Callback]] = None): """Runs and blocks until all trials finish. 
Examples: From 2d34e95c933e90cdfe07384c1c892c52b29fcee4 Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Wed, 27 Jan 2021 10:19:58 -0700 Subject: [PATCH 067/245] Don't gather check_parent_task on Windows, since it's undefined. (#13700) --- dashboard/agent.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dashboard/agent.py b/dashboard/agent.py index 7bf5e1551a2b..a1afb5f77f2a 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ -185,8 +185,11 @@ async def _check_parent(): agent_port=self.grpc_port, agent_ip_address=self.ip)) - await asyncio.gather(check_parent_task, - *(m.run(self.server) for m in modules)) + tasks = [m.run(self.server) for m in modules] + if sys.platform not in ["win32", "cygwin"]: + tasks.append(check_parent_task) + await asyncio.gather(*tasks) + await self.server.wait_for_termination() # Wait for finish signal. await runner.cleanup() From 06fac785b89239dde039c310db2ee171f44aa776 Mon Sep 17 00:00:00 2001 From: Edward Oakes Date: Wed, 27 Jan 2021 14:05:37 -0600 Subject: [PATCH 068/245] [serve] Fix whacky worker replica failure test (#13696) --- python/ray/serve/tests/test_failure.py | 49 +++++++++++++------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py index de7003c39f8f..3cba01ffb3ba 100644 --- a/python/ray/serve/tests/test_failure.py +++ b/python/ray/serve/tests/test_failure.py @@ -1,13 +1,11 @@ import os import requests import sys -import tempfile import time import pytest import ray from ray.test_utils import wait_for_condition -from ray import serve from ray.serve.config import BackendConfig, ReplicaConfig @@ -160,34 +158,30 @@ def __call__(self, *args): def test_worker_replica_failure(serve_instance): client = serve_instance + @ray.remote + class Counter: + def __init__(self): + self.count = 0 + + def inc_and_get(self): + self.count += 1 + return self.count + class Worker: # Assumes that two replicas are 
started. Will hang forever in the # constructor for any workers that are restarted. - def __init__(self, path): + def __init__(self, counter): self.should_hang = False - if not os.path.exists(path): - with open(path, "w") as f: - f.write("1") - else: - with open(path, "r") as f: - num = int(f.read()) - - with open(path, "w") as f: - if num == 2: - self.should_hang = True - else: - f.write(str(num + 1)) - - if self.should_hang: + self.index = ray.get(counter.inc_and_get.remote()) + if self.index > 2: while True: pass def __call__(self, *args): - pass + return self.index - temp_path = os.path.join(tempfile.gettempdir(), - serve.utils.get_random_letters()) - client.create_backend("replica_failure", Worker, temp_path) + counter = Counter.remote() + client.create_backend("replica_failure", Worker, counter) client.update_backend_config( "replica_failure", BackendConfig(num_replicas=2)) client.create_endpoint( @@ -195,9 +189,16 @@ def __call__(self, *args): # Wait until both replicas have been started. responses = set() - while len(responses) == 1: - responses.add(request_with_retries("/replica_failure", timeout=1).text) + start = time.time() + while time.time() - start < 30: time.sleep(0.1) + response = request_with_retries("/replica_failure", timeout=1).text + assert response in ["1", "2"] + responses.add(response) + if len(responses) > 1: + break + else: + raise TimeoutError("Timed out waiting for replicas after 30s.") # Kill one of the replicas. 
handles = _get_worker_handles(client, "replica_failure") @@ -263,6 +264,4 @@ def f(_): if __name__ == "__main__": - import sys - import pytest sys.exit(pytest.main(["-v", "-s", __file__])) From 202fbdf38c48f7db54994e7143232a75490c9fdb Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Wed, 27 Jan 2021 12:11:31 -0800 Subject: [PATCH 069/245] [Serve] Fix ServeHandle serialization (#13695) --- python/ray/serve/api.py | 7 +++++ python/ray/serve/handle.py | 25 ++++++++++----- python/ray/serve/tests/test_handle.py | 44 ++++++++++++++++++++++++++- 3 files changed, 68 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index b42cd78464a7..19783dc3700b 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,6 +66,8 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): + self.controller_handle = controller_handle + self.sync = sync self.router = Router(controller_handle) if sync: @@ -92,6 +94,11 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro + def __reduce__(self): + deserializer = ThreadProxiedRouter + serialized_data = (self.controller_handle, self.sync) + return deserializer, serialized_data + class Client: def __init__(self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index c6951c6380b9..4ee2624a8d31 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,8 +4,6 @@ from typing import Any, Dict, Optional, Union from enum import Enum -from ray.serve.router import Router - @dataclass(frozen=True) class HandleOptions: @@ -40,10 +38,11 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__(self, - router: Router, - endpoint_name, - handle_options: Optional[HandleOptions] = None): + def __init__( + self, + router, # ThreadProxiedRouter + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = 
endpoint_name self.handle_options = handle_options or HandleOptions() @@ -78,7 +77,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchrounous request to the endpoint. + """Issue an asynchronous request to the endpoint. Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. @@ -98,6 +97,12 @@ async def remote(self, def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" + def __reduce__(self): + deserializer = RayServeHandle + serialized_data = (self.router, self.endpoint_name, + self.handle_options) + return deserializer, serialized_data + class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -123,3 +128,9 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() + + def __reduce__(self): + deserializer = RayServeSyncHandle + serialized_data = (self.router, self.endpoint_name, + self.handle_options) + return deserializer, serialized_data diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index c17db7686aad..88ab9d2c2b7a 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,9 +1,51 @@ import requests - +import pytest import ray from ray import serve +@pytest.mark.asyncio +async def test_async_handle_serializable(serve_instance): + client = serve_instance + + def f(_): + return "hello" + + client.create_backend("f", f) + client.create_endpoint("f", backend="f") + + @ray.remote + class TaskActor: + async def task(self, handle): + ref = await handle.remote() + output = await ref + return output + + handle = client.get_handle("f", sync=False) + + task_actor = TaskActor.remote() + result = await 
task_actor.task.remote(handle) + assert result == "hello" + + +def test_sync_handle_serializable(serve_instance): + client = serve_instance + + def f(_): + return "hello" + + client.create_backend("f", f) + client.create_endpoint("f", backend="f") + + @ray.remote + def task(handle): + return ray.get(handle.remote()) + + handle = client.get_handle("f", sync=True) + result_ref = task.remote(handle) + assert ray.get(result_ref) == "hello" + + def test_handle_in_endpoint(serve_instance): client = serve_instance From eba698d48ed531c2144ab5cb158afce7e4fdc702 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 27 Jan 2021 13:10:45 -0800 Subject: [PATCH 070/245] Remove docs for install-nightly (#13744) --- doc/source/installation.rst | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/doc/source/installation.rst b/doc/source/installation.rst index 397113d95c04..049d3ed28038 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -24,22 +24,7 @@ You can install the latest official version of Ray as follows. Official releases Daily Releases (Nightlies) -------------------------- -You can install the latest Ray wheels via the following command. These daily releases are tested via automated tests but do not go through the full release process: - -.. code-block:: bash - - pip install -U ray - ray install-nightly - - -.. note:: ``ray install-nightly`` may not capture updated library dependencies. After running ``ray install-nightly``, consider running ``pip install ray[]`` *without upgrading (via -U)* to update dependencies. - - -.. note:: If you're currently on ``ray<=1.0.1.post1``, ``ray install-nightly`` will not install the most recent nightly wheels. Please use the links below instead. - -Alternatively, here are the links to the latest wheels (which are built for each commit on the -master branch). 
To install these wheels, use the following ``pip`` command and wheels -instead of the ones above: +You can install the nightly Ray wheels via the following links. These daily releases are tested via automated tests but do not go through the full release process. To install these wheels, use the following ``pip`` command and wheels: .. code-block:: bash From b4bcb9b60a0861753ebf78c90236807a2280fd70 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Wed, 27 Jan 2021 13:45:30 -0800 Subject: [PATCH 071/245] [Docker] Use Cuda 11 (#13691) --- build-docker.sh | 2 +- ci/travis/build-docker-images.py | 2 +- ci/travis/build-docker-images.sh | 2 +- docker/base-deps/Dockerfile | 2 +- python/requirements_ml_docker.txt | 7 +++++-- release/rllib_tests/unit_gpu_tests/requirements.txt | 6 ++++-- 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/build-docker.sh b/build-docker.sh index 3a09b4896010..b39336186caf 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -16,7 +16,7 @@ key="$1" case $key in --gpu) GPU="-gpu" - BASE_IMAGE="nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" + BASE_IMAGE="nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" ;; --no-cache-build) NO_CACHE="--no-cache" diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index a2ae7a18d13c..c549bc95e60a 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -84,7 +84,7 @@ def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: build_args = {} if image_name == "base-deps": build_args["BASE_IMAGE"] = ( - "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04" + "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" if gpu == "-gpu" else "ubuntu:focal") else: build_args["GPU"] = gpu diff --git a/ci/travis/build-docker-images.sh b/ci/travis/build-docker-images.sh index c894da23a662..6463c880f649 100755 --- a/ci/travis/build-docker-images.sh +++ b/ci/travis/build-docker-images.sh @@ -22,7 +22,7 @@ build_and_push_tags() { # $2 tag for image (e.g. 
hash of commit) for GPU in "" "-gpu" do - BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) + BASE_IMAGE=$(if [ "$GPU" ]; then echo "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04"; else echo "ubuntu:focal"; fi;) FULL_NAME_WITH_TAG="rayproject/$1:$2$GPU" NIGHTLY_FULL_NAME_WITH_TAG="rayproject/$1:nightly$GPU" docker build --no-cache --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH=".whl/$WHEEL" --label "SHA=$2" -t "$FULL_NAME_WITH_TAG" /"$ROOT_DIR"/docker/"$1" diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index a5bcfedbf6be..3aec50c99f80 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -1,6 +1,6 @@ # The base-deps Docker image installs main libraries needed to run Ray -# The GPU option is nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04 +# The GPU option is nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 ARG BASE_IMAGE="ubuntu:focal" FROM ${BASE_IMAGE} # If this arg is not "autoscaler" then no autoscaler requirements will be included diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index 6f610c46862e..c61ba0c055f6 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -1,3 +1,6 @@ ipython -tensorflow-gpu -torch \ No newline at end of file +tensorflow-gpu>=2.4.0 +-f https://download.pytorch.org/whl/torch_stable.html +torch==1.7.1+cu110 +-f https://download.pytorch.org/whl/torch_stable.html +torchvision==0.8.2+cu110 \ No newline at end of file diff --git a/release/rllib_tests/unit_gpu_tests/requirements.txt b/release/rllib_tests/unit_gpu_tests/requirements.txt index 4f88975397f9..b8a991f74f34 100644 --- a/release/rllib_tests/unit_gpu_tests/requirements.txt +++ b/release/rllib_tests/unit_gpu_tests/requirements.txt @@ -1,7 +1,9 @@ ray[rllib] ray -torch==1.6+cu101 -torchvision==0.7.0+cu101 +-f https://download.pytorch.org/whl/torch_stable.html +torch==1.7.1+cu110 
+-f https://download.pytorch.org/whl/torch_stable.html +torchvision==0.8.2+cu110 boto3==1.4.8 cython==0.29.0 pytest From c5209e2dab28783c2bf017b45fbc588eb4f12c2d Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Wed, 27 Jan 2021 13:46:07 -0800 Subject: [PATCH 072/245] [Docker] default to /home/ray (#13738) --- docker/base-deps/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile index 3aec50c99f80..278fad1ec73d 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -69,3 +69,5 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ fi;) \ && sudo rm -rf /var/lib/apt/lists/* \ && sudo apt-get clean + +WORKDIR $HOME \ No newline at end of file From 56a9523020aa4612a72fe56565869b126bc018cf Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 27 Jan 2021 14:02:22 -0800 Subject: [PATCH 073/245] Fix high CPU usage in object manager due to O(n^2) iteration over active pulls list (#13724) --- src/ray/object_manager/pull_manager.cc | 29 ++++++++++++++------------ src/ray/object_manager/pull_manager.h | 6 ++++-- src/ray/raylet/node_manager.cc | 10 +++++---- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index 302f2f4354ef..f4920a8def92 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -51,7 +51,8 @@ uint64_t PullManager::Pull(const std::vector &object_ref_b bool PullManager::ActivateNextPullBundleRequest( const std::map>::iterator - &next_request_it) { + &next_request_it, + std::vector *objects_to_pull) { // Check that we have sizes for all of the objects in the bundle. If not, we // should not activate the bundle, since it may put us over the available // capacity. 
@@ -81,6 +82,7 @@ bool PullManager::ActivateNextPullBundleRequest( auto it = object_pull_requests_.find(obj_id); RAY_CHECK(it != object_pull_requests_.end()); num_bytes_being_pulled_ += it->second.object_size; + objects_to_pull->push_back(obj_id); } } @@ -91,7 +93,8 @@ bool PullManager::ActivateNextPullBundleRequest( } void PullManager::DeactivatePullBundleRequest( - const std::map>::iterator &request_it) { + const std::map>::iterator &request_it, + std::unordered_set *objects_to_cancel) { for (const auto &ref : request_it->second) { auto obj_id = ObjectRefToId(ref); RAY_CHECK(active_object_pull_requests_[obj_id].erase(request_it->first)); @@ -101,6 +104,10 @@ void PullManager::DeactivatePullBundleRequest( RAY_CHECK(it != object_pull_requests_.end()); num_bytes_being_pulled_ -= it->second.object_size; active_object_pull_requests_.erase(obj_id); + + if (objects_to_cancel) { + objects_to_cancel->insert(obj_id); + } } } @@ -120,10 +127,9 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) RAY_LOG(DEBUG) << "Updating pulls based on available memory: " << num_bytes_available; } num_bytes_available_ = num_bytes_available; - uint64_t prev_highest_req_id_being_pulled = highest_req_id_being_pulled_; - std::unordered_set object_ids_to_pull; // While there is available capacity, activate the next pull request. + std::vector objects_to_pull; while (num_bytes_being_pulled_ < num_bytes_available_) { // Get the next pull request in the queue. const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); @@ -145,7 +151,7 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) << " num bytes available: " << num_bytes_available_; // There is another pull bundle request that we could try, and there is // enough space. Activate the next pull bundle request in the queue. 
- if (!ActivateNextPullBundleRequest(next_request_it)) { + if (!ActivateNextPullBundleRequest(next_request_it, &objects_to_pull)) { // This pull bundle request could not be activated, due to lack of object // size information. Wait until we have object size information before // activating this pull bundle. @@ -162,18 +168,15 @@ void PullManager::UpdatePullsBasedOnAvailableMemory(size_t num_bytes_available) << " num bytes available: " << num_bytes_available_; const auto last_request_it = pull_request_bundles_.find(highest_req_id_being_pulled_); RAY_CHECK(last_request_it != pull_request_bundles_.end()); - DeactivatePullBundleRequest(last_request_it); + DeactivatePullBundleRequest(last_request_it, &object_ids_to_cancel); } TriggerOutOfMemoryHandlingIfNeeded(); - if (highest_req_id_being_pulled_ > prev_highest_req_id_being_pulled) { - // There are newly activated requests. Start pulling objects for the newly - // activated requests. - // NOTE(swang): We could also just wait for the next timer tick to pull the - // objects, but this would add a delay of up to one tick for any bundles of - // multiple objects, even when we are not under memory pressure. - Tick(); + for (const auto &obj_id : objects_to_pull) { + if (object_ids_to_cancel.count(obj_id) == 0) { + TryToMakeObjectLocal(obj_id); + } } } diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 26eba1a35264..3a542fef7af2 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -146,12 +146,14 @@ class PullManager { /// any objects in the request that are not already being pulled. bool ActivateNextPullBundleRequest( const std::map>::iterator - &next_request_it); + &next_request_it, + std::vector *objects_to_pull); /// Deactivate a pull request in the queue. This cancels any pull or restore /// operations for the object. 
void DeactivatePullBundleRequest( - const std::map>::iterator &request_it); + const std::map>::iterator &request_it, + std::unordered_set *objects_to_cancel = nullptr); /// Trigger out-of-memory handling if the first request in the queue needs /// more space than the bytes available. This is needed to make room for the diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 072064f4695a..e1ac5eb670bb 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2509,14 +2509,16 @@ rpc::ObjectStoreStats AccumulateStoreStats( rpc::ObjectStoreStats store_stats; for (const auto &reply : node_stats) { auto cur_store = reply.store_stats(); - store_stats.set_spill_time_total_s(store_stats.spill_time_total_s() + - cur_store.spill_time_total_s()); + // Use max aggregation for time, since the nodes are spilling concurrently. + store_stats.set_spill_time_total_s( + std::max(store_stats.spill_time_total_s(), cur_store.spill_time_total_s())); + store_stats.set_restore_time_total_s( + std::max(store_stats.restore_time_total_s(), cur_store.restore_time_total_s())); + // Use sum aggregation for the rest of the metrics. 
store_stats.set_spilled_bytes_total(store_stats.spilled_bytes_total() + cur_store.spilled_bytes_total()); store_stats.set_spilled_objects_total(store_stats.spilled_objects_total() + cur_store.spilled_objects_total()); - store_stats.set_restore_time_total_s(store_stats.restore_time_total_s() + - cur_store.restore_time_total_s()); store_stats.set_restored_bytes_total(store_stats.restored_bytes_total() + cur_store.restored_bytes_total()); store_stats.set_restored_objects_total(store_stats.restored_objects_total() + From 3644df415a1fc6bb34b532067f9c676985c726ee Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 27 Jan 2021 14:18:06 -0800 Subject: [PATCH 074/245] [CI] Add retry to java doc test (#13743) --- java/test.sh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/java/test.sh b/java/test.sh index 49a0d68bbdc5..86afc719b5b0 100755 --- a/java/test.sh +++ b/java/test.sh @@ -71,15 +71,18 @@ RAY_BACKEND_LOG_LEVEL=debug java -cp bazel-bin/java/all_tests_deploy.jar -Dray.a -Dray.redis.password='123456' -Dray.job.code-search-path="$PWD/bazel-bin/java/all_tests_deploy.jar" io.ray.test.MultiDriverTest ray stop -echo "Running documentation demo code." -docdemo_path="java/test/src/main/java/io/ray/docdemo/" -for file in "$docdemo_path"*.java; do - file=${file#"$docdemo_path"} - class=${file%".java"} - echo "Running $class" - java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" -done -popd +# See issue #13742 the test is very flaky. +# Skipping the doc test for now. + +# echo "Running documentation demo code." +# docdemo_path="java/test/src/main/java/io/ray/docdemo/" +# for file in "$docdemo_path"*.java; do +# file=${file#"$docdemo_path"} +# class=${file%".java"} +# echo "Running $class" +# java -cp bazel-bin/java/all_tests_deploy.jar "io.ray.docdemo.$class" +# done +# popd pushd "$ROOT_DIR" echo "Testing maven install." 
From c0fe8164667ea47ac284489e5f696c0bf5ece4e6 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Wed, 27 Jan 2021 15:30:58 -0800 Subject: [PATCH 075/245] [Core/Autoscaler] Properly clean up resource backlog from (#13727) --- .../raylet/scheduling/cluster_task_manager.cc | 21 ++++-- .../scheduling/cluster_task_manager_test.cc | 71 ++++++++++++------- 2 files changed, 62 insertions(+), 30 deletions(-) diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index a395e51b5077..43c6ce1cc78a 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -618,12 +618,21 @@ bool ClusterTaskManager::AnyPendingTasks(Task *exemplar, bool *any_pending, std::string ClusterTaskManager::DebugStr() const { // TODO(Shanly): This method will be replaced with `DebugString` once we remove the // legacy scheduler. + auto accumulator = [](int state, const std::pair> &pair) { + return state + pair.second.size(); + }; + int num_infeasible_tasks = + std::accumulate(infeasible_tasks_.begin(), infeasible_tasks_.end(), 0, accumulator); + int num_tasks_to_schedule = std::accumulate(tasks_to_schedule_.begin(), + tasks_to_schedule_.end(), 0, accumulator); + int num_tasks_to_dispatch = std::accumulate(tasks_to_dispatch_.begin(), + tasks_to_dispatch_.end(), 0, accumulator); std::stringstream buffer; buffer << "========== Node: " << self_node_id_ << " =================\n"; - buffer << "Schedule queue length: " << tasks_to_schedule_.size() << "\n"; - buffer << "Dispatch queue length: " << tasks_to_dispatch_.size() << "\n"; + buffer << "Infeasible queue length: " << num_infeasible_tasks << "\n"; + buffer << "Schedule queue length: " << num_tasks_to_schedule << "\n"; + buffer << "Dispatch queue length: " << num_tasks_to_dispatch << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_.size() << "\n"; - buffer << "infeasible queue length size: " << infeasible_tasks_.size() << "\n"; buffer 
<< "cluster_resource_scheduler state: " << cluster_resource_scheduler_->DebugString() << "\n"; buffer << "=================================================="; @@ -673,7 +682,6 @@ void ClusterTaskManager::Dispatch( const Task &task, rpc::RequestWorkerLeaseReply *reply, std::function send_reply_callback) { const auto &task_spec = task.GetTaskSpecification(); - RAY_LOG(DEBUG) << "Dispatching task " << task_spec.TaskId(); // Pass the contact info of the worker to use. reply->set_worker_pid(worker->GetProcess().GetId()); reply->mutable_worker_address()->set_ip_address(worker->IpAddress()); @@ -683,6 +691,7 @@ void ClusterTaskManager::Dispatch( RAY_CHECK(leased_workers.find(worker->WorkerId()) == leased_workers.end()); leased_workers[worker->WorkerId()] = worker; + RemoveFromBacklogTracker(task); // Update our internal view of the cluster state. std::shared_ptr allocated_resources; @@ -734,7 +743,9 @@ void ClusterTaskManager::Dispatch( } void ClusterTaskManager::Spillback(const NodeID &spillback_to, const Work &work) { - const auto &task_spec = std::get<0>(work).GetTaskSpecification(); + const auto &task = std::get<0>(work); + const auto &task_spec = task.GetTaskSpecification(); + RemoveFromBacklogTracker(task); RAY_LOG(DEBUG) << "Spilling task " << task_spec.TaskId() << " to node " << spillback_to; if (!cluster_resource_scheduler_->AllocateRemoteTaskResources( diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 7c5f00820839..776e7fc53030 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -554,48 +554,69 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { *callback_occurred_ptr = true; }; - std::shared_ptr worker = - std::make_shared(WorkerID::FromRandom(), 1234); - pool_.PushWorker(std::dynamic_pointer_cast(worker)); - std::vector to_cancel; - for (int i = 0; i < 10; i++) { - Task task = 
CreateTask({{ray::kCPU_ResourceLabel, 100}}); - task.SetBacklogSize(i); + // Don't add these fist 2 tasks to `to_cancel`. + for (int i = 0; i < 1; i++) { + Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); + task.SetBacklogSize(10 - i); + task_manager_.QueueAndScheduleTask(task, &reply, callback); + } + + for (int i = 1; i < 10; i++) { + Task task = CreateTask({{ray::kCPU_ResourceLabel, 8}}); + task.SetBacklogSize(10 - i); task_manager_.QueueAndScheduleTask(task, &reply, callback); to_cancel.push_back(task.GetTaskSpecification().TaskId()); } ASSERT_FALSE(callback_occurred); ASSERT_EQ(leased_workers_.size(), 0); - ASSERT_EQ(pool_.workers.size(), 1); + ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); - auto data = std::make_shared(); - task_manager_.FillResourceUsage(data); + { // No tasks can run because the worker pool is empty. + auto data = std::make_shared(); + task_manager_.FillResourceUsage(data); + auto resource_load_by_shape = data->resource_load_by_shape(); + auto shape1 = resource_load_by_shape.resource_demands()[0]; + + ASSERT_EQ(shape1.backlog_size(), 55); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); + ASSERT_EQ(shape1.num_ready_requests_queued(), 10); + } + + // Push a worker so the first task can run. 
+ std::shared_ptr worker = + std::make_shared(WorkerID::FromRandom(), 1234); + pool_.PushWorker(worker); + task_manager_.ScheduleAndDispatchTasks(); - auto resource_load_by_shape = data->resource_load_by_shape(); - auto shape1 = resource_load_by_shape.resource_demands()[0]; + { + auto data = std::make_shared(); + task_manager_.FillResourceUsage(data); + auto resource_load_by_shape = data->resource_load_by_shape(); + auto shape1 = resource_load_by_shape.resource_demands()[0]; - ASSERT_EQ(shape1.backlog_size(), 45); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 10); - ASSERT_EQ(shape1.num_ready_requests_queued(), 0); + ASSERT_TRUE(callback_occurred); + ASSERT_EQ(shape1.backlog_size(), 45); + ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); + ASSERT_EQ(shape1.num_ready_requests_queued(), 9); + } + // Cancel the rest. for (auto &task_id : to_cancel) { ASSERT_TRUE(task_manager_.CancelTask(task_id)); } + RAY_LOG(ERROR) << "Finished cancelling tasks"; - data = std::make_shared(); - task_manager_.FillResourceUsage(data); - - resource_load_by_shape = data->resource_load_by_shape(); - shape1 = resource_load_by_shape.resource_demands()[0]; - - ASSERT_EQ(shape1.backlog_size(), 0); - ASSERT_EQ(shape1.num_infeasible_requests_queued(), 0); - ASSERT_EQ(shape1.num_ready_requests_queued(), 0); - AssertNoLeaks(); + { + auto data = std::make_shared(); + task_manager_.FillResourceUsage(data); + auto resource_load_by_shape = data->resource_load_by_shape(); + ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); + AssertNoLeaks(); + } } TEST_F(ClusterTaskManagerTest, OwnerDeadTest) { From bdf0c009893c1e153543766ce3941ce99084afa2 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Wed, 27 Jan 2021 15:33:33 -0800 Subject: [PATCH 076/245] Revert "Revert "[CLI] Fix Ray Status with ENV Variable set (#13707) (#13726) --- python/ray/_private/services.py | 2 +- python/ray/tests/test_cli.py | 28 +++++++++++++++++++ .../test_cli_patterns/test_ray_status.txt | 14 ++++++++++ 3 
files changed, 43 insertions(+), 1 deletion(-) create mode 100644 python/ray/tests/test_cli_patterns/test_ray_status.txt diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py index d0eafc9693c6..1c4c6497dca6 100644 --- a/python/ray/_private/services.py +++ b/python/ray/_private/services.py @@ -216,7 +216,7 @@ def get_ray_address_to_use_or_die(): A string to pass into `ray.init(address=...)` """ if "RAY_ADDRESS" in os.environ: - return "auto" # Avoid conflict with RAY_ADDRESS env var + return os.environ.get("RAY_ADDRESS") return find_redis_address_or_die() diff --git a/python/ray/tests/test_cli.py b/python/ray/tests/test_cli.py index 57bf61419690..f5628701f91b 100644 --- a/python/ray/tests/test_cli.py +++ b/python/ray/tests/test_cli.py @@ -37,6 +37,7 @@ import ray.autoscaler._private.aws.config as aws_config import ray.scripts.scripts as scripts +from ray.test_utils import wait_for_condition boto3_list = [{ "InstanceType": "t1.micro", @@ -415,5 +416,32 @@ def commands_mock(command, stdin): _check_output_via_pattern("test_ray_submit.txt", result) +def test_ray_status(): + import ray + address = ray.init().get("redis_address") + runner = CliRunner() + + def output_ready(): + result = runner.invoke(scripts.status) + result.stdout + return not result.exception and "memory" in result.output + + wait_for_condition(output_ready) + + result = runner.invoke(scripts.status, []) + _check_output_via_pattern("test_ray_status.txt", result) + + result_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_arg) + + # Try to check status with RAY_ADDRESS set + os.environ["RAY_ADDRESS"] = address + result_env = runner.invoke(scripts.status) + _check_output_via_pattern("test_ray_status.txt", result_env) + + result_env_arg = runner.invoke(scripts.status, ["--address", address]) + _check_output_via_pattern("test_ray_status.txt", result_env_arg) + + if __name__ == "__main__": 
sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/test_cli_patterns/test_ray_status.txt b/python/ray/tests/test_cli_patterns/test_ray_status.txt new file mode 100644 index 000000000000..f903c6d62503 --- /dev/null +++ b/python/ray/tests/test_cli_patterns/test_ray_status.txt @@ -0,0 +1,14 @@ +======== Cluster status: .+ +Node status +------------------------------------------------------------ + 1 node\(s\) with resources: .+ + +Resources +------------------------------------------------------------ +Usage: + 0.+ + 0.+ + 0.+ + +Demands: + \(no resource demands\) From 32ec0d205f596038a07b81a948d6676a8357f3fb Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Wed, 27 Jan 2021 16:26:32 -0800 Subject: [PATCH 077/245] [Object Spilling] Remove job id from the io worker log name. (#13746) --- python/ray/ray_logging.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/ray/ray_logging.py b/python/ray/ray_logging.py index 56df7b5c2092..c9af57536b0c 100644 --- a/python/ray/ray_logging.py +++ b/python/ray/ray_logging.py @@ -165,15 +165,17 @@ def get_worker_log_file_name(worker_type): "please report it to Ray's Github issue.") worker_name = "worker" else: - job_id = ray.JobID.nil() + job_id = "" worker_name = "io_worker" # Make sure these values are set already. 
assert ray.worker._global_node is not None assert ray.worker.global_worker is not None filename = (f"{worker_name}-" - f"{binary_to_hex(ray.worker.global_worker.worker_id)}-" - f"{job_id}-{os.getpid()}") + f"{binary_to_hex(ray.worker.global_worker.worker_id)}-") + if job_id: + filename += f"{job_id}-" + filename += f"{os.getpid()}" return filename From 25fa391193caf86f1f08daedccde5216a986c302 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 27 Jan 2021 16:32:00 -0800 Subject: [PATCH 078/245] [Core] Add private on_completed callback for ObjectRef (#13688) --- python/ray/_raylet.pyx | 42 +++++++++--------------------- python/ray/includes/object_ref.pxi | 42 ++++++++++++++++++++++++++---- python/ray/tests/test_asyncio.py | 22 +++++++++++++++- 3 files changed, 70 insertions(+), 36 deletions(-) diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 0fc3f4bf25da..dc9fceaca7df 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -1569,12 +1569,13 @@ cdef class CoreWorker: return ref_counts - def get_async(self, ObjectRef object_ref, future): - cpython.Py_INCREF(future) + def set_get_async_callback(self, ObjectRef object_ref, callback): + cpython.Py_INCREF(callback) CCoreWorkerProcess.GetCoreWorker().GetAsync( - object_ref.native(), - async_set_result, - future) + object_ref.native(), + async_callback, + callback + ) def push_error(self, JobID job_id, error_type, error_message, double timestamp): @@ -1588,13 +1589,11 @@ cdef class CoreWorker: resource_name.encode("ascii"), capacity, CNodeID.FromBinary(client_id.binary())) -cdef void async_set_result(shared_ptr[CRayObject] obj, - CObjectID object_ref, - void *future) with gil: +cdef void async_callback(shared_ptr[CRayObject] obj, + CObjectID object_ref, + void *user_callback) with gil: cdef: c_vector[shared_ptr[CRayObject]] objects_to_deserialize - py_future = (future) - loop = py_future._loop # Object is retrieved from in memory store. 
# Here we go through the code path used to deserialize objects. @@ -1605,23 +1604,6 @@ cdef void async_set_result(shared_ptr[CRayObject] obj, result = ray.worker.global_worker.deserialize_objects( data_metadata_pairs, ids_to_deserialize)[0] - def set_future(): - # Issue #11030, #8841 - # If this future has result set already, we just need to - # skip the set result/exception procedure. - if py_future.done(): - cpython.Py_DECREF(py_future) - return - - if isinstance(result, RayTaskError): - ray.worker.last_task_error_raise_time = time.time() - py_future.set_exception(result.as_instanceof_cause()) - elif isinstance(result, RayError): - # Directly raise exception for RayActorError - py_future.set_exception(result) - else: - py_future.set_result(result) - - cpython.Py_DECREF(py_future) - - loop.call_soon_threadsafe(set_future) + py_callback = user_callback + py_callback(result) + cpython.Py_DECREF(py_callback) diff --git a/python/ray/includes/object_ref.pxi b/python/ray/includes/object_ref.pxi index 3353e696edbf..31c59d08ba2c 100644 --- a/python/ray/includes/object_ref.pxi +++ b/python/ray/includes/object_ref.pxi @@ -1,6 +1,7 @@ from ray.includes.unique_ids cimport CObjectID import asyncio +from typing import Callable, Any import ray @@ -71,10 +72,41 @@ cdef class ObjectRef(BaseID): def as_future(self): loop = asyncio.get_event_loop() - core_worker = ray.worker.global_worker.core_worker + py_future = loop.create_future() + + def callback(result): + loop = py_future._loop + + def set_future(): + # Issue #11030, #8841 + # If this future has result set already, we just need to + # skip the set result/exception procedure. 
+ if py_future.done(): + return + + if isinstance(result, RayTaskError): + ray.worker.last_task_error_raise_time = time.time() + py_future.set_exception(result.as_instanceof_cause()) + elif isinstance(result, RayError): + # Directly raise exception for RayActorError + py_future.set_exception(result) + else: + py_future.set_result(result) + + loop.call_soon_threadsafe(set_future) + + self._on_completed(callback) - future = loop.create_future() - core_worker.get_async(self, future) # A hack to keep a reference to the object ref for ref counting. - future.object_ref = self - return future + py_future.object_ref = self + return py_future + + def _on_completed(self, py_callback: Callable[[Any], None]): + """Register a callback that will be called after Object is ready. + If the ObjectRef is already ready, the callback will be called soon. + The callback should take the result as the only argument. The result + can be an exception object in case of task error. + """ + core_worker = ray.worker.global_worker.core_worker + core_worker.set_get_async_callback(self, py_callback) + return self diff --git a/python/ray/tests/test_asyncio.py b/python/ray/tests/test_asyncio.py index 18dd63a22d07..31f03aefa546 100644 --- a/python/ray/tests/test_asyncio.py +++ b/python/ray/tests/test_asyncio.py @@ -6,7 +6,7 @@ import pytest import ray -from ray.test_utils import SignalActor +from ray.test_utils import SignalActor, wait_for_condition def test_asyncio_actor(ray_start_regular_shared): @@ -224,6 +224,26 @@ async def loop_forever(self): ray.get(a.ping.remote()) +def test_async_callback(ray_start_regular_shared): + global_set = set() + + ref = ray.put(None) + ref._on_completed(lambda _: global_set.add("completed-1")) + wait_for_condition(lambda: "completed-1" in global_set) + + signal = SignalActor.remote() + + @ray.remote + def wait(): + ray.get(signal.wait.remote()) + + ref = wait.remote() + ref._on_completed(lambda _: global_set.add("completed-2")) + assert "completed-2" not in 
global_set + signal.send.remote() + wait_for_condition(lambda: "completed-2" in global_set) + + if __name__ == "__main__": import pytest sys.exit(pytest.main(["-v", __file__])) From 28cf5f91e31d5c6c0fa5fb11fc9a4cc1682939c2 Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Wed, 27 Jan 2021 16:53:15 -0800 Subject: [PATCH 079/245] [docs] change MLFlow to MLflow in docs (#13739) --- doc/source/tune/_tutorials/overview.rst | 4 ++-- doc/source/tune/api_docs/logging.rst | 2 +- doc/source/tune/examples/index.rst | 6 +++--- doc/source/tune/index.rst | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/tune/_tutorials/overview.rst b/doc/source/tune/_tutorials/overview.rst index 0517c2f0a9e5..8e79b8ca158a 100644 --- a/doc/source/tune/_tutorials/overview.rst +++ b/doc/source/tune/_tutorials/overview.rst @@ -71,9 +71,9 @@ Take a look at any of the below tutorials to get started with Tune. :description: :doc:`Track your experiment process with the Weights & Biases tools ` .. customgalleryitem:: - :tooltip: Use MLFlow with Ray Tune. + :tooltip: Use MLflow with Ray Tune. :figure: /images/mlflow.png - :description: :doc:`Log and track your hyperparameter sweep with MLFlow Tracking & AutoLogging ` + :description: :doc:`Log and track your hyperparameter sweep with MLflow Tracking & AutoLogging ` .. raw:: html diff --git a/doc/source/tune/api_docs/logging.rst b/doc/source/tune/api_docs/logging.rst index b976a898ed08..1bdc400cc802 100644 --- a/doc/source/tune/api_docs/logging.rst +++ b/doc/source/tune/api_docs/logging.rst @@ -162,7 +162,7 @@ CSVLogger MLFlowLogger ------------ -Tune also provides a default logger for `MLFlow `_. You can install MLFlow via ``pip install mlflow``. +Tune also provides a default logger for `MLflow `_. You can install MLflow via ``pip install mlflow``. You can see the :doc:`tutorial here `. 
WandbLogger diff --git a/doc/source/tune/examples/index.rst b/doc/source/tune/examples/index.rst index 27fde3a05711..acdb758929ea 100644 --- a/doc/source/tune/examples/index.rst +++ b/doc/source/tune/examples/index.rst @@ -82,13 +82,13 @@ Pytorch Lightning - :doc:`/tune/examples/mnist_pytorch_lightning`: A comprehensive example using `Pytorch Lightning `_ to train a MNIST model. This example showcases how to use various search optimization techniques. It utilizes the Ray Tune-provided :ref:`PyTorch Lightning callbacks `. - :ref:`A walkthrough tutorial for using Ray Tune with Pytorch-Lightning `. -Wandb, MLFlow +Wandb, MLflow ~~~~~~~~~~~~~ - :ref:`Tutorial ` for using `wandb `__ with Ray Tune - :doc:`/tune/examples/wandb_example`: Example for using `Weights and Biases `__ with Ray Tune. -- :doc:`/tune/examples/mlflow_example`: Example for using `MLFlow `__ with Ray Tune. -- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLFlow `__ and `Pytorch Lightning `_ with Ray Tune. +- :doc:`/tune/examples/mlflow_example`: Example for using `MLflow `__ with Ray Tune. +- :doc:`/tune/examples/mlflow_ptl_example`: Example for using `MLflow `__ and `Pytorch Lightning `_ with Ray Tune. Tensorflow/Keras ~~~~~~~~~~~~~~~~ diff --git a/doc/source/tune/index.rst b/doc/source/tune/index.rst index 86f312cf8ddd..2003b2eacb80 100644 --- a/doc/source/tune/index.rst +++ b/doc/source/tune/index.rst @@ -73,7 +73,7 @@ A key problem with machine learning frameworks is the need to restructure all of With Tune, you can optimize your model just by :ref:`adding a few code snippets `. -Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLFlow and TensorBoard. +Further, Tune actually removes boilerplate from your code training workflow, automatically :ref:`managing checkpoints ` and :ref:`logging results to tools ` such as MLflow and TensorBoard. 
Multi-GPU & distributed training out of the box From 40234ad631598f92ea25381c74840d5fd3ca8a0e Mon Sep 17 00:00:00 2001 From: Dmitri Gekhtman <62982571+DmitriGekhtman@users.noreply.github.com> Date: Wed, 27 Jan 2021 17:00:52 -0800 Subject: [PATCH 080/245] [autoscaler][AWS] Make sure subnets belong to same VPC as user-specified security groups (#13558) * initial commit * Filter subnets by security groups' VPCs * fix stubs * wip * Fix inbound rule logic. Tests WIP. * wip * unit test * example yaml * Unit test tests for bug being fixed * Update python/ray/tests/aws/utils/constants.py Co-authored-by: Thomas Desrosiers <681004+thomasdesr@users.noreply.github.com> Co-authored-by: Thomas Desrosiers <681004+thomasdesr@users.noreply.github.com> --- python/ray/autoscaler/_private/aws/config.py | 56 ++++++++++++++++++- ...xample-head-and-worker-security-group.yaml | 31 ++++++++++ python/ray/tests/aws/test_autoscaler_aws.py | 20 +++++++ python/ray/tests/aws/utils/constants.py | 13 +++++ python/ray/tests/aws/utils/stubs.py | 21 ++++++- 5 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml diff --git a/python/ray/autoscaler/_private/aws/config.py b/python/ray/autoscaler/_private/aws/config.py index 79fc57896dac..4c3a1c448102 100644 --- a/python/ray/autoscaler/_private/aws/config.py +++ b/python/ray/autoscaler/_private/aws/config.py @@ -5,6 +5,7 @@ import json import os import time +from typing import Any, Dict, List import logging import boto3 @@ -357,9 +358,23 @@ def _configure_subnet(config): ec2 = _resource("ec2", config) use_internal_ips = config["provider"].get("use_internal_ips", False) + # If head or worker security group is specified, filter down to subnets + # belonging to the same VPC as the security group. 
+ sg_ids = (config["head_node"].get("SecurityGroupIds", []) + + config["worker_nodes"].get("SecurityGroupIds", [])) + if sg_ids: + vpc_id_of_sg = _get_vpc_id_of_sg(sg_ids, config) + else: + vpc_id_of_sg = None + try: + candidate_subnets = ec2.subnets.all() + if vpc_id_of_sg: + candidate_subnets = [ + s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg + ] subnets = sorted( - (s for s in ec2.subnets.all() if s.state == "available" and ( + (s for s in candidate_subnets if s.state == "available" and ( use_internal_ips or s.map_public_ip_on_launch)), reverse=True, # sort from Z-A key=lambda subnet: subnet.availability_zone) @@ -414,6 +429,34 @@ def _configure_subnet(config): return config +def _get_vpc_id_of_sg(sg_ids: List[str], config: Dict[str, Any]) -> str: + """Returns the VPC id of the security groups with the provided security + group ids. + + Errors if the provided security groups belong to multiple VPCs. + Errors if no security group with any of the provided ids is identified. + """ + sg_ids = list(set(sg_ids)) + + ec2 = _resource("ec2", config) + filters = [{"Name": "group-id", "Values": sg_ids}] + security_groups = ec2.security_groups.filter(Filters=filters) + vpc_ids = [sg.vpc_id for sg in security_groups] + vpc_ids = list(set(vpc_ids)) + + multiple_vpc_msg = "All security groups specified in the cluster config "\ + "should belong to the same VPC." + cli_logger.doassert(len(vpc_ids) <= 1, multiple_vpc_msg) + assert len(vpc_ids) <= 1, multiple_vpc_msg + + no_sg_msg = "Failed to detect a security group with id equal to any of "\ + "the configured SecurityGroupIds." 
+ cli_logger.doassert(len(vpc_ids) > 0, no_sg_msg) + assert len(vpc_ids) > 0, no_sg_msg + + return vpc_ids[0] + + def _configure_security_group(config): _set_config_info( head_security_group_src="config", workers_security_group_src="config") @@ -566,6 +609,13 @@ def _create_security_group(config, vpc_id, group_name): def _upsert_security_group_rules(conf, security_groups): sgids = {sg.id for sg in security_groups.values()} + + # Update sgids to include user-specified security groups. + # This is necessary if the user specifies the head node type's security + # groups but not the worker's, or vice-versa. + for node_type in NODE_KIND_CONFIG_KEYS.values(): + sgids.update(conf[node_type].get("SecurityGroupIds", [])) + # sort security group items for deterministic inbound rule config order # (mainly supports more precise stub-based boto3 unit testing) for node_type, sg in sorted(security_groups.items()): @@ -583,7 +633,7 @@ def _update_inbound_rules(target_security_group, sgids, config): def _create_default_inbound_rules(sgids, extended_rules=[]): - intracluster_rules = _create_default_instracluster_inbound_rules(sgids) + intracluster_rules = _create_default_intracluster_inbound_rules(sgids) ssh_rules = _create_default_ssh_inbound_rules() merged_rules = itertools.chain( intracluster_rules, @@ -593,7 +643,7 @@ def _create_default_inbound_rules(sgids, extended_rules=[]): return list(merged_rules) -def _create_default_instracluster_inbound_rules(intracluster_sgids): +def _create_default_intracluster_inbound_rules(intracluster_sgids): return [{ "FromPort": -1, "ToPort": -1, diff --git a/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml b/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml new file mode 100644 index 000000000000..b940366a0e2f --- /dev/null +++ b/python/ray/autoscaler/aws/example-head-and-worker-security-group.yaml @@ -0,0 +1,31 @@ +cluster_name: sg + +max_workers: 1 + +provider: + type: aws + region: us-west-2 + 
availability_zone: us-west-2a + +auth: + ssh_user: ubuntu + +# If required, head and worker nodes can exist on subnets in different VPCs and +# communicate via VPC peering. + +# VPC peering overview: https://docs.aws.amazon.com/vpc/latest/userguide/vpc-peering.html. +# Setup VPC peering: https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html. +# Configure VPC peering route tables: https://docs.aws.amazon.com/vpc/latest/peering/vpc-peering-routing.html. + +# To enable external SSH connectivity, you should also ensure that your VPC +# is configured to assign public IPv4 addresses to every EC2 instance +# assigned to it. +head_node: + SecurityGroupIds: + - sg-1234abcd # Replace with an actual security group id. + +worker_nodes: + SecurityGroupIds: + - sg-1234abcd # Replace with an actual security group id. + + diff --git a/python/ray/tests/aws/test_autoscaler_aws.py b/python/ray/tests/aws/test_autoscaler_aws.py index 697c9efb163c..52ceb9fb8ecd 100644 --- a/python/ray/tests/aws/test_autoscaler_aws.py +++ b/python/ray/tests/aws/test_autoscaler_aws.py @@ -113,6 +113,26 @@ def test_create_sg_with_custom_inbound_rules_and_name(iam_client_stub, ec2_client_stub.assert_no_pending_responses() +def test_subnet_given_head_and_worker_sg(iam_client_stub, ec2_client_stub): + stubs.configure_iam_role_default(iam_client_stub) + stubs.configure_key_pair_default(ec2_client_stub) + + # list a security group and a thousand subnets in different vpcs + stubs.describe_a_security_group(ec2_client_stub, DEFAULT_SG) + stubs.describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub) + + config = helpers.bootstrap_aws_example_config_file( + "example-head-and-worker-security-group.yaml") + + # check that just the single subnet in the right vpc is filled + assert config["head_node"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] + assert config["worker_nodes"]["SubnetIds"] == [DEFAULT_SUBNET["SubnetId"]] + + # expect no pending responses left in IAM or EC2 client stub 
queues + iam_client_stub.assert_no_pending_responses() + ec2_client_stub.assert_no_pending_responses() + + if __name__ == "__main__": import sys sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tests/aws/utils/constants.py b/python/ray/tests/aws/utils/constants.py index cdcf5a79c68d..adc8a5b2abe4 100644 --- a/python/ray/tests/aws/utils/constants.py +++ b/python/ray/tests/aws/utils/constants.py @@ -50,6 +50,19 @@ "VpcId": "vpc-0000000", } + +def subnet_in_vpc(vpc_num): + """Returns a copy of DEFAULT_SUBNET whose VpcId ends with the digits + of vpc_num.""" + subnet = copy.copy(DEFAULT_SUBNET) + subnet["VpcId"] = f"vpc-{vpc_num:07d}" + return subnet + + +A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS = [ + subnet_in_vpc(vpc_num) for vpc_num in range(1, 1000) +] + [DEFAULT_SUBNET] + # Secondary EC2 subnet to expose to tests as required. AUX_SUBNET = { "AvailabilityZone": "us-west-2a", diff --git a/python/ray/tests/aws/utils/stubs.py b/python/ray/tests/aws/utils/stubs.py index 7840447d80e0..61f1f9ab632b 100644 --- a/python/ray/tests/aws/utils/stubs.py +++ b/python/ray/tests/aws/utils/stubs.py @@ -1,7 +1,7 @@ import ray from ray.tests.aws.utils.mocks import mock_path_exists_key_pair from ray.tests.aws.utils.constants import DEFAULT_INSTANCE_PROFILE, \ - DEFAULT_KEY_PAIR, DEFAULT_SUBNET + DEFAULT_KEY_PAIR, DEFAULT_SUBNET, A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS from unittest import mock @@ -41,6 +41,13 @@ def configure_subnet_default(ec2_client_stub): service_response={"Subnets": [DEFAULT_SUBNET]}) +def describe_a_thousand_subnets_in_different_vpcs(ec2_client_stub): + ec2_client_stub.add_response( + "describe_subnets", + expected_params={}, + service_response={"Subnets": A_THOUSAND_SUBNETS_IN_DIFFERENT_VPCS}) + + def skip_to_configure_sg(ec2_client_stub, iam_client_stub): configure_iam_role_default(iam_client_stub) configure_key_pair_default(ec2_client_stub) @@ -66,6 +73,18 @@ def describe_no_security_groups(ec2_client_stub): service_response={}) +def 
describe_a_security_group(ec2_client_stub, security_group): + ec2_client_stub.add_response( + "describe_security_groups", + expected_params={ + "Filters": [{ + "Name": "group-id", + "Values": [security_group["GroupId"]] + }] + }, + service_response={"SecurityGroups": [security_group]}) + + def create_sg_echo(ec2_client_stub, security_group): ec2_client_stub.add_response( "create_security_group", From 0e7343ec19dec0fae44ce5f3ef612f47cd9e3fed Mon Sep 17 00:00:00 2001 From: Zhe Zhang Date: Wed, 27 Jan 2021 17:16:29 -0800 Subject: [PATCH 081/245] [docs] Fix MLflow / Tune example in documentation (#13740) Minor fixes to make it runnable --- python/ray/tune/integration/mlflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/integration/mlflow.py b/python/ray/tune/integration/mlflow.py index cbd3811d4e30..6e038b810f78 100644 --- a/python/ray/tune/integration/mlflow.py +++ b/python/ray/tune/integration/mlflow.py @@ -274,8 +274,8 @@ def train_fn(config): @mlflow_mixin def train_fn(config): for i in range(10): - loss = self.config["a"] + self.config["b"] - mlflow.log_metric(key="loss", value=loss}) + loss = config["a"] + config["b"] + mlflow.log_metric(key="loss", value=loss) tune.report(loss=loss, done=True) tune.run( From 2e01d5d26edffef15e9ca0b6d3562d4c2105c7a1 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Wed, 27 Jan 2021 17:37:50 -0800 Subject: [PATCH 082/245] Report failed deserialization of errors in Ray client --- python/ray/util/client/worker.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/ray/util/client/worker.py b/python/ray/util/client/worker.py index 9f2f189c6ae2..b0a4b78f52b1 100644 --- a/python/ray/util/client/worker.py +++ b/python/ray/util/client/worker.py @@ -172,7 +172,11 @@ def _get(self, ref: ClientObjectRef, timeout: float): except grpc.RpcError as e: raise e.details() if not data.valid: - err = cloudpickle.loads(data.error) + try: + err = cloudpickle.loads(data.error) 
+ except Exception: + logger.exception("Failed to deserialize {}".format(data.error)) + raise logger.error(err) raise err return loads_from_server(data.data) @@ -256,7 +260,12 @@ def _call_schedule_for_task( except grpc.RpcError as e: raise decode_exception(e.details) if not ticket.valid: - raise cloudpickle.loads(ticket.error) + try: + raise cloudpickle.loads(ticket.error) + except Exception: + logger.exception("Failed to deserialize {}".format( + ticket.error)) + raise return ticket.return_ids def call_release(self, id: bytes) -> None: From c10abbb1bba1a882c76ae199956edeef5a39a6d8 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 27 Jan 2021 17:47:42 -0800 Subject: [PATCH 083/245] Revert "[Serve] Fix ServeHandle serialization (#13695)" (#13753) This reverts commit 202fbdf38c48f7db54994e7143232a75490c9fdb. --- python/ray/serve/api.py | 7 ----- python/ray/serve/handle.py | 25 +++++---------- python/ray/serve/tests/test_handle.py | 44 +-------------------------- 3 files changed, 8 insertions(+), 68 deletions(-) diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 19783dc3700b..b42cd78464a7 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -66,8 +66,6 @@ def check(self, *args, **kwargs): class ThreadProxiedRouter: def __init__(self, controller_handle, sync: bool): - self.controller_handle = controller_handle - self.sync = sync self.router = Router(controller_handle) if sync: @@ -94,11 +92,6 @@ def _remote(self, endpoint_name, handle_options, request_data, **kwargs) return coro - def __reduce__(self): - deserializer = ThreadProxiedRouter - serialized_data = (self.controller_handle, self.sync) - return deserializer, serialized_data - class Client: def __init__(self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 4ee2624a8d31..c6951c6380b9 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,6 +4,8 @@ from typing import Any, Dict, Optional, Union from enum import Enum +from 
ray.serve.router import Router + @dataclass(frozen=True) class HandleOptions: @@ -38,11 +40,10 @@ class RayServeHandle: # raises RayTaskError Exception """ - def __init__( - self, - router, # ThreadProxiedRouter - endpoint_name, - handle_options: Optional[HandleOptions] = None): + def __init__(self, + router: Router, + endpoint_name, + handle_options: Optional[HandleOptions] = None): self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() @@ -77,7 +78,7 @@ def options(self, async def remote(self, request_data: Optional[Union[Dict, Any]] = None, **kwargs): - """Issue an asynchronous request to the endpoint. + """Issue an asynchrounous request to the endpoint. Returns a Ray ObjectRef whose results can be waited for or retrieved using ray.wait or ray.get (or ``await object_ref``), respectively. @@ -97,12 +98,6 @@ async def remote(self, def __repr__(self): return f"{self.__class__.__name__}(endpoint='{self.endpoint_name}')" - def __reduce__(self): - deserializer = RayServeHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data - class RayServeSyncHandle(RayServeHandle): def remote(self, request_data: Optional[Union[Dict, Any]] = None, @@ -128,9 +123,3 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self.router.async_loop) return future.result() - - def __reduce__(self): - deserializer = RayServeSyncHandle - serialized_data = (self.router, self.endpoint_name, - self.handle_options) - return deserializer, serialized_data diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 88ab9d2c2b7a..c17db7686aad 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -1,51 +1,9 @@ import requests -import pytest + import ray from ray import serve -@pytest.mark.asyncio -async def 
test_async_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - class TaskActor: - async def task(self, handle): - ref = await handle.remote() - output = await ref - return output - - handle = client.get_handle("f", sync=False) - - task_actor = TaskActor.remote() - result = await task_actor.task.remote(handle) - assert result == "hello" - - -def test_sync_handle_serializable(serve_instance): - client = serve_instance - - def f(_): - return "hello" - - client.create_backend("f", f) - client.create_endpoint("f", backend="f") - - @ray.remote - def task(handle): - return ray.get(handle.remote()) - - handle = client.get_handle("f", sync=True) - result_ref = task.remote(handle) - assert ray.get(result_ref) == "hello" - - def test_handle_in_endpoint(serve_instance): client = serve_instance From 4f1f5588026e21247d30bb50e2a2374529e10987 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 27 Jan 2021 19:01:56 -0800 Subject: [PATCH 084/245] [Core] Hotfix Windows Compilation Error for ClusterTaskManager (#13754) * [Core] Hotfix Windows Compilation Error for ClusterTaskManager * fix --- src/ray/raylet/scheduling/cluster_task_manager.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index 43c6ce1cc78a..a4dbff1f48dd 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -618,15 +618,15 @@ bool ClusterTaskManager::AnyPendingTasks(Task *exemplar, bool *any_pending, std::string ClusterTaskManager::DebugStr() const { // TODO(Shanly): This method will be replaced with `DebugString` once we remove the // legacy scheduler. 
- auto accumulator = [](int state, const std::pair> &pair) { + auto accumulator = [](size_t state, const std::pair> &pair) { return state + pair.second.size(); }; - int num_infeasible_tasks = - std::accumulate(infeasible_tasks_.begin(), infeasible_tasks_.end(), 0, accumulator); - int num_tasks_to_schedule = std::accumulate(tasks_to_schedule_.begin(), - tasks_to_schedule_.end(), 0, accumulator); - int num_tasks_to_dispatch = std::accumulate(tasks_to_dispatch_.begin(), - tasks_to_dispatch_.end(), 0, accumulator); + size_t num_infeasible_tasks = std::accumulate( + infeasible_tasks_.begin(), infeasible_tasks_.end(), (size_t)0, accumulator); + size_t num_tasks_to_schedule = std::accumulate( + tasks_to_schedule_.begin(), tasks_to_schedule_.end(), (size_t)0, accumulator); + size_t num_tasks_to_dispatch = std::accumulate( + tasks_to_dispatch_.begin(), tasks_to_dispatch_.end(), (size_t)0, accumulator); std::stringstream buffer; buffer << "========== Node: " << self_node_id_ << " =================\n"; buffer << "Infeasible queue length: " << num_infeasible_tasks << "\n"; From cb95ff1e564cdf44e9052a8e5fdd8631b736bb36 Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Wed, 27 Jan 2021 19:03:15 -0800 Subject: [PATCH 085/245] [Serve] Add "endpoint registered" message to router log (#13752) --- python/ray/serve/router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/serve/router.py b/python/ray/serve/router.py index 477f037fd459..c4a87b49bb60 100644 --- a/python/ray/serve/router.py +++ b/python/ray/serve/router.py @@ -256,6 +256,7 @@ async def assign_request( raise RayServeException( f"Endpoint {endpoint} was removed. 
This request " "cannot be completed.") + logger.info(f"Endpoint {endpoint} registered.") endpoint_policy = self.endpoint_policies[endpoint] chosen_backend, *shadow_backends = endpoint_policy.assign(query) From 56ee6ef55f655870f931a5f4b1233fc5a86d5ab9 Mon Sep 17 00:00:00 2001 From: Tao Wang Date: Thu, 28 Jan 2021 11:12:57 +0800 Subject: [PATCH 086/245] [GCS]only update states related fields when publish actor table data (#13448) --- .../stats_collector/stats_collector_head.py | 12 ++- .../tests/test_stats_collector.py | 83 +++++++++++++++++++ .../gcs/gcs_client/service_based_accessor.cc | 2 +- .../gcs_client/service_based_gcs_client.cc | 2 +- src/ray/gcs/gcs_server/gcs_actor_manager.cc | 26 +++--- src/ray/gcs/gcs_server/gcs_actor_manager.h | 12 ++- 6 files changed, 121 insertions(+), 16 deletions(-) diff --git a/dashboard/modules/stats_collector/stats_collector_head.py b/dashboard/modules/stats_collector/stats_collector_head.py index e0b6cffa77b8..d8c085c0ea62 100644 --- a/dashboard/modules/stats_collector/stats_collector_head.py +++ b/dashboard/modules/stats_collector/stats_collector_head.py @@ -221,15 +221,25 @@ def _process_actor_table_data(data): RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS) # Receive actors from channel. + state_keys = ("state", "address", "numRestarts", "timestamp", "pid") async for sender, msg in receiver.iter(): try: - _, actor_table_data = msg + actor_id, actor_table_data = msg pubsub_message = ray.gcs_utils.PubSubMessage.FromString( actor_table_data) message = ray.gcs_utils.ActorTableData.FromString( pubsub_message.data) actor_table_data = actor_table_data_to_dict(message) _process_actor_table_data(actor_table_data) + # If actor is not new registered but updated, we only update + # states related fields. 
+ if actor_table_data["state"] != "DEPENDENCIES_UNREADY": + actor_id = actor_id.decode("UTF-8")[len( + ray.gcs_utils.TablePrefix_ACTOR_string + ":"):] + actor_table_data_copy = dict(DataSource.actors[actor_id]) + for k in state_keys: + actor_table_data_copy[k] = actor_table_data[k] + actor_table_data = actor_table_data_copy actor_id = actor_table_data["actorId"] job_id = actor_table_data["jobId"] node_id = actor_table_data["address"]["rayletId"] diff --git a/dashboard/modules/stats_collector/tests/test_stats_collector.py b/dashboard/modules/stats_collector/tests/test_stats_collector.py index fcd1c42e3456..cb4a1d3c5470 100644 --- a/dashboard/modules/stats_collector/tests/test_stats_collector.py +++ b/dashboard/modules/stats_collector/tests/test_stats_collector.py @@ -7,9 +7,12 @@ import random import pytest import ray +import redis import threading import ray.new_dashboard.modules.stats_collector.stats_collector_consts \ as stats_collector_consts +import ray.new_dashboard.utils as dashboard_utils +import ray.ray_constants as ray_constants from datetime import datetime, timedelta from ray.cluster_utils import Cluster from ray.new_dashboard.tests.conftest import * # noqa @@ -417,5 +420,85 @@ class InfeasibleActor: raise Exception(f"Timed out while testing, {ex_stack}") +def test_actor_pubsub(disable_aiohttp_cache, ray_start_with_dashboard): + timeout = 5 + assert (wait_until_server_available(ray_start_with_dashboard["webui_url"]) + is True) + address_info = ray_start_with_dashboard + address = address_info["redis_address"] + address = address.split(":") + assert len(address) == 2 + + client = redis.StrictRedis( + host=address[0], + port=int(address[1]), + password=ray_constants.REDIS_DEFAULT_PASSWORD) + + p = client.pubsub(ignore_subscribe_messages=True) + p.psubscribe(ray.gcs_utils.RAY_ACTOR_PUBSUB_PATTERN) + + @ray.remote + class DummyActor: + def __init__(self): + pass + + # Create a dummy actor. 
+ a = DummyActor.remote() + + def handle_pub_messages(client, msgs, timeout, expect_num): + start_time = time.time() + while time.time() - start_time < timeout and len(msgs) < expect_num: + msg = client.get_message() + if msg is None: + time.sleep(0.01) + continue + pubsub_msg = ray.gcs_utils.PubSubMessage.FromString(msg["data"]) + actor_data = ray.gcs_utils.ActorTableData.FromString( + pubsub_msg.data) + msgs.append(actor_data) + + msgs = [] + handle_pub_messages(p, msgs, timeout, 2) + + # Assert we received published actor messages with state + # DEPENDENCIES_UNREADY and ALIVE. + assert len(msgs) == 2 + + # Kill actor. + ray.kill(a) + handle_pub_messages(p, msgs, timeout, 3) + + # Assert we received published actor messages with state DEAD. + assert len(msgs) == 3 + + def actor_table_data_to_dict(message): + return dashboard_utils.message_to_dict( + message, { + "actorId", "parentId", "jobId", "workerId", "rayletId", + "actorCreationDummyObjectId", "callerId", "taskId", + "parentTaskId", "sourceActorId", "placementGroupId" + }, + including_default_value_fields=False) + + non_state_keys = ("actorId", "jobId", "taskSpec") + for msg in msgs: + actor_data_dict = actor_table_data_to_dict(msg) + # DEPENDENCIES_UNREADY is 0, which would not be keeped in dict. We + # need check its original value. + if msg.state == 0: + assert len(actor_data_dict) > 5 + for k in non_state_keys: + assert k in actor_data_dict + # For status that is not DEPENDENCIES_UNREADY, only states fields will + # be published. 
+ elif actor_data_dict["state"] in ("ALIVE", "DEAD"): + assert actor_data_dict.keys() == { + "state", "address", "timestamp", "pid" + } + else: + raise Exception("Unknown state: {}".format( + actor_data_dict["state"])) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", __file__])) diff --git a/src/ray/gcs/gcs_client/service_based_accessor.cc b/src/ray/gcs/gcs_client/service_based_accessor.cc index 821e0f7d930a..891bd6ba6a54 100644 --- a/src/ray/gcs/gcs_client/service_based_accessor.cc +++ b/src/ray/gcs/gcs_client/service_based_accessor.cc @@ -275,7 +275,7 @@ Status ServiceBasedActorInfoAccessor::AsyncSubscribe( auto on_subscribe = [subscribe](const std::string &id, const std::string &data) { ActorTableData actor_data; actor_data.ParseFromString(data); - subscribe(ActorID::FromBinary(actor_data.actor_id()), actor_data); + subscribe(ActorID::FromHex(id), actor_data); }; return client_impl_->GetGcsPubSub().Subscribe(ACTOR_CHANNEL, actor_id.Hex(), on_subscribe, subscribe_done); diff --git a/src/ray/gcs/gcs_client/service_based_gcs_client.cc b/src/ray/gcs/gcs_client/service_based_gcs_client.cc index cf9bdd9e4d4e..5fccd645726d 100644 --- a/src/ray/gcs/gcs_client/service_based_gcs_client.cc +++ b/src/ray/gcs/gcs_client/service_based_gcs_client.cc @@ -207,7 +207,7 @@ void ServiceBasedGcsClient::ReconnectGcsServer() { RAY_LOG(INFO) << "Repeated reconnection in " << RayConfig::instance().minimum_gcs_reconnect_interval_milliseconds() - << "milliseconds, return directly."; + << " milliseconds, return directly."; return; } diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.cc b/src/ray/gcs/gcs_server/gcs_actor_manager.cc index 7b30bbc7dde9..2f3740654c8b 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.cc +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.cc @@ -503,9 +503,9 @@ void GcsActorManager::DestroyActor(const ActorID &actor_id) { RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor->GetActorID(), *actor_table_data, [this, actor_id, 
actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), - actor_table_data->SerializeAsString(), - nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish( + ACTOR_CHANNEL, actor_id.Hex(), + GenActorDataOnlyWithStates(*actor_table_data)->SerializeAsString(), nullptr)); // Destroy placement group owned by this actor. destroy_owned_placement_group_if_needed_(actor_id); })); @@ -677,7 +677,6 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // between memory cache and storage. mutable_actor_table_data->set_num_restarts(num_restarts + 1); mutable_actor_table_data->set_state(rpc::ActorTableData::RESTARTING); - const auto actor_table_data = actor->GetActorTableData(); // Make sure to reset the address before flushing to GCS. Otherwise, // GCS will mistakenly consider this lease request succeeds when restarting. actor->UpdateAddress(rpc::Address()); @@ -685,10 +684,11 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche // The backend storage is reliable in the future, so the status must be ok. RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, - [this, actor_id, actor_table_data](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), - actor_table_data.SerializeAsString(), - nullptr)); + [this, actor_id, mutable_actor_table_data](Status status) { + RAY_CHECK_OK(gcs_pub_sub_->Publish( + ACTOR_CHANNEL, actor_id.Hex(), + GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), + nullptr)); })); gcs_actor_scheduler_->Schedule(actor); } else { @@ -701,6 +701,7 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } mutable_actor_table_data->set_state(rpc::ActorTableData::DEAD); + mutable_actor_table_data->set_timestamp(current_sys_time_ms()); // The backend storage is reliable in the future, so the status must be ok. 
RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, *mutable_actor_table_data, @@ -713,7 +714,8 @@ void GcsActorManager::ReconstructActor(const ActorID &actor_id, bool need_resche } RAY_CHECK_OK(gcs_pub_sub_->Publish( ACTOR_CHANNEL, actor_id.Hex(), - mutable_actor_table_data->SerializeAsString(), nullptr)); + GenActorDataOnlyWithStates(*mutable_actor_table_data)->SerializeAsString(), + nullptr)); })); // The actor is dead, but we should not remove the entry from the // registered actors yet. If the actor is owned, we will destroy the actor @@ -754,9 +756,9 @@ void GcsActorManager::OnActorCreationSuccess(const std::shared_ptr &ac RAY_CHECK_OK(gcs_table_storage_->ActorTable().Put( actor_id, actor_table_data, [this, actor_id, actor_table_data, actor](Status status) { - RAY_CHECK_OK(gcs_pub_sub_->Publish(ACTOR_CHANNEL, actor_id.Hex(), - actor_table_data.SerializeAsString(), - nullptr)); + RAY_CHECK_OK(gcs_pub_sub_->Publish( + ACTOR_CHANNEL, actor_id.Hex(), + GenActorDataOnlyWithStates(actor_table_data)->SerializeAsString(), nullptr)); // Invoke all callbacks for all registration requests of this actor (duplicated // requests are included) and remove all of them from // actor_to_create_callbacks_. diff --git a/src/ray/gcs/gcs_server/gcs_actor_manager.h b/src/ray/gcs/gcs_server/gcs_actor_manager.h index 0f47cfb4f672..d3ffc309793e 100644 --- a/src/ray/gcs/gcs_server/gcs_actor_manager.h +++ b/src/ray/gcs/gcs_server/gcs_actor_manager.h @@ -316,7 +316,6 @@ class GcsActorManager : public rpc::ActorInfoHandler { absl::flat_hash_set GetUnresolvedActorsByOwnerWorker( const NodeID &node_id, const WorkerID &worker_id) const; - private: /// Reconstruct the specified actor. /// /// \param actor The target actor to be reconstructed. @@ -346,6 +345,17 @@ class GcsActorManager : public rpc::ActorInfoHandler { /// \param actor The actor to be killed. 
void AddDestroyedActorToCache(const std::shared_ptr &actor); + std::shared_ptr GenActorDataOnlyWithStates( + const rpc::ActorTableData &actor) { + auto actor_delta = std::make_shared(); + actor_delta->set_state(actor.state()); + actor_delta->mutable_address()->CopyFrom(actor.address()); + actor_delta->set_num_restarts(actor.num_restarts()); + actor_delta->set_timestamp(actor.timestamp()); + actor_delta->set_pid(actor.pid()); + return actor_delta; + } + /// Callbacks of pending `RegisterActor` requests. /// Maps actor ID to actor registration callbacks, which is used to filter duplicated /// messages from a driver/worker caused by some network problems. From d4ef5c5993c65257ded867406d552fd9aa1b061c Mon Sep 17 00:00:00 2001 From: cathrinS <38454060+cathrinS@users.noreply.github.com> Date: Thu, 28 Jan 2021 12:07:00 +0100 Subject: [PATCH 087/245] [RLlib] Atari-RAM-Preprocessing, unsigned observation vector results in a false preprocessed observation (#13013) --- rllib/models/preprocessors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/models/preprocessors.py b/rllib/models/preprocessors.py index 44312a807432..0abfb8658080 100644 --- a/rllib/models/preprocessors.py +++ b/rllib/models/preprocessors.py @@ -140,7 +140,7 @@ def _init_shape(self, obs_space: gym.Space, options: dict) -> List[int]: @override(Preprocessor) def transform(self, observation: TensorType) -> np.ndarray: self.check_shape(observation) - return (observation - 128) / 128 + return (observation.astype("float32") - 128) / 128 class OneHotPreprocessor(Preprocessor): From b01b0f80aa33fc10569f3ab36676ef71fc624d08 Mon Sep 17 00:00:00 2001 From: Yuri Rocha Date: Thu, 28 Jan 2021 21:28:08 +0900 Subject: [PATCH 088/245] [RLlib] Fix multiple Unity3DEnvs trying to connect to the same custom port (#13519) --- python/requirements_rllib.txt | 4 ++ rllib/BUILD | 7 +++ rllib/env/wrappers/tests/test_unity3d_env.py | 55 ++++++++++++++++++++ rllib/env/wrappers/unity3d_env.py | 21 ++++++-- 4 
files changed, 82 insertions(+), 5 deletions(-) create mode 100644 rllib/env/wrappers/tests/test_unity3d_env.py diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 0cefb02969b3..5f5a0f99112d 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -16,3 +16,7 @@ kaggle_environments # For MAML on PyTorch. higher + +# Unity3D testing +mlagents +mlagents_envs diff --git a/rllib/BUILD b/rllib/BUILD index f8f1cbd3c6f8..dd1d4c1638a7 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1069,6 +1069,13 @@ sh_test( data = glob(["examples/serving/*.py"]), ) +py_test( + name = "env/wrappers/tests/test_unity3d_env", + tags = ["env"], + size = "small", + srcs = ["env/wrappers/tests/test_unity3d_env.py"] +) + py_test( name = "env/wrappers/tests/test_recsim_wrapper", tags = ["env"], diff --git a/rllib/env/wrappers/tests/test_unity3d_env.py b/rllib/env/wrappers/tests/test_unity3d_env.py new file mode 100644 index 000000000000..5e347ed0ec05 --- /dev/null +++ b/rllib/env/wrappers/tests/test_unity3d_env.py @@ -0,0 +1,55 @@ +import unittest +from unittest.mock import patch + +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv + + +@patch("mlagents_envs.environment.UnityEnvironment") +class TestUnity3DEnv(unittest.TestCase): + def test_port_editor(self, mock_unity3d): + """Test if the environment uses the editor port + when no environment file is provided""" + + _ = Unity3DEnv(port=None) + args, kwargs = mock_unity3d.call_args + mock_unity3d.assert_called_once() + self.assertEqual(5004, kwargs.get("base_port")) + + def test_port_app(self, mock_unity3d): + """Test if the environment uses the correct port + when the environment file is provided""" + + _ = Unity3DEnv(file_name="app", port=None) + args, kwargs = mock_unity3d.call_args + mock_unity3d.assert_called_once() + self.assertEqual(5005, kwargs.get("base_port")) + + def test_ports_multi_app(self, mock_unity3d): + """Test if the base_port + worker_id + is different for each 
environment""" + + _ = Unity3DEnv(file_name="app", port=None) + args, kwargs_first = mock_unity3d.call_args + _ = Unity3DEnv(file_name="app", port=None) + args, kwargs_second = mock_unity3d.call_args + self.assertNotEqual( + kwargs_first.get("base_port") + kwargs_first.get("worker_id"), + kwargs_second.get("base_port") + kwargs_second.get("worker_id")) + + def test_custom_port_app(self, mock_unity3d): + """Test if the base_port + worker_id is different + for each environment when using custom ports""" + + _ = Unity3DEnv(file_name="app", port=5010) + args, kwargs_first = mock_unity3d.call_args + _ = Unity3DEnv(file_name="app", port=5010) + args, kwargs_second = mock_unity3d.call_args + self.assertNotEqual( + kwargs_first.get("base_port") + kwargs_first.get("worker_id"), + kwargs_second.get("base_port") + kwargs_second.get("worker_id")) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py index 753c234439d7..876c06e96508 100644 --- a/rllib/env/wrappers/unity3d_env.py +++ b/rllib/env/wrappers/unity3d_env.py @@ -27,7 +27,12 @@ class Unity3DEnv(MultiAgentEnv): inside an RLlib PolicyClient for cloud/distributed training of Unity games. """ - _BASE_PORT = 5004 + # Default base port when connecting directly to the Editor + _BASE_PORT_EDITOR = 5004 + # Default base port when connecting to a compiled environment + _BASE_PORT_ENVIRONMENT = 5005 + # The worker_id for each environment instance + _WORKER_ID = 0 def __init__(self, file_name: str = None, @@ -73,18 +78,24 @@ def __init__(self, # environments (num_workers >> 1). Otherwise, would lead to port # conflicts sometimes. 
time.sleep(random.randint(1, 10)) - port_ = port or self._BASE_PORT - self._BASE_PORT += 1 + port_ = port or (self._BASE_PORT_ENVIRONMENT + if file_name else self._BASE_PORT_EDITOR) + # cache the worker_id and + # increase it for the next environment + worker_id_ = Unity3DEnv._WORKER_ID if file_name else 0 + Unity3DEnv._WORKER_ID += 1 try: self.unity_env = UnityEnvironment( file_name=file_name, - worker_id=0, + worker_id=worker_id_, base_port=port_, seed=seed, no_graphics=no_graphics, timeout_wait=timeout_wait, ) - print("Created UnityEnvironment for port {}".format(port_)) + print( + "Created UnityEnvironment for port {}".format(port_ + + worker_id_)) except mlagents_envs.exception.UnityWorkerInUseException: pass else: From c583113d66941ba86b62c3627edb31814421c3d8 Mon Sep 17 00:00:00 2001 From: Lena Kashtelyan Date: Thu, 28 Jan 2021 13:01:51 -0500 Subject: [PATCH 089/245] [Ax] Align optimization mode and reported SEM with Ax (#13611) * [Ax] Align optimization mode and reported SEM with Ax Ensure that `mode` aligns with the mode set in Ax + report SEM as None rather than as 0.0 to make use of Ax noise inference * Account for review * Update ax.py * Fix lint * Fix tests, ad additional checks * Fix tests for python 3.6 Co-authored-by: Kai Fricke --- python/ray/tune/suggest/ax.py | 44 ++++++++++++++++++------- python/ray/tune/tests/test_sample.py | 10 +++--- python/ray/tune/tests/test_searchers.py | 6 ++-- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/python/ray/tune/suggest/ax.py b/python/ray/tune/suggest/ax.py index 7cccf74a79d6..85aa79f30284 100644 --- a/python/ray/tune/suggest/ax.py +++ b/python/ray/tune/suggest/ax.py @@ -1,7 +1,6 @@ import copy from typing import Dict, List, Optional, Union -from ax.service.ax_client import AxClient from ray.tune.result import DEFAULT_METRIC from ray.tune.sample import Categorical, Float, Integer, LogUniform, \ Quantized, Uniform @@ -12,8 +11,17 @@ try: import ax + from ax.service.ax_client import AxClient 
except ImportError: - ax = None + ax = AxClient = None + +# This exception only exists in newer Ax releases for python 3.7 +try: + from ax.exceptions.generation_strategy import \ + MaxParallelismReachedException +except ImportError: + MaxParallelismReachedException = Exception + import logging from ray.tune.suggest import Searcher @@ -124,6 +132,7 @@ def __init__(self, assert ax is not None, """Ax must be installed! You can install AxSearch with the command: `pip install ax-platform sqlalchemy`.""" + if mode: assert mode in ["min", "max"], "`mode` must be 'min' or 'max'." @@ -151,7 +160,6 @@ def __init__(self, self.max_concurrent = max_concurrent - self._objective_name = metric self._parameters = [] self._live_trial_mapping = {} @@ -179,6 +187,10 @@ def _setup_experiment(self): "`AxClient.create_experiment()`, or you should pass an " "Ax search space as the `space` parameter to `AxSearch`, " "or pass a `config` dict to `tune.run()`.") + if self._mode not in ["min", "max"]: + raise ValueError( + "Please specify the `mode` argument when initializing " + "the `AxSearch` object or pass it to `tune.run()`.") self._ax.create_experiment( parameters=self._space, objective_name=self._metric, @@ -188,16 +200,25 @@ def _setup_experiment(self): else: if any([ self._space, self._parameter_constraints, - self._outcome_constraints + self._outcome_constraints, self._mode, self._metric ]): raise ValueError( "If you create the Ax experiment yourself, do not pass " "values for these parameters to `AxSearch`: {}.".format([ - "space", "parameter_constraints", "outcome_constraints" + "space", + "parameter_constraints", + "outcome_constraints", + "mode", + "metric", ])) exp = self._ax.experiment - self._objective_name = exp.optimization_config.objective.metric.name + + # Update mode and metric from experiment if it has been passed + self._mode = "min" \ + if exp.optimization_config.objective.minimize else "max" + self._metric = exp.optimization_config.objective.metric.name + 
self._parameters = list(exp.parameters) if self._ax._enforce_sequential_optimization: @@ -239,7 +260,10 @@ def suggest(self, trial_id: str) -> Optional[Dict]: config = self._points_to_evaluate.pop(0) parameters, trial_index = self._ax.attach_trial(config) else: - parameters, trial_index = self._ax.get_next_trial() + try: + parameters, trial_index = self._ax.get_next_trial() + except MaxParallelismReachedException: + return None self._live_trial_mapping[trial_id] = trial_index return unflatten_dict(parameters) @@ -255,14 +279,12 @@ def on_trial_complete(self, trial_id, result=None, error=False): def _process_result(self, trial_id, result): ax_trial_index = self._live_trial_mapping[trial_id] - metric_dict = { - self._objective_name: (result[self._objective_name], 0.0) - } + metric_dict = {self._metric: (result[self._metric], None)} outcome_names = [ oc.metric.name for oc in self._ax.experiment.optimization_config.outcome_constraints ] - metric_dict.update({on: (result[on], 0.0) for on in outcome_names}) + metric_dict.update({on: (result[on], None) for on in outcome_names}) self._ax.complete_trial( trial_index=ax_trial_index, raw_data=metric_dict) diff --git a/python/ray/tune/tests/test_sample.py b/python/ray/tune/tests/test_sample.py index 0b752e1be207..b631dc2b15b5 100644 --- a/python/ray/tune/tests/test_sample.py +++ b/python/ray/tune/tests/test_sample.py @@ -263,12 +263,14 @@ def testConvertAx(self): ] client1 = AxClient(random_seed=1234) - client1.create_experiment(parameters=converted_config) - searcher1 = AxSearch(ax_client=client1, metric="a", mode="max") + client1.create_experiment( + parameters=converted_config, objective_name="a", minimize=False) + searcher1 = AxSearch(ax_client=client1) client2 = AxClient(random_seed=1234) - client2.create_experiment(parameters=ax_config) - searcher2 = AxSearch(ax_client=client2, metric="a", mode="max") + client2.create_experiment( + parameters=ax_config, objective_name="a", minimize=False) + searcher2 = 
AxSearch(ax_client=client2) config1 = searcher1.suggest("0") config2 = searcher2.suggest("0") diff --git a/python/ray/tune/tests/test_searchers.py b/python/ray/tune/tests/test_searchers.py index 0b50be49db90..403b11276dcc 100644 --- a/python/ray/tune/tests/test_searchers.py +++ b/python/ray/tune/tests/test_searchers.py @@ -49,8 +49,10 @@ def testAx(self): # At least one nan, inf, -inf and float client = AxClient(random_seed=4321) client.create_experiment( - parameters=converted_config, objective_name="_metric") - searcher = AxSearch(ax_client=client, metric="_metric", mode="max") + parameters=converted_config, + objective_name="_metric", + minimize=False) + searcher = AxSearch(ax_client=client) out = tune.run( _invalid_objective, From 4bc257f4fb7054073cd15bb25f31f1708d02c64b Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 28 Jan 2021 19:28:48 +0100 Subject: [PATCH 090/245] [RLlib] Fix custom multi action distr (#13681) --- rllib/models/catalog.py | 5 ++-- rllib/tests/test_catalog.py | 52 ++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 6d0bfd111296..66796d71f907 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -199,13 +199,14 @@ def get_action_dist( config = config or MODEL_DEFAULTS # Custom distribution given. if config.get("custom_action_dist"): - action_dist_name = config["custom_action_dist"] + custom_action_config = config.copy() + action_dist_name = custom_action_config.pop("custom_action_dist") logger.debug( "Using custom action distribution {}".format(action_dist_name)) dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) return ModelCatalog._get_multi_action_distribution( - dist_cls, action_space, config, framework) + dist_cls, action_space, custom_action_config, framework) # Dist_type is given directly as a class. 
elif type(dist_type) is type and \ diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py index b98f7143a56d..bbd1ec1bbbaa 100644 --- a/rllib/tests/test_catalog.py +++ b/rllib/tests/test_catalog.py @@ -1,13 +1,15 @@ +from functools import partial import gym -from gym.spaces import Box, Discrete +from gym.spaces import Box, Dict, Discrete import numpy as np import unittest import ray -from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS, ActionDistribution -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.tf.tf_action_dist import TFActionDistribution +from ray.rllib.models import ActionDistribution, ModelCatalog, MODEL_DEFAULTS from ray.rllib.models.preprocessors import NoPreprocessor, Preprocessor +from ray.rllib.models.tf.tf_action_dist import MultiActionDistribution, \ + TFActionDistribution +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf, try_import_torch from ray.rllib.utils.test_utils import framework_iterator @@ -60,6 +62,12 @@ def logp(self, x): return tf.zeros(self.output_shape) +class CustomMultiActionDistribution(MultiActionDistribution): + @override(MultiActionDistribution) + def entropy(self): + raise NotImplementedError + + class TestModelCatalog(unittest.TestCase): def tearDown(self): ray.shutdown() @@ -161,6 +169,42 @@ class Model(): with self.assertRaises(NotImplementedError): dist.entropy() + def test_custom_multi_action_distribution(self): + class Model(): + pass + + ray.init( + object_store_memory=1000 * 1024 * 1024, + ignore_reinit_error=True) # otherwise fails sometimes locally + # registration + ModelCatalog.register_custom_action_dist( + "test", CustomMultiActionDistribution) + s1 = Discrete(5) + s2 = Box(0, 1, shape=(3, ), dtype=np.float32) + spaces = dict(action_1=s1, action_2=s2) + action_space = Dict(spaces) + # test retrieving it + model_config = MODEL_DEFAULTS.copy() + 
model_config["custom_action_dist"] = "test" + dist_cls, param_shape = ModelCatalog.get_action_dist( + action_space, model_config) + self.assertIsInstance(dist_cls, partial) + self.assertEqual(param_shape, s1.n + 2 * s2.shape[0]) + + # test the class works as a distribution + dist_input = tf1.placeholder(tf.float32, (None, param_shape)) + model = Model() + model.model_config = model_config + dist = dist_cls(dist_input, model=model) + self.assertIsInstance(dist.sample(), dict) + self.assertIn("action_1", dist.sample()) + self.assertIn("action_2", dist.sample()) + self.assertEqual(dist.sample()["action_1"].dtype, tf.int64) + self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape) + + with self.assertRaises(NotImplementedError): + dist.entropy() + if __name__ == "__main__": import pytest From cb771f263d358cc7a4ad2447cb5de58ff7a59d5c Mon Sep 17 00:00:00 2001 From: architkulkarni Date: Thu, 28 Jan 2021 12:40:47 -0800 Subject: [PATCH 091/245] [Serve] Add ServeHandle metrics (#13640) --- doc/source/serve/advanced.rst | 4 ++ python/ray/serve/handle.py | 15 ++++++++ python/ray/serve/router.py | 55 +++++++++++++++++++-------- python/ray/serve/tests/test_api.py | 4 ++ python/ray/serve/tests/test_router.py | 2 +- 5 files changed, 64 insertions(+), 16 deletions(-) diff --git a/doc/source/serve/advanced.rst b/doc/source/serve/advanced.rst index 3ac191f1b3a4..542a3ce188ec 100644 --- a/doc/source/serve/advanced.rst +++ b/doc/source/serve/advanced.rst @@ -321,6 +321,10 @@ The following metrics are exposed by Ray Serve: - The number of HTTP requests processed. * - ``serve_num_router_requests`` - The number of requests processed by the router. + * - ``serve_handle_request_counter`` + - The number of requests processed by this ServeHandle. + * - ``backend_queued_queries`` + - The number of queries for this backend waiting to be assigned to a replica. 
To see this in action, run ``ray start --head --metrics-export-port=8080`` in your terminal, and then run the following script: diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index c6951c6380b9..475f64556cb5 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -4,6 +4,8 @@ from typing import Any, Dict, Optional, Union from enum import Enum +from ray.serve.utils import get_random_letters +from ray.util import metrics from ray.serve.router import Router @@ -47,6 +49,17 @@ def __init__(self, self.router = router self.endpoint_name = endpoint_name self.handle_options = handle_options or HandleOptions() + self.handle_tag = f"{self.endpoint_name}#{get_random_letters()}" + + self.request_counter = metrics.Count( + "serve_handle_request_counter", + description=("The number of handle.remote() calls that have been " + "made on this handle."), + tag_keys=("handle", "endpoint")) + self.request_counter.set_default_tags({ + "handle": self.handle_tag, + "endpoint": self.endpoint_name + }) def options(self, *, @@ -92,6 +105,7 @@ async def remote(self, ``**kwargs``: All keyword arguments will be available in ``request.query_params``. """ + self.request_counter.record(1) return await self.router._remote( self.endpoint_name, self.handle_options, request_data, kwargs) @@ -118,6 +132,7 @@ def remote(self, request_data: Optional[Union[Dict, Any]] = None, ``**kwargs``: All keyword arguments will be available in ``request.args``. 
""" + self.request_counter.record(1) coro = self.router._remote(self.endpoint_name, self.handle_options, request_data, kwargs) future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( diff --git a/python/ray/serve/router.py b/python/ray/serve/router.py index c4a87b49bb60..ec887d006c43 100644 --- a/python/ray/serve/router.py +++ b/python/ray/serve/router.py @@ -1,7 +1,6 @@ import asyncio from enum import Enum import itertools -from collections import defaultdict from dataclasses import dataclass, field from typing import Any, ChainMap, Dict, Iterable, List, Optional @@ -49,12 +48,12 @@ class Query: class ReplicaSet: """Data structure representing a set of replica actor handles""" - def __init__(self): + def __init__(self, backend_tag): + self.backend_tag = backend_tag # NOTE(simon): We have to do this because max_concurrent_queries # and the replica handles come from different long poll keys. self.max_concurrent_queries: int = 8 self.in_flight_queries: Dict[ActorHandle, set] = dict() - # The iterator used for load balancing among replicas. Using itertools # cycle, we implements a round-robin policy, skipping overloaded # replicas. @@ -64,15 +63,25 @@ def __init__(self): self.replica_iterator = itertools.cycle(self.in_flight_queries.keys()) # Used to unblock this replica set waiting for free replicas. A newly - # added replica or updated max_concurrenty_queries value means the + # added replica or updated max_concurrent_queries value means the # query that waits on a free replica might be unblocked on. 
self.config_updated_event = asyncio.Event() + self.num_queued_queries = 0 + self.num_queued_queries_gauge = metrics.Gauge( + "serve_backend_queued_queries", + description=( + "The current number of queries to this backend waiting" + " to be assigned to a replica."), + tag_keys=("backend", "endpoint")) + self.num_queued_queries_gauge.set_default_tags({ + "backend": self.backend_tag + }) def set_max_concurrent_queries(self, new_value): if new_value != self.max_concurrent_queries: self.max_concurrent_queries = new_value logger.debug( - f"ReplicaSet: chaging max_concurrent_queries to {new_value}") + f"ReplicaSet: changing max_concurrent_queries to {new_value}") self.config_updated_event.set() def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): @@ -92,7 +101,7 @@ def update_worker_replicas(self, worker_replicas: Iterable[ActorHandle]): self.config_updated_event.set() def _try_assign_replica(self, query: Query) -> Optional[ray.ObjectRef]: - """Try to assign query to a replica, return the object ref is succeeded + """Try to assign query to a replica, return the object ref if succeeded or return None if it can't assign this query to any replicas. """ for _ in range(len(self.in_flight_queries.keys())): @@ -130,6 +139,10 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: and only send a query to available replicas (determined by the backend max_concurrent_quries value.) """ + endpoint = query.metadata.endpoint + self.num_queued_queries += 1 + self.num_queued_queries_gauge.record( + self.num_queued_queries, tags={"endpoint": endpoint}) assigned_ref = self._try_assign_replica(query) while assigned_ref is None: # Can't assign a replica right now. logger.debug("Failed to assign a replica for " @@ -147,8 +160,12 @@ async def assign_replica(self, query: Query) -> ray.ObjectRef: return_when=asyncio.FIRST_COMPLETED) if self.config_updated_event.is_set(): self.config_updated_event.clear() - # We are pretty sure a free replica is ready now. 
+ # We are pretty sure a free replica is ready now, let's recurse and + # assign this query a replica. assigned_ref = self._try_assign_replica(query) + self.num_queued_queries -= 1 + self.num_queued_queries_gauge.record( + self.num_queued_queries, tags={"endpoint": endpoint}) return assigned_ref @@ -168,7 +185,8 @@ def __init__(self, controller_handle: ActorHandle): self.controller = controller_handle self.endpoint_policies: Dict[str, EndpointPolicy] = dict() - self.backend_replicas: Dict[str, ReplicaSet] = defaultdict(ReplicaSet) + + self.backend_replicas: Dict[str, ReplicaSet] = dict() self._pending_endpoints: Dict[str, asyncio.Future] = dict() @@ -212,8 +230,8 @@ async def _update_replica_handles(self, replica_handles): replica_handles) for backend_tag, replica_handles in ChainMap(added, updated).items(): - self.backend_replicas[backend_tag].update_worker_replicas( - replica_handles) + self._get_or_create_replica_set( + backend_tag).update_worker_replicas(replica_handles) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -223,8 +241,9 @@ async def _update_backend_configs(self, backend_configs): added, removed, updated = compute_dict_delta(self.backend_replicas, backend_configs) for backend_tag, config in ChainMap(added, updated).items(): - self.backend_replicas[backend_tag].set_max_concurrent_queries( - config.max_concurrent_queries) + self._get_or_create_replica_set( + backend_tag).set_max_concurrent_queries( + config.max_concurrent_queries) for backend_tag in removed.keys(): if backend_tag in self.backend_replicas: @@ -261,11 +280,17 @@ async def assign_request( endpoint_policy = self.endpoint_policies[endpoint] chosen_backend, *shadow_backends = endpoint_policy.assign(query) - result_ref = await self.backend_replicas[chosen_backend - ].assign_replica(query) + result_ref = await self._get_or_create_replica_set( + chosen_backend).assign_replica(query) for backend in shadow_backends: - await 
self.backend_replicas[backend].assign_replica(query) + (await self._get_or_create_replica_set(backend) + .assign_replica(query)) self.num_router_requests.record(1, tags={"endpoint": endpoint}) return result_ref + + def _get_or_create_replica_set(self, backend_name): + if backend_name not in self.backend_replicas: + self.backend_replicas[backend_name] = ReplicaSet(backend_name) + return self.backend_replicas[backend_name] diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index a35f7e54b361..62f239f78782 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -875,6 +875,10 @@ def verify_metrics(do_assert=False): # gauge "replica_processing_queries", "replica_queued_queries", + # handle + "serve_handle_request_counter", + # ReplicaSet + "backend_queued_queries" ] for metric in expected_metrics: # For the final error round diff --git a/python/ray/serve/tests/test_router.py b/python/ray/serve/tests/test_router.py index 231ac11a5bfd..9b8eb5548b7c 100644 --- a/python/ray/serve/tests/test_router.py +++ b/python/ray/serve/tests/test_router.py @@ -204,7 +204,7 @@ async def num_queries(self): return self._num_queries # We will test a scenario with two replicas in the replica set. 
- rs = ReplicaSet() + rs = ReplicaSet("my_backend") workers = [MockWorker.remote() for _ in range(2)] rs.set_max_concurrent_queries(1) rs.update_worker_replicas(workers) From 0c906a8b93f46bb672622af4666de4033ac570c9 Mon Sep 17 00:00:00 2001 From: Tanja Bayer <30770185+TanjaBayer@users.noreply.github.com> Date: Thu, 28 Jan 2021 23:27:54 +0100 Subject: [PATCH 092/245] [Docker] usage of python-version (#13011) Co-authored-by: Tanja Bayer Co-authored-by: Ian Rodney --- build-docker.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build-docker.sh b/build-docker.sh index b39336186caf..42f9068954f1 100755 --- a/build-docker.sh +++ b/build-docker.sh @@ -8,7 +8,8 @@ set -x GPU="" BASE_IMAGE="ubuntu:focal" WHEEL_URL="https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl" -PYTHON_VERSION="" +PYTHON_VERSION="3.7.7" + while [[ $# -gt 0 ]] do @@ -41,6 +42,7 @@ case $key in --python-version) # Python version to install. e.g. 3.7.7. # Changing python versions may require a different wheel. 
+ # If not provided defaults to 3.7.7 shift PYTHON_VERSION=$1 ;; @@ -59,7 +61,7 @@ for IMAGE in "base-deps" "ray-deps" "ray" do cp "$WHEEL" "docker/$IMAGE/$(basename "$WHEEL")" if [ $OUTPUT_SHA ]; then - IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) + IMAGE_SHA=$(docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -q -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE) echo "rayproject/$IMAGE:nightly$GPU SHA:$IMAGE_SHA" else docker build $NO_CACHE --build-arg GPU="$GPU" --build-arg BASE_IMAGE="$BASE_IMAGE" --build-arg WHEEL_PATH="$(basename "$WHEEL")" --build-arg PYTHON_VERSION="$PYTHON_VERSION" -t rayproject/$IMAGE:nightly$GPU docker/$IMAGE From 813a7ab0e260a4623ecf99deee959dde27aa81cb Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Thu, 28 Jan 2021 15:24:50 -0800 Subject: [PATCH 093/245] [docker] Build Python3.6 & Python3.8 Docker Images (#13548) --- .travis.yml | 35 ++++- ci/travis/build-docker-images.py | 208 +++++++++++++++++----------- ci/travis/determine_tests_to_run.py | 2 + docker/base-deps/Dockerfile | 8 +- docker/ray-ml/Dockerfile | 10 +- python/requirements_ml_docker.txt | 3 +- 6 files changed, 175 insertions(+), 91 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4d8f8ddd1255..8cff56d419d2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -209,10 +209,32 @@ matrix: - . 
./ci/travis/ci.sh test_wheels - export PATH="$HOME/miniconda3/bin:$PATH" - python -m pip install docker - - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py; fi + - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37; fi - bash ./java/build-jar-multiplatform.sh linux cache: false + + # Build Py36 & Py38 Docker Images + - os: linux + env: + - LINUX_WHEELS=1 + - DOCKER_BUILD_PY36_38=1 + - PYTHONWARNINGS=ignore + language: java + jdk: openjdk8 + install: + - . ./ci/travis/ci.sh init RAY_CI_LINUX_WHEELS_AFFECTED + before_script: + - . ./ci/travis/ci.sh build + script: + - wget --quiet "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda3.sh + - bash miniconda3.sh -b -p "$HOME/miniconda3" + - export PATH="$HOME/miniconda3/bin:$PATH" + - conda install -y python=3.7.6 + - python -m pip install docker + - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38; fi + cache: false + # Build and deploy multi-platform jars. - os: linux env: @@ -491,7 +513,7 @@ deploy: - provider: script edge: true # This supposedly opts in to deploy v2. - script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py + script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY37 skip_cleanup: true on: repo: ray-project/ray @@ -530,3 +552,12 @@ deploy: repo: ray-project/ray branch: master condition: $MULTIPLATFORM_JARS = 1 || $MAC_JARS = 1 || $LINUX_JARS = 1 + + - provider: script + edge: true # This supposedly opts in to deploy v2. 
+ script: export PATH="$HOME/miniconda3/bin:$PATH"; ./ci/keep_alive python $TRAVIS_BUILD_DIR/ci/travis/build-docker-images.py PY36_PY38 + skip_cleanup: true + on: + repo: ray-project/ray + all_branches: true + condition: $LINUX_WHEELS = 1 \ No newline at end of file diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index c549bc95e60a..ad69a15dbcaa 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -15,7 +15,7 @@ print = functools.partial(print, file=sys.stderr, flush=True) DOCKER_USERNAME = "raytravisbot" DOCKER_CLIENT = None -PYTHON_WHL_VERSION = "cp37m" +PYTHON_WHL_VERSION = "cp3" DOCKER_HUB_DESCRIPTION = { "base-deps": ("Internal Image, refer to " @@ -29,6 +29,8 @@ "https://hub.docker.com/repository/docker/rayproject/ray-ml") } +PY_MATRIX = {"-py36": "3.6.12", "-py37": "3.7.7", "-py38": "3.8.5"} + def _merge_build(): return os.environ.get("TRAVIS_PULL_REQUEST").lower() == "false" @@ -52,13 +54,18 @@ def _get_root_dir(): return os.path.join(_get_curr_dir(), "../../") -def _get_wheel_name(): - matches = glob.glob( - f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}-manylinux*") - assert len(matches) == 1, ( - f"Found ({len(matches)}) matches " - f"'*{PYTHON_WHL_VERSION}-manylinux*' instead of 1") - return os.path.basename(matches[0]) +def _get_wheel_name(minor_version_number): + if minor_version_number: + matches = glob.glob(f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}" + f"{minor_version_number}*-manylinux*") + assert len(matches) == 1, ( + f"Found ({len(matches)}) matches for '*{PYTHON_WHL_VERSION}" + f"{minor_version_number}*-manylinux*' instead of 1") + return os.path.basename(matches[0]) + else: + matches = glob.glob( + f"{_get_root_dir()}/.whl/*{PYTHON_WHL_VERSION}*-manylinux*") + return [os.path.basename(i) for i in matches] def _docker_affected(): @@ -81,64 +88,76 @@ def _docker_affected(): def _build_cpu_gpu_images(image_name, no_cache=True) -> List[str]: built_images = [] for gpu in 
["-cpu", "-gpu"]: - build_args = {} - if image_name == "base-deps": - build_args["BASE_IMAGE"] = ( - "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" - if gpu == "-gpu" else "ubuntu:focal") - else: - build_args["GPU"] = gpu - - if "ray" in image_name: - build_args["WHEEL_PATH"] = f".whl/{_get_wheel_name()}" - - tagged_name = f"rayproject/{image_name}:nightly{gpu}" - for i in range(2): - output = DOCKER_CLIENT.api.build( - path=os.path.join(_get_root_dir(), "docker", image_name), - tag=tagged_name, - nocache=no_cache, - buildargs=build_args) - - full_output = "" - try: - start = datetime.datetime.now() - current_iter = start - for line in output: - if datetime.datetime.now( - ) - current_iter >= datetime.timedelta(minutes=5): - current_iter = datetime.datetime.now() - elapsed = datetime.datetime.now() - start - print(f"Still building {tagged_name} after " - f"{elapsed.seconds} seconds") - full_output += line.decode("utf-8") - except Exception as e: - print(f"FAILURE with error {e}") - - if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: - print(f"ERROR building: {tagged_name} & error below:") - print(full_output) - if (i == 1): - raise Exception("FAILED TO BUILD IMAGE") - print("TRYING AGAIN") + for py_name, py_version in PY_MATRIX.items(): + build_args = {} + build_args["PYTHON_VERSION"] = py_version + # I.e. 
"-py36"[-1] == 6 + build_args["PYTHON_MINOR_VERSION"] = py_name[-1] + + if image_name == "base-deps": + build_args["BASE_IMAGE"] = ( + "nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04" + if gpu == "-gpu" else "ubuntu:focal") else: - break - - print("BUILT: ", tagged_name) - built_images.append(tagged_name) + # NOTE(ilr) This is a bit of an abuse of the name "GPU" + build_args["GPU"] = f"{py_name}{gpu}" + + if image_name in ["ray", "ray-deps"]: + wheel = _get_wheel_name(build_args["PYTHON_MINOR_VERSION"]) + build_args["WHEEL_PATH"] = f".whl/{wheel}" + + tagged_name = f"rayproject/{image_name}:nightly{py_name}{gpu}" + for i in range(2): + cleanup = DOCKER_CLIENT.containers.prune().get( + "SpaceReclaimed") + if cleanup is not None: + print(f"Cleaned up {cleanup / (2**20)}MB") + output = DOCKER_CLIENT.api.build( + path=os.path.join(_get_root_dir(), "docker", image_name), + tag=tagged_name, + nocache=no_cache, + buildargs=build_args) + + full_output = "" + try: + start = datetime.datetime.now() + current_iter = start + for line in output: + if datetime.datetime.now( + ) - current_iter >= datetime.timedelta(minutes=5): + current_iter = datetime.datetime.now() + elapsed = datetime.datetime.now() - start + print(f"Still building {tagged_name} after " + f"{elapsed.seconds} seconds") + full_output += line.decode("utf-8") + except Exception as e: + print(f"FAILURE with error {e}") + + if len(DOCKER_CLIENT.api.images(tagged_name)) == 0: + print(f"ERROR building: {tagged_name} & error below:") + print(full_output) + if (i == 1): + raise Exception("FAILED TO BUILD IMAGE") + print("TRYING AGAIN") + else: + break + + print("BUILT: ", tagged_name) + built_images.append(tagged_name) return built_images def copy_wheels(): root_dir = _get_root_dir() - wheel = _get_wheel_name() - source = os.path.join(root_dir, ".whl", wheel) - ray_dst = os.path.join(root_dir, "docker/ray/.whl/") - ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") - os.makedirs(ray_dst, exist_ok=True) - 
shutil.copy(source, ray_dst) - os.makedirs(ray_dep_dst, exist_ok=True) - shutil.copy(source, ray_dep_dst) + wheels = _get_wheel_name(None) + for wheel in wheels: + source = os.path.join(root_dir, ".whl", wheel) + ray_dst = os.path.join(root_dir, "docker/ray/.whl/") + ray_dep_dst = os.path.join(root_dir, "docker/ray-deps/.whl/") + os.makedirs(ray_dst, exist_ok=True) + shutil.copy(source, ray_dst) + os.makedirs(ray_dep_dst, exist_ok=True) + shutil.copy(source, ray_dep_dst) def build_or_pull_base_images(is_docker_affected: bool) -> List[str]: @@ -239,31 +258,48 @@ def get_new_tag(old_tag, new_tag): image_list.extend(["base-deps", "ray-deps"]) for image in image_list: - full_image = f"rayproject/{image}" + for py_version in PY_MATRIX.keys(): + full_image = f"rayproject/{image}" - # Generate :nightly from nightly-cpu - DOCKER_CLIENT.api.tag( - image=f"{full_image}:nightly-cpu", - repository=full_image, - tag="nightly") - - for arch_tag in ["-cpu", "-gpu", ""]: - full_arch_tag = f"nightly{arch_tag}" - # Do not tag release builds because they are no longer up to date - # after the branch cut. - if not _release_build(): - # Tag and push rayproject/:nightly - docker_push(full_image, full_arch_tag) - - # Ex: specific_tag == "1.0.1" or "" or "" - specific_tag = get_new_tag( - full_arch_tag, date_tag if "-deps" in image else sha_tag) - # Tag and push rayproject/: + # Tag "nightly-py3x" from "nightly-py3x-cpu" DOCKER_CLIENT.api.tag( - image=f"{full_image}:{full_arch_tag}", + image=f"{full_image}:nightly{py_version}-cpu", repository=full_image, - tag=specific_tag) - docker_push(full_image, specific_tag) + tag=f"nightly{py_version}") + + for arch_tag in ["-cpu", "-gpu", ""]: + full_arch_tag = f"nightly{py_version}{arch_tag}" + # Do not tag release builds because they are no longer up to + # date after the branch cut. 
+ if not _release_build(): + # Tag and push rayproject/:nightly + docker_push(full_image, full_arch_tag) + + # Ex: specific_tag == "1.0.1" or "" or "" + specific_tag = get_new_tag( + full_arch_tag, date_tag if "-deps" in image else sha_tag) + + # Tag and push rayproject/: + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=specific_tag) + docker_push(full_image, specific_tag) + + if "-py37" in py_version: + non_python_specific_tag = specific_tag.replace("-py37", "") + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=non_python_specific_tag) + docker_push(full_image, non_python_specific_tag) + + non_python_nightly_tag = full_arch_tag.replace("-py37", "") + DOCKER_CLIENT.api.tag( + image=f"{full_image}:{full_arch_tag}", + repository=full_image, + tag=non_python_nightly_tag) + docker_push(full_image, non_python_nightly_tag) # Push infra here: @@ -306,6 +342,14 @@ def push_readmes(): if __name__ == "__main__": print("RUNNING WITH: ", sys.version) + if len(sys.argv) == 2: + version_to_drop = sys.argv[1] + if version_to_drop == "PY37": + PY_MATRIX.pop("-py36") + PY_MATRIX.pop("-py38") + else: + PY_MATRIX.pop("-py37") + print("Building the following python versions: ", PY_MATRIX) if os.environ.get("TRAVIS") == "true": is_docker_affected = _docker_affected() if _merge_build() or is_docker_affected: diff --git a/ci/travis/determine_tests_to_run.py b/ci/travis/determine_tests_to_run.py index 70eefc16a566..cba016fcf610 100644 --- a/ci/travis/determine_tests_to_run.py +++ b/ci/travis/determine_tests_to_run.py @@ -124,6 +124,8 @@ def list_changed_files(commit_range): for prefix in skip_prefix_list): # nothing is run but linting in these cases pass + elif changed_file.endswith("build-docker-images.py"): + RAY_CI_DOCKER_AFFECTED = 1 elif changed_file.startswith("src/"): RAY_CI_TUNE_AFFECTED = 1 RAY_CI_SGD_AFFECTED = 1 diff --git a/docker/base-deps/Dockerfile b/docker/base-deps/Dockerfile 
index 278fad1ec73d..e00ca141c9d5 100644 --- a/docker/base-deps/Dockerfile +++ b/docker/base-deps/Dockerfile @@ -30,6 +30,8 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ git \ wget \ cmake \ + g++ \ + zlib1g-dev \ $(if [ "$AUTOSCALER" = "autoscaler" ]; then echo \ tmux \ screen \ @@ -52,12 +54,14 @@ RUN sudo apt-get update -y && sudo apt-get upgrade -y \ numpy==1.15.4 \ psutil \ blist \ + atari-py \ # blist is needed for numpy (which is re-installed when ray is installed) + # atari-py is built from source for Python 3.8 (requires g++ & zlib1g-dev) # To avoid the following error on Jenkins: # AttributeError: 'numpy.ufunc' object has no attribute '__module__' && $HOME/anaconda3/bin/pip uninstall -y dask \ - # We install cmake temporarily to get psutil - && sudo apt-get autoremove -y cmake \ + # We install cmake temporarily to get psutil, blist & atari-py + && sudo apt-get autoremove -y cmake g++ zlib1g-dev \ # Either install kubectl or remove wget && (if [ "$AUTOSCALER" = "autoscaler" ]; \ then wget -O - -q https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add - \ diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 25211085edc7..908351df19d9 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -1,12 +1,13 @@ ARG GPU FROM rayproject/ray:nightly"$GPU" +ARG PYTHON_MINOR_VERSION=7 # We have to uninstall wrapt this way for Tensorflow compatibility COPY requirements.txt ./ COPY requirements_ml_docker.txt ./ COPY requirements_rllib.txt ./ # Docker image uses Python 3.7 -COPY linux-py3.7-requirements_tune.txt ./requirements_tune.txt +COPY linux-py3."$PYTHON_MINOR_VERSION"-requirements_tune.txt ./requirements_tune.txt RUN sudo apt-get update \ && sudo apt-get install -y gcc \ @@ -14,12 +15,13 @@ RUN sudo apt-get update \ libgtk2.0-dev \ zlib1g-dev \ libgl1-mesa-dev \ + && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ && $HOME/anaconda3/bin/pip 
--use-deprecated=legacy-resolver --no-cache-dir install -r requirements.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_rllib.txt \ && $HOME/anaconda3/bin/pip --no-cache-dir install -r requirements_tune.txt \ - && $HOME/anaconda3/bin/pip --no-cache-dir install -U -r requirements_ml_docker.txt \ - # Remove dataclasses & typing because they are included in Py3.7 - && $HOME/anaconda3/bin/pip uninstall dataclasses typing -y \ + # Remove dataclasses & typing because they are included in Python > 3.6 + && if [ $(python -c 'import sys; print(sys.version_info.minor)') != "6" ]; then \ + $HOME/anaconda3/bin/pip uninstall dataclasses typing -y; fi \ && sudo rm requirements.txt && sudo rm requirements_ml_docker.txt \ && sudo rm requirements_tune.txt && sudo rm requirements_rllib.txt \ && sudo apt-get clean diff --git a/python/requirements_ml_docker.txt b/python/requirements_ml_docker.txt index c61ba0c055f6..bbecb5bd873e 100644 --- a/python/requirements_ml_docker.txt +++ b/python/requirements_ml_docker.txt @@ -3,4 +3,5 @@ tensorflow-gpu>=2.4.0 -f https://download.pytorch.org/whl/torch_stable.html torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html -torchvision==0.8.2+cu110 \ No newline at end of file +torchvision==0.8.2+cu110 +pip; python_version > "3.7" From 42d501d747950e3c539d9fcc11ac318780b180ea Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Thu, 28 Jan 2021 19:07:10 -0800 Subject: [PATCH 094/245] [core] Pin arguments during task execution (#13737) * tmp * Pin task args * unit tests * update * test * Fix --- python/ray/tests/test_object_manager.py | 16 +-- python/ray/tests/test_object_spilling.py | 3 - src/ray/raylet/dependency_manager.cc | 6 - src/ray/raylet/dependency_manager.h | 9 -- src/ray/raylet/dependency_manager_test.cc | 10 -- src/ray/raylet/node_manager.cc | 65 ++++++---- src/ray/raylet/node_manager.h | 10 ++ .../raylet/scheduling/cluster_task_manager.cc | 49 ++++++- .../raylet/scheduling/cluster_task_manager.h | 
22 +++- .../scheduling/cluster_task_manager_test.cc | 122 ++++++++++++++---- src/ray/raylet/test/util.h | 9 +- 11 files changed, 222 insertions(+), 99 deletions(-) diff --git a/python/ray/tests/test_object_manager.py b/python/ray/tests/test_object_manager.py index e38733f62d7e..004b1c2f6a5d 100644 --- a/python/ray/tests/test_object_manager.py +++ b/python/ray/tests/test_object_manager.py @@ -296,9 +296,6 @@ def driver(): ray.get(driver.remote()) -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") @pytest.mark.timeout(30) def test_pull_bundles_admission_control(shutdown_only): cluster = Cluster() @@ -333,9 +330,6 @@ def foo(*args): ray.get(tasks) -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") @pytest.mark.timeout(30) def test_pull_bundles_admission_control_dynamic(shutdown_only): # This test is the same as test_pull_bundles_admission_control, except that @@ -358,11 +352,13 @@ def test_pull_bundles_admission_control_dynamic(shutdown_only): cluster.wait_for_nodes() @ray.remote - def foo(*args): + def foo(i, *args): + print("foo", i) return @ray.remote - def allocate(*args): + def allocate(i): + print("allocate", i) return np.zeros(object_size, dtype=np.uint8) args = [] @@ -373,8 +369,8 @@ def allocate(*args): ] args.append(task_args) - tasks = [foo.remote(*task_args) for task_args in args] - allocated = [allocate.remote() for _ in range(num_objects)] + tasks = [foo.remote(i, *task_args) for i, task_args in enumerate(args)] + allocated = [allocate.remote(i) for i in range(num_objects)] ray.get(tasks) del allocated diff --git a/python/ray/tests/test_object_spilling.py b/python/ray/tests/test_object_spilling.py index 3f5b5f7ae885..242799dc9281 100644 --- a/python/ray/tests/test_object_spilling.py +++ b/python/ray/tests/test_object_spilling.py @@ 
-618,9 +618,6 @@ def test_release_during_plasma_fetch(object_spilling_config, shutdown_only): do_test_release_resource(object_spilling_config, expect_released=True) -@pytest.mark.skip( - reason="This hangs due to a deadlock between a worker getting its " - "arguments and the node pulling arguments for the next task queued.") @pytest.mark.skipif( platform.system() == "Windows", reason="Failing on Windows.") @pytest.mark.timeout(30) diff --git a/src/ray/raylet/dependency_manager.cc b/src/ray/raylet/dependency_manager.cc index 988893beaa47..7c9faf642d3c 100644 --- a/src/ray/raylet/dependency_manager.cc +++ b/src/ray/raylet/dependency_manager.cc @@ -185,12 +185,6 @@ bool DependencyManager::RequestTaskDependencies( return task_entry.num_missing_dependencies == 0; } -bool DependencyManager::IsTaskReady(const TaskID &task_id) const { - auto task_entry = queued_task_requests_.find(task_id); - RAY_CHECK(task_entry != queued_task_requests_.end()); - return task_entry->second.num_missing_dependencies == 0; -} - void DependencyManager::RemoveTaskDependencies(const TaskID &task_id) { RAY_LOG(DEBUG) << "Removing dependencies for task " << task_id; auto task_entry = queued_task_requests_.find(task_id); diff --git a/src/ray/raylet/dependency_manager.h b/src/ray/raylet/dependency_manager.h index 1e7ddfcb17c1..903a9893a579 100644 --- a/src/ray/raylet/dependency_manager.h +++ b/src/ray/raylet/dependency_manager.h @@ -37,7 +37,6 @@ class TaskDependencyManagerInterface { virtual bool RequestTaskDependencies( const TaskID &task_id, const std::vector &required_objects) = 0; - virtual bool IsTaskReady(const TaskID &task_id) const = 0; virtual void RemoveTaskDependencies(const TaskID &task_id) = 0; virtual ~TaskDependencyManagerInterface(){}; }; @@ -131,14 +130,6 @@ class DependencyManager : public TaskDependencyManagerInterface { bool RequestTaskDependencies(const TaskID &task_id, const std::vector &required_objects); - /// Check whether a task is ready to run. 
The task ID must have been - /// previously added by the caller. - /// - /// \param task_id The ID of the task to check. - /// \return Whether all of the dependencies for the task are - /// local. - bool IsTaskReady(const TaskID &task_id) const; - /// Cancel a task's dependencies. We will no longer attempt to fetch any /// remote dependencies, if no other task or worker requires them. /// diff --git a/src/ray/raylet/dependency_manager_test.cc b/src/ray/raylet/dependency_manager_test.cc index c6d0ab2ee8c5..6ea260bc3d97 100644 --- a/src/ray/raylet/dependency_manager_test.cc +++ b/src/ray/raylet/dependency_manager_test.cc @@ -89,7 +89,6 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); ASSERT_EQ(object_manager_mock_.active_requests.size(), 1); - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // For each argument, tell the task dependency manager that the argument is // local. All arguments should be canceled as they become available locally. @@ -98,15 +97,12 @@ TEST_F(DependencyManagerTest, TestSimpleTask) { } auto ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[0]); ASSERT_TRUE(ready_task_ids.empty()); - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[1]); ASSERT_TRUE(ready_task_ids.empty()); - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The task is ready to run. ready_task_ids = dependency_manager_.HandleObjectLocal(arguments[2]); ASSERT_EQ(ready_task_ids.size(), 1); ASSERT_EQ(ready_task_ids.front(), task_id); - ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Remove the task. 
dependency_manager_.RemoveTaskDependencies(task_id); @@ -127,7 +123,6 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { bool ready = dependency_manager_.RequestTaskDependencies( task_id, ObjectIdsToRefs({argument_id})); ASSERT_FALSE(ready); - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // The object should be requested from the object manager once for each task. ASSERT_EQ(object_manager_mock_.active_requests.size(), i + 1); } @@ -139,7 +134,6 @@ TEST_F(DependencyManagerTest, TestMultipleTasks) { std::unordered_set added_tasks(dependent_tasks.begin(), dependent_tasks.end()); for (auto &id : ready_task_ids) { ASSERT_TRUE(added_tasks.erase(id)); - ASSERT_TRUE(dependency_manager_.IsTaskReady(id)); } ASSERT_TRUE(added_tasks.empty()); @@ -166,7 +160,6 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { bool ready = dependency_manager_.RequestTaskDependencies(task_id, ObjectIdsToRefs(arguments)); ASSERT_FALSE(ready); - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); // Tell the task dependency manager that each of the arguments is now // available. @@ -183,7 +176,6 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } - ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); // Simulate each of the arguments getting evicted. Each object should now be // considered remote. @@ -203,7 +195,6 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { // the waiting state. 
ASSERT_TRUE(waiting_tasks.empty()); } - ASSERT_FALSE(dependency_manager_.IsTaskReady(task_id)); } // Tell the task dependency manager that each of the arguments is available @@ -221,7 +212,6 @@ TEST_F(DependencyManagerTest, TestTaskArgEviction) { ASSERT_TRUE(ready_tasks.empty()); } } - ASSERT_TRUE(dependency_manager_.IsTaskReady(task_id)); dependency_manager_.RemoveTaskDependencies(task_id); AssertNoLeaks(); diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index e1ac5eb670bb..251e28e26aed 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -222,7 +222,11 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, const NodeID &self self_node_id_, std::dynamic_pointer_cast(cluster_resource_scheduler_), dependency_manager_, is_owner_alive, get_node_info_func, announce_infeasible_task, - worker_pool_, leased_workers_)); + worker_pool_, leased_workers_, + [this](const std::vector &object_ids, + std::vector> *results) { + return GetObjectsFromPlasma(object_ids, results); + })); placement_group_resource_manager_ = std::make_shared( std::dynamic_pointer_cast( @@ -1242,8 +1246,9 @@ void NodeManager::DisconnectClient(const std::shared_ptr &clie if ((!task_id.IsNil() || !actor_id.IsNil()) && !worker->IsDead()) { // If the worker was an actor, it'll be cleaned by GCS. if (actor_id.IsNil()) { + // Return the resources that were being used by this worker. Task task; - static_cast(local_queues_.RemoveTask(task_id, &task)); + cluster_task_manager_->TaskFinished(worker, &task); } if (disconnect_type == rpc::WorkerExitType::SYSTEM_ERROR_EXIT) { @@ -2365,6 +2370,33 @@ std::string compact_tag_string(const opencensus::stats::ViewDescriptor &view, return result.str(); } +bool NodeManager::GetObjectsFromPlasma(const std::vector &object_ids, + std::vector> *results) { + // Pin the objects in plasma by getting them and holding a reference to + // the returned buffer. 
+ // NOTE: the caller must ensure that the objects already exist in plasma before + // sending a PinObjectIDs request. + std::vector plasma_results; + // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not + // block when serving the request. However, if the plasma store is under + // heavy load, then this request can still block the NodeManager event loop + // since we must wait for the plasma store's reply. We should consider using + // an `AsyncGet` instead. + if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { + return false; + } + + for (const auto &plasma_result : plasma_results) { + if (plasma_result.data == nullptr) { + results->push_back(nullptr); + } else { + results->emplace_back(std::unique_ptr( + new RayObject(plasma_result.data, plasma_result.metadata, {}))); + } + } + return true; +} + void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, rpc::PinObjectIDsReply *reply, rpc::SendReplyCallback send_reply_callback) { @@ -2374,33 +2406,16 @@ void NodeManager::HandlePinObjectIDs(const rpc::PinObjectIDsRequest &request, object_ids.push_back(ObjectID::FromBinary(object_id_binary)); } if (object_pinning_enabled_) { - // Pin the objects in plasma by getting them and holding a reference to - // the returned buffer. - // NOTE: the caller must ensure that the objects already exist in plasma before - // sending a PinObjectIDs request. - std::vector plasma_results; - // TODO(swang): This `Get` has a timeout of 0, so the plasma store will not - // block when serving the request. However, if the plasma store is under - // heavy load, then this request can still block the NodeManager event loop - // since we must wait for the plasma store's reply. We should consider using - // an `AsyncGet` instead. 
- if (!store_client_.Get(object_ids, /*timeout_ms=*/0, &plasma_results).ok()) { - RAY_LOG(WARNING) << "Failed to get objects to be pinned from object store."; + std::vector> results; + if (!GetObjectsFromPlasma(object_ids, &results)) { + RAY_LOG(WARNING) + << "Failed to get objects that should have been in the object store. These " + "objects may have been evicted while there are still references in scope."; // TODO(suquark): Maybe "Status::ObjectNotFound" is more accurate here. send_reply_callback(Status::Invalid("Failed to get objects."), nullptr, nullptr); return; } - - std::vector> objects; - for (int64_t i = 0; i < request.object_ids().size(); i++) { - if (plasma_results[i].data == nullptr) { - objects.push_back(nullptr); - } else { - objects.emplace_back(std::unique_ptr( - new RayObject(plasma_results[i].data, plasma_results[i].metadata, {}))); - } - } - local_object_manager_.PinObjects(object_ids, std::move(objects)); + local_object_manager_.PinObjects(object_ids, std::move(results)); } // Wait for the object to be freed by the owner, which keeps the ref count. local_object_manager_.WaitForObjectFree(request.owner_address(), object_ids); diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 3a68fcbae992..606dc3ac6fa7 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -647,6 +647,16 @@ class NodeManager : public rpc::NodeManagerServiceHandler, std::unordered_map> MakeTasksByClass( const std::vector &tasks) const; + /// Get pointers to objects stored in plasma. They will be + /// released once the returned references go out of scope. + /// + /// \param[in] object_ids The objects to get. + /// \param[out] results The pointers to objects stored in + /// plasma. + /// \return Whether the request was successful. 
+ bool GetObjectsFromPlasma(const std::vector &object_ids, + std::vector> *results); + /////////////////////////////////////////////////////////////////////////////////////// //////////////////// Begin of the override methods of ClusterTaskManager ////////////// // The following methods are defined in node_manager.task.cc instead of node_manager.cc diff --git a/src/ray/raylet/scheduling/cluster_task_manager.cc b/src/ray/raylet/scheduling/cluster_task_manager.cc index a4dbff1f48dd..109833eb59ab 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager.cc @@ -20,7 +20,10 @@ ClusterTaskManager::ClusterTaskManager( NodeInfoGetter get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers) + std::unordered_map> &leased_workers, + std::function &object_ids, + std::vector> *results)> + pin_task_arguments) : self_node_id_(self_node_id), cluster_resource_scheduler_(cluster_resource_scheduler), task_dependency_manager_(task_dependency_manager), @@ -31,7 +34,8 @@ ClusterTaskManager::ClusterTaskManager( RayConfig::instance().max_resource_shapes_per_load_report()), report_worker_backlog_(RayConfig::instance().report_worker_backlog()), worker_pool_(worker_pool), - leased_workers_(leased_workers) {} + leased_workers_(leased_workers), + pin_task_arguments_(pin_task_arguments) {} bool ClusterTaskManager::SchedulePendingTasks() { // Always try to schedule infeasible tasks in case they are now feasible. @@ -144,11 +148,36 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( auto &task = std::get<0>(work); auto &spec = task.GetTaskSpecification(); + std::vector> args; + bool success = true; + const auto &deps = spec.GetDependencyIds(); + if (!deps.empty()) { + // This gets refs to the arguments stored in plasma. The refs should be + // deleted once we no longer need to pin the arguments. 
+ success = pin_task_arguments_(deps, &args); + if (!success) { + RAY_LOG(WARNING) << "Error getting task arguments from plasma store"; + } + for (size_t i = 0; i < deps.size(); i++) { + if (args[i] == nullptr) { + // This can happen if the task's arguments were all local at some + // point, but then at least one was evicted before the task could + // be dispatched to a worker. + RAY_LOG(INFO) + << "Task " << spec.TaskId() << " argument " << deps[i] + << " was evicted before the task could be dispatched. This can happen " + "when there are many objects needed on this node. The task will be " + "scheduled once all of its dependencies are local."; + success = false; + break; + } + } + } + // An argument was evicted since this task was added to the dispatch // queue. Move it back to the waiting queue. The caller is responsible // for notifying us when the task is unblocked again. - if (!spec.GetDependencies().empty() && - !task_dependency_manager_.IsTaskReady(spec.TaskId())) { + if (!success) { waiting_tasks_[spec.TaskId()] = std::move(*work_it); work_it = dispatch_queue.erase(work_it); continue; @@ -177,6 +206,12 @@ void ClusterTaskManager::DispatchScheduledTasksToWorkers( bool worker_leased; bool remove = AttemptDispatchWork(*work_it, worker, &worker_leased); if (worker_leased) { + // Pin the arguments while the lease is active. These will be erased + // once the lease is returned. 
+ num_pinned_task_arguments_ += args.size(); + RAY_CHECK(pinned_task_arguments_.emplace(spec.TaskId(), std::move(args)).second) + << spec.TaskId(); + auto reply = std::get<1>(*work_it); auto callback = std::get<2>(*work_it); Dispatch(worker, leased_workers_, task, reply, callback); @@ -295,6 +330,10 @@ void ClusterTaskManager::TaskFinished(std::shared_ptr worker, Task *task) { RAY_CHECK(worker != nullptr && task != nullptr); *task = worker->GetAssignedTask(); + auto it = pinned_task_arguments_.find(task->GetTaskSpecification().TaskId()); + RAY_CHECK(it != pinned_task_arguments_.end()); + num_pinned_task_arguments_ -= it->second.size(); + pinned_task_arguments_.erase(it); if (worker->GetAllocatedInstances() != nullptr) { ReleaseWorkerResources(worker); } @@ -633,6 +672,8 @@ std::string ClusterTaskManager::DebugStr() const { buffer << "Schedule queue length: " << num_tasks_to_schedule << "\n"; buffer << "Dispatch queue length: " << num_tasks_to_dispatch << "\n"; buffer << "Waiting tasks size: " << waiting_tasks_.size() << "\n"; + buffer << "Number of executing tasks: " << pinned_task_arguments_.size() << "\n"; + buffer << "Number of pinned task arguments: " << num_pinned_task_arguments_ << "\n"; buffer << "cluster_resource_scheduler state: " << cluster_resource_scheduler_->DebugString() << "\n"; buffer << "=================================================="; diff --git a/src/ray/raylet/scheduling/cluster_task_manager.h b/src/ray/raylet/scheduling/cluster_task_manager.h index f632357e10f4..7f2652cebc80 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager.h +++ b/src/ray/raylet/scheduling/cluster_task_manager.h @@ -2,6 +2,7 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" +#include "ray/common/ray_object.h" #include "ray/common/task/task.h" #include "ray/common/task/task_common.h" #include "ray/raylet/dependency_manager.h" @@ -60,7 +61,10 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { NodeInfoGetter 
get_node_info, std::function announce_infeasible_task, WorkerPoolInterface &worker_pool, - std::unordered_map> &leased_workers); + std::unordered_map> &leased_workers, + std::function &object_ids, + std::vector> *results)> + pin_task_arguments); /// (Step 1) Queue tasks and schedule. /// Queue task and schedule. This hanppens when processing the worker lease request. @@ -248,6 +252,22 @@ class ClusterTaskManager : public ClusterTaskManagerInterface { WorkerPoolInterface &worker_pool_; std::unordered_map> &leased_workers_; + /// Callback to get references to task arguments. These will be pinned while + /// the task is running. + std::function &object_ids, + std::vector> *results)> + pin_task_arguments_; + + /// Arguments needed by currently granted lease requests. These should be + /// pinned before the lease is granted to ensure that the arguments are not + /// evicted before the task(s) start running. + std::unordered_map>> + pinned_task_arguments_; + + /// The total number of arguments pinned for running tasks. + /// Used for debug purposes. + size_t num_pinned_task_arguments_ = 0; + /// Determine whether a task should be immediately dispatched, /// or placed on a wait queue. 
/// diff --git a/src/ray/raylet/scheduling/cluster_task_manager_test.cc b/src/ray/raylet/scheduling/cluster_task_manager_test.cc index 776e7fc53030..80a9406da4d5 100644 --- a/src/ray/raylet/scheduling/cluster_task_manager_test.cc +++ b/src/ray/raylet/scheduling/cluster_task_manager_test.cc @@ -85,7 +85,7 @@ Task CreateTask(const std::unordered_map &required_resource std::make_pair(PlacementGroupID::Nil(), -1), true, ""); for (int i = 0; i < num_args; i++) { - ObjectID put_id = ObjectID::FromIndex(TaskID::Nil(), /*index=*/i + 1); + ObjectID put_id = ObjectID::FromIndex(RandomTaskId(), /*index=*/i + 1); spec_builder.AddArg(TaskArgByReference(put_id, rpc::Address())); } @@ -96,20 +96,25 @@ Task CreateTask(const std::unordered_map &required_resource class MockTaskDependencyManager : public TaskDependencyManagerInterface { public: + MockTaskDependencyManager(std::unordered_set &missing_objects) + : missing_objects_(missing_objects) {} + bool RequestTaskDependencies( const TaskID &task_id, const std::vector &required_objects) { RAY_CHECK(subscribed_tasks.insert(task_id).second); - return task_ready_; + for (auto &obj_ref : required_objects) { + if (missing_objects_.count(ObjectRefToId(obj_ref))) { + return false; + } + } + return true; } void RemoveTaskDependencies(const TaskID &task_id) { RAY_CHECK(subscribed_tasks.erase(task_id)); } - bool IsTaskReady(const TaskID &task_id) const { return task_ready_; } - - bool task_ready_ = true; - + std::unordered_set &missing_objects_; std::unordered_set subscribed_tasks; }; @@ -121,16 +126,34 @@ class ClusterTaskManagerTest : public ::testing::Test { is_owner_alive_(true), node_info_calls_(0), announce_infeasible_task_calls_(0), - task_manager_(id_, scheduler_, dependency_manager_, - [this](const WorkerID &worker_id, const NodeID &node_id) { - return is_owner_alive_; - }, - [this](const NodeID &node_id) { - node_info_calls_++; - return node_info_[node_id]; - }, - [this](const Task &task) { announce_infeasible_task_calls_++; }, - 
pool_, leased_workers_) {} + dependency_manager_(missing_objects_), + task_manager_( + id_, scheduler_, dependency_manager_, + [this](const WorkerID &worker_id, const NodeID &node_id) { + return is_owner_alive_; + }, + [this](const NodeID &node_id) { + node_info_calls_++; + return node_info_[node_id]; + }, + [this](const Task &task) { announce_infeasible_task_calls_++; }, pool_, + leased_workers_, + [this](const std::vector &object_ids, + std::vector> *results) { + for (auto &obj_id : object_ids) { + if (missing_objects_.count(obj_id) == 0) { + std::string meta = "metadata"; + auto metadata = const_cast( + reinterpret_cast(meta.data())); + auto meta_buffer = + std::make_shared(metadata, meta.size()); + results->emplace_back(new RayObject(nullptr, meta_buffer, {})); + } else { + results->emplace_back(nullptr); + } + } + return true; + }) {} void SetUp() {} @@ -153,13 +176,25 @@ class ClusterTaskManagerTest : public ::testing::Test { ASSERT_TRUE(task_manager_.tasks_to_dispatch_.empty()); ASSERT_TRUE(task_manager_.waiting_tasks_.empty()); ASSERT_TRUE(task_manager_.infeasible_tasks_.empty()); + ASSERT_TRUE(task_manager_.pinned_task_arguments_.empty()); + ASSERT_EQ(task_manager_.num_pinned_task_arguments_, 0); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); } + void AssertPinnedTaskArgumentsEquals(const TaskID &task_id, size_t num_args_expected) { + ASSERT_EQ(task_manager_.pinned_task_arguments_[task_id].size(), num_args_expected); + size_t num_args = 0; + for (auto &args : task_manager_.pinned_task_arguments_) { + num_args += args.second.size(); + } + ASSERT_EQ(task_manager_.num_pinned_task_arguments_, num_args); + } + NodeID id_; std::shared_ptr scheduler_; MockWorkerPool pool_; std::unordered_map> leased_workers_; + std::unordered_set missing_objects_; bool is_owner_alive_; @@ -203,6 +238,11 @@ TEST_F(ClusterTaskManagerTest, BasicTest) { ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(node_info_calls_, 0); + Task finished_task; + 
task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); + ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), + task.GetTaskSpecification().TaskId()); + AssertNoLeaks(); } @@ -252,8 +292,9 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { }; /* Blocked on dependencies */ - dependency_manager_.task_ready_ = false; - auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); + auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); + auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; + missing_objects_.insert(missing_arg); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -264,36 +305,42 @@ TEST_F(ClusterTaskManagerTest, ResourceTakenWhileResolving) { ASSERT_EQ(pool_.workers.size(), 2); /* This task can run */ - auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}); + auto task2 = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 1); task_manager_.QueueAndScheduleTask(task2, &reply, callback); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* First task is unblocked now, but resources are no longer available */ - dependency_manager_.task_ready_ = true; + missing_objects_.erase(missing_arg); auto id = task.GetTaskSpecification().TaskId(); std::vector unblocked = {id}; task_manager_.TasksUnblocked(unblocked); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); + AssertPinnedTaskArgumentsEquals(task2.GetTaskSpecification().TaskId(), 1); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 1); /* Second task finishes, making space for the original task */ + Task finished_task; + task_manager_.TaskFinished(leased_workers_.begin()->second, 
&finished_task); leased_workers_.clear(); - task_manager_.ReleaseWorkerResources(worker); task_manager_.ScheduleAndDispatchTasks(); ASSERT_TRUE(dependency_manager_.subscribed_tasks.empty()); // Task2 is now done so task can run. + AssertPinnedTaskArgumentsEquals(task.GetTaskSpecification().TaskId(), 2); ASSERT_EQ(num_callbacks, 2); ASSERT_EQ(leased_workers_.size(), 1); ASSERT_EQ(pool_.workers.size(), 0); + + task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); AssertNoLeaks(); } @@ -342,6 +389,12 @@ TEST_F(ClusterTaskManagerTest, TestSpillAfterAssigned) { // The second task was spilled. ASSERT_EQ(spillback_reply.retry_at_raylet_address().raylet_id(), remote_node_id.Binary()); + + Task finished_task; + task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); + ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), + task.GetTaskSpecification().TaskId()); + AssertNoLeaks(); } @@ -385,6 +438,12 @@ TEST_F(ClusterTaskManagerTest, TaskCancellationTest) { ASSERT_FALSE(callback_called); ASSERT_EQ(pool_.workers.size(), 0); ASSERT_EQ(leased_workers_.size(), 1); + + Task finished_task; + task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); + ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), + task.GetTaskSpecification().TaskId()); + AssertNoLeaks(); } @@ -615,6 +674,12 @@ TEST_F(ClusterTaskManagerTest, BacklogReportTest) { task_manager_.FillResourceUsage(data); auto resource_load_by_shape = data->resource_load_by_shape(); ASSERT_EQ(resource_load_by_shape.resource_demands().size(), 0); + + while (!leased_workers_.empty()) { + Task finished_task; + task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); + leased_workers_.erase(leased_workers_.begin()); + } AssertNoLeaks(); } } @@ -785,8 +850,9 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { }; /* Blocked on dependencies */ - dependency_manager_.task_ready_ = false; auto task = CreateTask({{ray::kCPU_ResourceLabel, 5}}, 2); + 
auto missing_arg = task.GetTaskSpecification().GetDependencyIds()[0]; + missing_objects_.insert(missing_arg); std::unordered_set expected_subscribed_tasks = { task.GetTaskSpecification().TaskId()}; task_manager_.QueueAndScheduleTask(task, &reply, callback); @@ -795,7 +861,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task is unblocked now */ - dependency_manager_.task_ready_ = true; + missing_objects_.erase(missing_arg); pool_.workers.clear(); auto id = task.GetTaskSpecification().TaskId(); task_manager_.TasksUnblocked({id}); @@ -804,7 +870,7 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Task argument gets evicted */ - dependency_manager_.task_ready_ = false; + missing_objects_.insert(missing_arg); pool_.PushWorker(std::dynamic_pointer_cast(worker)); task_manager_.ScheduleAndDispatchTasks(); ASSERT_EQ(dependency_manager_.subscribed_tasks, expected_subscribed_tasks); @@ -812,10 +878,16 @@ TEST_F(ClusterTaskManagerTest, ArgumentEvicted) { ASSERT_EQ(leased_workers_.size(), 0); /* Worker available and arguments available */ - dependency_manager_.task_ready_ = true; + missing_objects_.erase(missing_arg); task_manager_.TasksUnblocked({id}); ASSERT_EQ(num_callbacks, 1); ASSERT_EQ(leased_workers_.size(), 1); + + Task finished_task; + task_manager_.TaskFinished(leased_workers_.begin()->second, &finished_task); + ASSERT_EQ(finished_task.GetTaskSpecification().TaskId(), + task.GetTaskSpecification().TaskId()); + AssertNoLeaks(); } diff --git a/src/ray/raylet/test/util.h b/src/ray/raylet/test/util.h index 8527220e3df8..c43a386fba14 100644 --- a/src/ray/raylet/test/util.h +++ b/src/ray/raylet/test/util.h @@ -33,7 +33,7 @@ class MockWorker : public WorkerInterface { void AssignTaskId(const TaskID &task_id) {} - void SetAssignedTask(const Task &assigned_task) {} + void SetAssignedTask(const Task &assigned_task) { task_ = assigned_task; } const std::string IpAddress() const { return 
address_.ip_address(); } @@ -162,11 +162,7 @@ class MockWorker : public WorkerInterface { void SetBundleId(const BundleID &bundle_id) { bundle_id_ = bundle_id; } - Task &GetAssignedTask() { - RAY_CHECK(false) << "Method unused"; - auto *t = new Task(); - return *t; - } + Task &GetAssignedTask() { return task_; } bool IsRegistered() { RAY_CHECK(false) << "Method unused"; @@ -188,6 +184,7 @@ class MockWorker : public WorkerInterface { bool is_detached_actor_; BundleID bundle_id_; bool blocked_ = false; + Task task_; }; } // namespace raylet From 752da83bb7d6bd3f8eb337d2dd56c6eb545ed806 Mon Sep 17 00:00:00 2001 From: Dominic Ming Date: Fri, 29 Jan 2021 15:22:26 +0800 Subject: [PATCH 095/245] [Dashboard] Add the new dashboard code and prompt users to try it (#11667) --- dashboard/client/package-lock.json | 299 ++++++++++++- dashboard/client/package.json | 19 +- dashboard/client/src/App.tsx | 119 +++++- dashboard/client/src/api.ts | 5 +- .../client/src/components/ActorTable.tsx | 253 +++++++++++ dashboard/client/src/components/Loading.tsx | 10 + .../src/components/LogView/LogVirtualView.tsx | 221 ++++++++++ .../client/src/components/LogView/darcula.css | 59 +++ .../client/src/components/LogView/github.css | 96 +++++ .../client/src/components/LogView/index.css | 3 + .../client/src/components/PercentageBar.tsx | 57 +++ .../client/src/components/SearchComponent.tsx | 87 ++++ .../client/src/components/SpeedTools.tsx | 156 +++++++ .../client/src/components/StatesCounter.tsx | 31 ++ .../client/src/components/StatusChip.tsx | 90 ++++ dashboard/client/src/components/TitleCard.tsx | 34 ++ .../client/src/components/WorkerTable.tsx | 299 +++++++++++++ dashboard/client/src/logo.svg | 34 ++ dashboard/client/src/pages/actor/index.tsx | 36 ++ dashboard/client/src/pages/cmd/CMDResult.tsx | 137 ++++++ .../client/src/pages/dashboard/Dashboard.tsx | 6 + dashboard/client/src/pages/error/404.tsx | 32 ++ .../client/src/pages/exception/Loading.tsx | 21 + 
dashboard/client/src/pages/index/Index.tsx | 110 +++++ dashboard/client/src/pages/job/JobDetail.tsx | 246 +++++++++++ .../client/src/pages/job/hook/useJobDetail.ts | 73 ++++ .../client/src/pages/job/hook/useJobList.ts | 68 +++ dashboard/client/src/pages/job/index.tsx | 129 ++++++ dashboard/client/src/pages/layout/index.tsx | 167 ++++++++ dashboard/client/src/pages/log/Logs.tsx | 306 ++++++++++++++ .../client/src/pages/node/NodeDetail.tsx | 287 +++++++++++++ .../src/pages/node/hook/useNodeDetail.ts | 66 +++ .../client/src/pages/node/hook/useNodeList.ts | 74 ++++ dashboard/client/src/pages/node/index.tsx | 392 ++++++++++++++++++ dashboard/client/src/service/actor.ts | 14 + dashboard/client/src/service/cluster.ts | 6 + dashboard/client/src/service/job.ts | 10 + dashboard/client/src/service/log.ts | 35 ++ dashboard/client/src/service/node.ts | 10 + dashboard/client/src/service/util.ts | 52 +++ dashboard/client/src/theme.ts | 61 +++ dashboard/client/src/type/actor.ts | 94 +++++ dashboard/client/src/type/config.d.ts | 22 + dashboard/client/src/type/event.d.ts | 31 ++ dashboard/client/src/type/job.d.ts | 70 ++++ dashboard/client/src/type/node.d.ts | 62 +++ dashboard/client/src/type/raylet.d.ts | 28 ++ dashboard/client/src/type/worker.d.ts | 36 ++ dashboard/client/src/util/converter.ts | 27 ++ dashboard/client/src/util/func.tsx | 28 ++ dashboard/client/src/util/hook.ts | 63 +++ dashboard/client/src/util/localData.ts | 12 + 52 files changed, 4650 insertions(+), 33 deletions(-) create mode 100644 dashboard/client/src/components/ActorTable.tsx create mode 100644 dashboard/client/src/components/Loading.tsx create mode 100644 dashboard/client/src/components/LogView/LogVirtualView.tsx create mode 100644 dashboard/client/src/components/LogView/darcula.css create mode 100644 dashboard/client/src/components/LogView/github.css create mode 100644 dashboard/client/src/components/LogView/index.css create mode 100644 dashboard/client/src/components/PercentageBar.tsx create mode 100644 
dashboard/client/src/components/SearchComponent.tsx create mode 100644 dashboard/client/src/components/SpeedTools.tsx create mode 100644 dashboard/client/src/components/StatesCounter.tsx create mode 100644 dashboard/client/src/components/StatusChip.tsx create mode 100644 dashboard/client/src/components/TitleCard.tsx create mode 100644 dashboard/client/src/components/WorkerTable.tsx create mode 100644 dashboard/client/src/logo.svg create mode 100644 dashboard/client/src/pages/actor/index.tsx create mode 100644 dashboard/client/src/pages/cmd/CMDResult.tsx create mode 100644 dashboard/client/src/pages/error/404.tsx create mode 100644 dashboard/client/src/pages/exception/Loading.tsx create mode 100644 dashboard/client/src/pages/index/Index.tsx create mode 100644 dashboard/client/src/pages/job/JobDetail.tsx create mode 100644 dashboard/client/src/pages/job/hook/useJobDetail.ts create mode 100644 dashboard/client/src/pages/job/hook/useJobList.ts create mode 100644 dashboard/client/src/pages/job/index.tsx create mode 100644 dashboard/client/src/pages/layout/index.tsx create mode 100644 dashboard/client/src/pages/log/Logs.tsx create mode 100644 dashboard/client/src/pages/node/NodeDetail.tsx create mode 100644 dashboard/client/src/pages/node/hook/useNodeDetail.ts create mode 100644 dashboard/client/src/pages/node/hook/useNodeList.ts create mode 100644 dashboard/client/src/pages/node/index.tsx create mode 100644 dashboard/client/src/service/actor.ts create mode 100644 dashboard/client/src/service/cluster.ts create mode 100644 dashboard/client/src/service/job.ts create mode 100644 dashboard/client/src/service/log.ts create mode 100644 dashboard/client/src/service/node.ts create mode 100644 dashboard/client/src/service/util.ts create mode 100644 dashboard/client/src/theme.ts create mode 100644 dashboard/client/src/type/actor.ts create mode 100644 dashboard/client/src/type/config.d.ts create mode 100644 dashboard/client/src/type/event.d.ts create mode 100644 
dashboard/client/src/type/job.d.ts create mode 100644 dashboard/client/src/type/node.d.ts create mode 100644 dashboard/client/src/type/raylet.d.ts create mode 100644 dashboard/client/src/type/worker.d.ts create mode 100644 dashboard/client/src/util/converter.ts create mode 100644 dashboard/client/src/util/func.tsx create mode 100644 dashboard/client/src/util/hook.ts create mode 100644 dashboard/client/src/util/localData.ts diff --git a/dashboard/client/package-lock.json b/dashboard/client/package-lock.json index 8b66129425d1..eccde1558ae4 100644 --- a/dashboard/client/package-lock.json +++ b/dashboard/client/package-lock.json @@ -1,29 +1,41 @@ { - "name": "client", - "version": "0.1.0", + "name": "ray-dashboard-client", + "version": "1.0.0", "lockfileVersion": 2, "requires": true, "packages": { "": { - "version": "0.1.0", + "name": "ray-dashboard-client", + "version": "1.0.0", "dependencies": { "@material-ui/core": "4.11.0", "@material-ui/icons": "^4.9.1", "@material-ui/lab": "^4.0.0-alpha.56", + "@material-ui/pickers": "^3.2.10", "@reduxjs/toolkit": "^1.3.1", "@types/classnames": "^2.2.10", "@types/jest": "25.1.4", + "@types/lodash": "^4.14.161", + "@types/lowlight": "^0.0.1", "@types/node": "13.9.5", + "@types/numeral": "^0.0.26", "@types/react": "16.9.26", "@types/react-dom": "16.9.5", "@types/react-redux": "^7.1.7", "@types/react-router-dom": "^5.1.3", + "@types/react-window": "^1.8.2", + "axios": "^0.21.1", "classnames": "^2.2.6", + "dayjs": "^1.9.4", + "lodash": "^4.17.20", + "lowlight": "^1.14.0", + "numeral": "^2.0.6", "react": "^16.13.1", "react-dom": "^16.13.1", "react-redux": "^7.2.0", "react-router-dom": "^5.1.2", "react-scripts": "^3.4.3", + "react-window": "^1.8.5", "typeface-roboto": "0.0.75", "typescript": "3.8.3", "use-debounce": "^3.4.3" @@ -1320,6 +1332,11 @@ "resolved": "https://registry.npmjs.org/@csstools/normalize.css/-/normalize.css-10.1.0.tgz", "integrity": 
"sha512-ij4wRiunFfaJxjB0BdrYHIH8FxBJpOwNPhhAcunlmPdXudL1WQV1qoP9un6JsEBAgQH+7UXyyjh0g7jTxXK6tg==" }, + "node_modules/@date-io/core": { + "version": "1.3.13", + "resolved": "https://registry.npmjs.org/@date-io/core/-/core-1.3.13.tgz", + "integrity": "sha512-AlEKV7TxjeK+jxWVKcCFrfYAk8spX9aCyiToFIiLPtfQbsjmRGLIhb5VZgptQcJdHtLXo7+m0DuurwFgUToQuA==" + }, "node_modules/@emotion/hash": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.8.0.tgz", @@ -1859,6 +1876,26 @@ "node": ">=8.0.0" } }, + "node_modules/@material-ui/pickers": { + "version": "3.2.10", + "resolved": "https://registry.npmjs.org/@material-ui/pickers/-/pickers-3.2.10.tgz", + "integrity": "sha512-B8G6Obn5S3RCl7hwahkQj9sKUapwXWFjiaz/Bsw1fhYFdNMnDUolRiWQSoKPb1/oKe37Dtfszoywi1Ynbo3y8w==", + "dependencies": { + "@babel/runtime": "^7.6.0", + "@date-io/core": "1.x", + "@types/styled-jsx": "^2.2.8", + "clsx": "^1.0.2", + "react-transition-group": "^4.0.0", + "rifm": "^0.7.0" + }, + "peerDependencies": { + "@date-io/core": "^1.3.6", + "@material-ui/core": "^4.0.0", + "prop-types": "^15.6.0", + "react": "^16.8.4", + "react-dom": "^16.8.4" + } + }, "node_modules/@material-ui/styles": { "version": "4.10.0", "resolved": "https://registry.npmjs.org/@material-ui/styles/-/styles-4.10.0.tgz", @@ -2205,6 +2242,16 @@ "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.5.tgz", "integrity": "sha512-7+2BITlgjgDhH0vvwZU/HZJVyk+2XUlvxXe8dFMedNX/aMkaOq++rMAFXc0tM7ij15QaWlbdQASBR9dihi+bDQ==" }, + "node_modules/@types/lodash": { + "version": "4.14.168", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", + "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==" + }, + "node_modules/@types/lowlight": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/@types/lowlight/-/lowlight-0.0.1.tgz", + "integrity": 
"sha512-yPpbpV1KfpFOZ0ZZbsgwWumraiAKoX7/Ng75Ah//w+ZBt4j0xwrQ2aHSlk2kPzQVK4LiPbNFE1LjC00IL4nl/A==" + }, "node_modules/@types/minimatch": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/minimatch/-/minimatch-3.0.3.tgz", @@ -2215,6 +2262,11 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.9.5.tgz", "integrity": "sha512-hkzMMD3xu6BrJpGVLeQ3htQQNAcOrJjX7WFmtK8zWQpz2UJf13LCFF2ALA7c9OVdvc2vQJeDdjfR35M0sBCxvw==" }, + "node_modules/@types/numeral": { + "version": "0.0.26", + "resolved": "https://registry.npmjs.org/@types/numeral/-/numeral-0.0.26.tgz", + "integrity": "sha512-DwCsRqeOWopdEsm5KLTxKVKDSDoj+pzZD1vlwu1GQJ6IF3RhjuleYlRwyRH6MJLGaf3v8wFTnC6wo3yYfz0bnA==" + }, "node_modules/@types/parse-json": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.0.tgz", @@ -2285,11 +2337,27 @@ "@types/react": "*" } }, + "node_modules/@types/react-window": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/@types/react-window/-/react-window-1.8.2.tgz", + "integrity": "sha512-gP1xam68Wc4ZTAee++zx6pTdDAH08rAkQrWm4B4F/y6hhmlT9Mgx2q8lTCXnrPHXsr15XjRN9+K2DLKcz44qEQ==", + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/stack-utils": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-1.0.1.tgz", "integrity": "sha512-l42BggppR6zLmpfU6fq9HEa2oGPEI8yrSPL3GITjfRInppYFahObbIQOQK3UGxEnyQpltZLaPe75046NOZQikw==" }, + "node_modules/@types/styled-jsx": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@types/styled-jsx/-/styled-jsx-2.2.8.tgz", + "integrity": "sha512-Yjye9VwMdYeXfS71ihueWRSxrruuXTwKCbzue4+5b2rjnQ//AtyM7myZ1BEhNhBQ/nL/RE7bdToUoLln2miKvg==", + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/yargs": { "version": "13.0.11", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-13.0.11.tgz", @@ -3007,6 +3075,14 @@ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.10.1.tgz", 
"integrity": "sha512-zg7Hz2k5lI8kb7U32998pRRFin7zJlkfezGJjUc2heaD4Pw2wObakCDVzkKztTm/Ln7eiVvYsjqak0Ed4LkMDA==" }, + "node_modules/axios": { + "version": "0.21.1", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.1.tgz", + "integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==", + "dependencies": { + "follow-redirects": "^1.10.0" + } + }, "node_modules/axobject-query": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", @@ -5158,6 +5234,11 @@ "webidl-conversions": "^4.0.2" } }, + "node_modules/dayjs": { + "version": "1.10.4", + "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.10.4.tgz", + "integrity": "sha512-RI/Hh4kqRc1UKLOAf/T5zdMMX5DQIlDxwUe3wSyMMnEbGunnpENCdbUgM+dW7kXidZqCttBrmw7BhN4TMddkCw==" + }, "node_modules/debug": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", @@ -6985,6 +7066,18 @@ "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=" }, + "node_modules/fault": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", + "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", + "dependencies": { + "format": "^0.2.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/faye-websocket": { "version": "0.10.0", "resolved": "https://registry.npmjs.org/faye-websocket/-/faye-websocket-0.10.0.tgz", @@ -7318,6 +7411,14 @@ "node": ">= 0.12" } }, + "node_modules/format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha1-1hcBB+nv3E7TDJ3DkBbflCtctYs=", + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/forwarded": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", 
@@ -7804,6 +7905,14 @@ "resolved": "https://registry.npmjs.org/hex-color-regex/-/hex-color-regex-1.1.0.tgz", "integrity": "sha512-l9sfDFsuqtOqKDsQdqrMRk0U85RZc0RtOR9yPI7mRVOa4FsR/BVnZ0shmQRM96Ji99kYZP/7hn1cedc1+ApsTQ==" }, + "node_modules/highlight.js": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.5.0.tgz", + "integrity": "sha512-xTmvd9HiIHR6L53TMC7TKolEj65zG1XU+Onr8oi86mYa+nLcIbxTTWkpW7CsEwv/vK7u1zb8alZIMLDqqN6KTw==", + "engines": { + "node": "*" + } + }, "node_modules/history": { "version": "4.10.1", "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", @@ -8191,12 +8300,9 @@ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, "node_modules/ini": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", - "integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==", - "engines": { - "node": "*" - } + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" }, "node_modules/inquirer": { "version": "7.0.4", @@ -11001,6 +11107,19 @@ "tslib": "^1.10.0" } }, + "node_modules/lowlight": { + "version": "1.18.0", + "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.18.0.tgz", + "integrity": "sha512-Zlc3GqclU71HRw5fTOy00zz5EOlqAdKMYhOFIO8ay4SQEDQgFuhR8JNwDIzAGMLoqTsWxe0elUNmq5o2USRAzw==", + "dependencies": { + "fault": "^1.0.0", + "highlight.js": "~10.5.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -11097,6 +11216,11 @@ "node": ">= 0.6" } }, + "node_modules/memoize-one": { + "version": "5.1.1", + "resolved": 
"https://registry.npmjs.org/memoize-one/-/memoize-one-5.1.1.tgz", + "integrity": "sha512-HKeeBpWvqiVJD57ZUAsJNm71eHTykffzcLZVYWiVfQeI1rJtuEaS7hQiEpWfVVk18donPwJEcFKIkCmPJNOhHA==" + }, "node_modules/memory-fs": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/memory-fs/-/memory-fs-0.4.1.tgz", @@ -11737,6 +11861,14 @@ "resolved": "https://registry.npmjs.org/num2fraction/-/num2fraction-1.2.2.tgz", "integrity": "sha1-b2gragJ6Tp3fpFZM0lidHU5mnt4=" }, + "node_modules/numeral": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/numeral/-/numeral-2.0.6.tgz", + "integrity": "sha1-StCAk21EPCVhrtnyGX7//iX05QY=", + "engines": { + "node": "*" + } + }, "node_modules/nwsapi": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.0.tgz", @@ -14371,6 +14503,22 @@ "prop-types": "^15.6.2" } }, + "node_modules/react-window": { + "version": "1.8.6", + "resolved": "https://registry.npmjs.org/react-window/-/react-window-1.8.6.tgz", + "integrity": "sha512-8VwEEYyjz6DCnGBsd+MgkD0KJ2/OXFULyDtorIiTz+QzwoP94tBoA7CnbtyXMm+cCeAUER5KJcPtWl9cpKbOBg==", + "dependencies": { + "@babel/runtime": "^7.0.0", + "memoize-one": ">=3.1.1 <6" + }, + "engines": { + "node": ">8.0.0" + }, + "peerDependencies": { + "react": "^15.0.0 || ^16.0.0 || ^17.0.0", + "react-dom": "^15.0.0 || ^16.0.0 || ^17.0.0" + } + }, "node_modules/read-pkg": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", @@ -14961,6 +15109,17 @@ "resolved": "https://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", "integrity": "sha1-QzdOLiyglosO8VI0YLfXMP8i7rM=" }, + "node_modules/rifm": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/rifm/-/rifm-0.7.0.tgz", + "integrity": "sha512-DSOJTWHD67860I5ojetXdEQRIBvF6YcpNe53j0vn1vp9EUb9N80EiZTxgP+FkDKorWC8PZw052kTF4C1GOivCQ==", + "dependencies": { + "@babel/runtime": "^7.3.1" + }, + "peerDependencies": { + "react": ">=16.8" + } + }, "node_modules/rimraf": { "version": "2.6.3", 
"resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", @@ -19268,6 +19427,11 @@ "resolved": "https://registry.npmjs.org/@csstools/normalize.css/-/normalize.css-10.1.0.tgz", "integrity": "sha512-ij4wRiunFfaJxjB0BdrYHIH8FxBJpOwNPhhAcunlmPdXudL1WQV1qoP9un6JsEBAgQH+7UXyyjh0g7jTxXK6tg==" }, + "@date-io/core": { + "version": "1.3.13", + "resolved": "https://registry.npmjs.org/@date-io/core/-/core-1.3.13.tgz", + "integrity": "sha512-AlEKV7TxjeK+jxWVKcCFrfYAk8spX9aCyiToFIiLPtfQbsjmRGLIhb5VZgptQcJdHtLXo7+m0DuurwFgUToQuA==" + }, "@emotion/hash": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.8.0.tgz", @@ -19715,6 +19879,19 @@ "react-is": "^16.8.0" } }, + "@material-ui/pickers": { + "version": "3.2.10", + "resolved": "https://registry.npmjs.org/@material-ui/pickers/-/pickers-3.2.10.tgz", + "integrity": "sha512-B8G6Obn5S3RCl7hwahkQj9sKUapwXWFjiaz/Bsw1fhYFdNMnDUolRiWQSoKPb1/oKe37Dtfszoywi1Ynbo3y8w==", + "requires": { + "@babel/runtime": "^7.6.0", + "@date-io/core": "1.x", + "@types/styled-jsx": "^2.2.8", + "clsx": "^1.0.2", + "react-transition-group": "^4.0.0", + "rifm": "^0.7.0" + } + }, "@material-ui/styles": { "version": "4.10.0", "resolved": "https://registry.npmjs.org/@material-ui/styles/-/styles-4.10.0.tgz", @@ -20004,6 +20181,16 @@ "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.5.tgz", "integrity": "sha512-7+2BITlgjgDhH0vvwZU/HZJVyk+2XUlvxXe8dFMedNX/aMkaOq++rMAFXc0tM7ij15QaWlbdQASBR9dihi+bDQ==" }, + "@types/lodash": { + "version": "4.14.168", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", + "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==" + }, + "@types/lowlight": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/@types/lowlight/-/lowlight-0.0.1.tgz", + "integrity": "sha512-yPpbpV1KfpFOZ0ZZbsgwWumraiAKoX7/Ng75Ah//w+ZBt4j0xwrQ2aHSlk2kPzQVK4LiPbNFE1LjC00IL4nl/A==" + }, 
"@types/minimatch": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/minimatch/-/minimatch-3.0.3.tgz", @@ -20014,6 +20201,11 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.9.5.tgz", "integrity": "sha512-hkzMMD3xu6BrJpGVLeQ3htQQNAcOrJjX7WFmtK8zWQpz2UJf13LCFF2ALA7c9OVdvc2vQJeDdjfR35M0sBCxvw==" }, + "@types/numeral": { + "version": "0.0.26", + "resolved": "https://registry.npmjs.org/@types/numeral/-/numeral-0.0.26.tgz", + "integrity": "sha512-DwCsRqeOWopdEsm5KLTxKVKDSDoj+pzZD1vlwu1GQJ6IF3RhjuleYlRwyRH6MJLGaf3v8wFTnC6wo3yYfz0bnA==" + }, "@types/parse-json": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.0.tgz", @@ -20084,11 +20276,27 @@ "@types/react": "*" } }, + "@types/react-window": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/@types/react-window/-/react-window-1.8.2.tgz", + "integrity": "sha512-gP1xam68Wc4ZTAee++zx6pTdDAH08rAkQrWm4B4F/y6hhmlT9Mgx2q8lTCXnrPHXsr15XjRN9+K2DLKcz44qEQ==", + "requires": { + "@types/react": "*" + } + }, "@types/stack-utils": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-1.0.1.tgz", "integrity": "sha512-l42BggppR6zLmpfU6fq9HEa2oGPEI8yrSPL3GITjfRInppYFahObbIQOQK3UGxEnyQpltZLaPe75046NOZQikw==" }, + "@types/styled-jsx": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@types/styled-jsx/-/styled-jsx-2.2.8.tgz", + "integrity": "sha512-Yjye9VwMdYeXfS71ihueWRSxrruuXTwKCbzue4+5b2rjnQ//AtyM7myZ1BEhNhBQ/nL/RE7bdToUoLln2miKvg==", + "requires": { + "@types/react": "*" + } + }, "@types/yargs": { "version": "13.0.11", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-13.0.11.tgz", @@ -20693,6 +20901,14 @@ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.10.1.tgz", "integrity": "sha512-zg7Hz2k5lI8kb7U32998pRRFin7zJlkfezGJjUc2heaD4Pw2wObakCDVzkKztTm/Ln7eiVvYsjqak0Ed4LkMDA==" }, + "axios": { + "version": "0.21.1", + "resolved": 
"https://registry.npmjs.org/axios/-/axios-0.21.1.tgz", + "integrity": "sha512-dKQiRHxGD9PPRIUNIWvZhPTPpl1rf/OxTYKsqKUDjBwYylTvV7SjSHJb9ratfyzM6wCdLCOYLzs73qpg5c4iGA==", + "requires": { + "follow-redirects": "^1.10.0" + } + }, "axobject-query": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-2.2.0.tgz", @@ -22520,6 +22736,11 @@ } } }, + "dayjs": { + "version": "1.10.4", + "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.10.4.tgz", + "integrity": "sha512-RI/Hh4kqRc1UKLOAf/T5zdMMX5DQIlDxwUe3wSyMMnEbGunnpENCdbUgM+dW7kXidZqCttBrmw7BhN4TMddkCw==" + }, "debug": { "version": "4.3.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", @@ -24038,6 +24259,14 @@ "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=" }, + "fault": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/fault/-/fault-1.0.4.tgz", + "integrity": "sha512-CJ0HCB5tL5fYTEA7ToAq5+kTwd++Borf1/bifxd9iT70QcXr4MRrO3Llf8Ifs70q+SJcGHFtnIE/Nw6giCtECA==", + "requires": { + "format": "^0.2.0" + } + }, "faye-websocket": { "version": "0.10.0", "resolved": "https://registry.npmjs.org/faye-websocket/-/faye-websocket-0.10.0.tgz", @@ -24312,6 +24541,11 @@ "mime-types": "^2.1.12" } }, + "format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha1-1hcBB+nv3E7TDJ3DkBbflCtctYs=" + }, "forwarded": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz", @@ -24712,6 +24946,11 @@ "resolved": "https://registry.npmjs.org/hex-color-regex/-/hex-color-regex-1.1.0.tgz", "integrity": "sha512-l9sfDFsuqtOqKDsQdqrMRk0U85RZc0RtOR9yPI7mRVOa4FsR/BVnZ0shmQRM96Ji99kYZP/7hn1cedc1+ApsTQ==" }, + "highlight.js": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.5.0.tgz", + "integrity": 
"sha512-xTmvd9HiIHR6L53TMC7TKolEj65zG1XU+Onr8oi86mYa+nLcIbxTTWkpW7CsEwv/vK7u1zb8alZIMLDqqN6KTw==" + }, "history": { "version": "4.10.1", "resolved": "https://registry.npmjs.org/history/-/history-4.10.1.tgz", @@ -25045,9 +25284,9 @@ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" }, "ini": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", - "integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==" + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" }, "inquirer": { "version": "7.0.4", @@ -27299,6 +27538,15 @@ "tslib": "^1.10.0" } }, + "lowlight": { + "version": "1.18.0", + "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-1.18.0.tgz", + "integrity": "sha512-Zlc3GqclU71HRw5fTOy00zz5EOlqAdKMYhOFIO8ay4SQEDQgFuhR8JNwDIzAGMLoqTsWxe0elUNmq5o2USRAzw==", + "requires": { + "fault": "^1.0.0", + "highlight.js": "~10.5.0" + } + }, "lru-cache": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", @@ -27381,6 +27629,11 @@ "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g=" }, + "memoize-one": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/memoize-one/-/memoize-one-5.1.1.tgz", + "integrity": "sha512-HKeeBpWvqiVJD57ZUAsJNm71eHTykffzcLZVYWiVfQeI1rJtuEaS7hQiEpWfVVk18donPwJEcFKIkCmPJNOhHA==" + }, "memory-fs": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/memory-fs/-/memory-fs-0.4.1.tgz", @@ -27933,6 +28186,11 @@ "resolved": "https://registry.npmjs.org/num2fraction/-/num2fraction-1.2.2.tgz", "integrity": "sha1-b2gragJ6Tp3fpFZM0lidHU5mnt4=" }, + "numeral": { + "version": "2.0.6", + "resolved": 
"https://registry.npmjs.org/numeral/-/numeral-2.0.6.tgz", + "integrity": "sha1-StCAk21EPCVhrtnyGX7//iX05QY=" + }, "nwsapi": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.0.tgz", @@ -30091,6 +30349,15 @@ "prop-types": "^15.6.2" } }, + "react-window": { + "version": "1.8.6", + "resolved": "https://registry.npmjs.org/react-window/-/react-window-1.8.6.tgz", + "integrity": "sha512-8VwEEYyjz6DCnGBsd+MgkD0KJ2/OXFULyDtorIiTz+QzwoP94tBoA7CnbtyXMm+cCeAUER5KJcPtWl9cpKbOBg==", + "requires": { + "@babel/runtime": "^7.0.0", + "memoize-one": ">=3.1.1 <6" + } + }, "read-pkg": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", @@ -30574,6 +30841,14 @@ "resolved": "https://registry.npmjs.org/rgba-regex/-/rgba-regex-1.0.0.tgz", "integrity": "sha1-QzdOLiyglosO8VI0YLfXMP8i7rM=" }, + "rifm": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/rifm/-/rifm-0.7.0.tgz", + "integrity": "sha512-DSOJTWHD67860I5ojetXdEQRIBvF6YcpNe53j0vn1vp9EUb9N80EiZTxgP+FkDKorWC8PZw052kTF4C1GOivCQ==", + "requires": { + "@babel/runtime": "^7.3.1" + } + }, "rimraf": { "version": "2.6.3", "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", diff --git a/dashboard/client/package.json b/dashboard/client/package.json index 3ac262ef70d5..535d3b48362f 100644 --- a/dashboard/client/package.json +++ b/dashboard/client/package.json @@ -1,25 +1,36 @@ { - "name": "client", - "version": "0.1.0", + "name": "ray-dashboard-client", + "version": "1.0.0", "private": true, "dependencies": { "@material-ui/core": "4.11.0", "@material-ui/icons": "^4.9.1", "@material-ui/lab": "^4.0.0-alpha.56", + "@material-ui/pickers": "^3.2.10", "@reduxjs/toolkit": "^1.3.1", "@types/classnames": "^2.2.10", "@types/jest": "25.1.4", + "@types/lodash": "^4.14.161", + "@types/lowlight": "^0.0.1", "@types/node": "13.9.5", + "@types/numeral": "^0.0.26", "@types/react": "16.9.26", "@types/react-dom": "16.9.5", "@types/react-redux": "^7.1.7", 
"@types/react-router-dom": "^5.1.3", + "@types/react-window": "^1.8.2", + "axios": "^0.21.1", "classnames": "^2.2.6", + "dayjs": "^1.9.4", + "lodash": "^4.17.20", + "lowlight": "^1.14.0", + "numeral": "^2.0.6", "react": "^16.13.1", "react-dom": "^16.13.1", "react-redux": "^7.2.0", "react-router-dom": "^5.1.2", "react-scripts": "^3.4.3", + "react-window": "^1.8.5", "typeface-roboto": "0.0.75", "typescript": "3.8.3", "use-debounce": "^3.4.3" @@ -40,6 +51,7 @@ "eslint": "./node_modules/.bin/eslint \"src/**\"" }, "eslintConfig": { + "ignorePatterns": ["*.svg", "*.css"], "extends": [ "plugin:import/warnings", "react-app" @@ -110,5 +122,6 @@ "last 1 firefox version", "last 1 safari version" ] - } + }, + "proxy": "http://localhost:8265" } diff --git a/dashboard/client/src/App.tsx b/dashboard/client/src/App.tsx index c0bdae6a13dd..be2a8fc0beb6 100644 --- a/dashboard/client/src/App.tsx +++ b/dashboard/client/src/App.tsx @@ -1,21 +1,112 @@ import { CssBaseline } from "@material-ui/core"; -import React from "react"; +import { ThemeProvider } from "@material-ui/core/styles"; +import React, { Suspense, useEffect, useState } from "react"; import { Provider } from "react-redux"; -import { BrowserRouter, Route } from "react-router-dom"; +import { HashRouter, Route, Switch } from "react-router-dom"; import Dashboard from "./pages/dashboard/Dashboard"; +import Loading from "./pages/exception/Loading"; +import { getNodeList } from "./service/node"; import { store } from "./store"; +import { darkTheme, lightTheme } from "./theme"; +import { getLocalStorage, setLocalStorage } from "./util/localData"; -class App extends React.Component { - render() { - return ( - - - - - - - ); - } -} +// lazy loading fro prevent loading too much code at once +const Actors = React.lazy(() => import("./pages/actor")); +const CMDResult = React.lazy(() => import("./pages/cmd/CMDResult")); +const Index = React.lazy(() => import("./pages/index/Index")); +const Job = React.lazy(() => import("./pages/job")); 
+const JobDetail = React.lazy(() => import("./pages/job/JobDetail")); +const BasicLayout = React.lazy(() => import("./pages/layout")); +const Logs = React.lazy(() => import("./pages/log/Logs")); +const Node = React.lazy(() => import("./pages/node")); +const NodeDetail = React.lazy(() => import("./pages/node/NodeDetail")); + +// key to store theme in local storage +const RAY_DASHBOARD_THEME_KEY = "ray-dashboard-theme"; + +// a global map for relations +export const GlobalContext = React.createContext({ + nodeMap: {} as { [key: string]: string }, + ipLogMap: {} as { [key: string]: string }, + namespaceMap: {} as { [key: string]: string[] }, +}); + +export const getDefaultTheme = () => + getLocalStorage(RAY_DASHBOARD_THEME_KEY) || "light"; +export const setLocalTheme = (theme: string) => + setLocalStorage(RAY_DASHBOARD_THEME_KEY, theme); + +const App = () => { + const [theme, _setTheme] = useState(getDefaultTheme()); + const [context, setContext] = useState<{ + nodeMap: { [key: string]: string }; + ipLogMap: { [key: string]: string }; + namespaceMap: { [key: string]: string[] }; + }>({ nodeMap: {}, ipLogMap: {}, namespaceMap: {} }); + const getTheme = (name: string) => { + switch (name) { + case "dark": + return darkTheme; + case "light": + default: + return lightTheme; + } + }; + const setTheme = (name: string) => { + setLocalTheme(name); + _setTheme(name); + }; + useEffect(() => { + getNodeList().then((res) => { + if (res?.data?.data?.summary) { + const nodeMap = {} as { [key: string]: string }; + const ipLogMap = {} as { [key: string]: string }; + res.data.data.summary.forEach(({ hostname, raylet, ip, logUrl }) => { + nodeMap[hostname] = raylet.nodeId; + ipLogMap[ip] = logUrl; + }); + setContext({ nodeMap, ipLogMap, namespaceMap: {} }); + } + }); + }, []); + + return ( + + + + + + + + + ( + + + + + + ( + + )} + exact + path="/log/:host?/:path?" 
+ /> + + + + + + )} + /> + + + + + + + ); +}; export default App; diff --git a/dashboard/client/src/api.ts b/dashboard/client/src/api.ts index e2ff52464e84..b7f4f5f41477 100644 --- a/dashboard/client/src/api.ts +++ b/dashboard/client/src/api.ts @@ -1,7 +1,4 @@ -const base = - process.env.NODE_ENV === "development" - ? "http://localhost:8265" - : window.location.origin; +const base = window.location.origin; type APIResponse = { result: boolean; diff --git a/dashboard/client/src/components/ActorTable.tsx b/dashboard/client/src/components/ActorTable.tsx new file mode 100644 index 000000000000..b90e5cf34a68 --- /dev/null +++ b/dashboard/client/src/components/ActorTable.tsx @@ -0,0 +1,253 @@ +import { + InputAdornment, + Table, + TableBody, + TableCell, + TableHead, + TableRow, + TextField, + TextFieldProps, +} from "@material-ui/core"; +import { orange } from "@material-ui/core/colors"; +import { SearchOutlined } from "@material-ui/icons"; +import Autocomplete from "@material-ui/lab/Autocomplete"; +import Pagination from "@material-ui/lab/Pagination"; +import React, { useContext, useState } from "react"; +import { Link } from "react-router-dom"; +import { GlobalContext } from "../App"; +import { Actor } from "../type/actor"; +import { Worker } from "../type/worker"; +import { longTextCut } from "../util/func"; +import { useFilter } from "../util/hook"; +import StateCounter from "./StatesCounter"; +import { StatusChip } from "./StatusChip"; +import RayletWorkerTable, { ExpandableTableRow } from "./WorkerTable"; + +const ActorTable = ({ + actors = {}, + workers = [], +}: { + actors: { [actorId: string]: Actor }; + workers?: Worker[]; +}) => { + const [pageNo, setPageNo] = useState(1); + const { changeFilter, filterFunc } = useFilter(); + const [pageSize, setPageSize] = useState(10); + const { ipLogMap } = useContext(GlobalContext); + const actorList = Object.values(actors || {}) + .map((e) => ({ + ...e, + functionDesc: Object.values( + 
e.taskSpec?.functionDescriptor?.javaFunctionDescriptor || + e.taskSpec?.functionDescriptor?.pythonFunctionDescriptor || + {}, + ).join(" "), + })) + .filter(filterFunc); + const list = actorList.slice((pageNo - 1) * pageSize, pageNo * pageSize); + + return ( + +
+ e.state)), + )} + onInputChange={(_: any, value: string) => { + changeFilter("state", value.trim()); + }} + renderInput={(params: TextFieldProps) => ( + + )} + /> + e.address?.ipAddress)), + )} + onInputChange={(_: any, value: string) => { + changeFilter("address.ipAddress", value.trim()); + }} + renderInput={(params: TextFieldProps) => ( + + )} + /> + { + changeFilter("pid", value.trim()); + }, + endAdornment: ( + + + + ), + }} + /> + { + changeFilter("functionDesc", value.trim()); + }, + endAdornment: ( + + + + ), + }} + /> + { + changeFilter("name", value.trim()); + }, + endAdornment: ( + + + + ), + }} + /> + { + changeFilter("actorId", value.trim()); + }, + endAdornment: ( + + + + ), + }} + /> + { + setPageSize(Math.min(Number(value), 500) || 10); + }, + }} + /> +
+
+
+ setPageNo(num)} + count={Math.ceil(actorList.length / pageSize)} + /> +
+
+ +
+
+ + + + {[ + "", + "ID(Num Restarts)", + "Name", + "Task Func Desc", + "Job Id", + "Pid", + "IP", + "Port", + "State", + "Log", + ].map((col) => ( + + {col} + + ))} + + + + {list.map( + ({ + actorId, + functionDesc, + jobId, + pid, + address, + state, + name, + numRestarts, + }) => ( + + e.pid === pid && + address.ipAddress === e.coreWorkerStats[0].ipAddress, + ).length + } + expandComponent={ + + e.pid === pid && + address.ipAddress === e.coreWorkerStats[0].ipAddress, + )} + mini + /> + } + key={actorId} + > + 0 ? orange[500] : "inherit", + }} + > + {actorId}({numRestarts}) + + {name} + + {longTextCut(functionDesc, 60)} + + {jobId} + {pid} + {address?.ipAddress} + {address?.port} + + + + + {ipLogMap[address?.ipAddress] && ( + + Log + + )} + + + ), + )} + +
+
+ ); +}; + +export default ActorTable; diff --git a/dashboard/client/src/components/Loading.tsx b/dashboard/client/src/components/Loading.tsx new file mode 100644 index 000000000000..6c1cb1e8f0ea --- /dev/null +++ b/dashboard/client/src/components/Loading.tsx @@ -0,0 +1,10 @@ +import { Backdrop, CircularProgress } from "@material-ui/core"; +import React from "react"; + +const Loading = ({ loading }: { loading: boolean }) => ( + + + +); + +export default Loading; diff --git a/dashboard/client/src/components/LogView/LogVirtualView.tsx b/dashboard/client/src/components/LogView/LogVirtualView.tsx new file mode 100644 index 000000000000..2046989c2702 --- /dev/null +++ b/dashboard/client/src/components/LogView/LogVirtualView.tsx @@ -0,0 +1,221 @@ +import dayjs from "dayjs"; +import low from "lowlight"; +import React, { + CSSProperties, + MutableRefObject, + useEffect, + useRef, + useState, +} from "react"; +import { FixedSizeList as List } from "react-window"; +import "./darcula.css"; +import "./github.css"; +import "./index.css"; +import { getDefaultTheme } from "../../App"; + +const uniqueKeySelector = () => Math.random().toString(16).slice(-8); + +const timeReg = /(?:(?!0000)[0-9]{4}-(?:(?:0[1-9]|1[0-2])-(?:0[1-9]|1[0-9]|2[0-8])|(?:0[13-9]|1[0-2])-(?:29|30)|(?:0[13578]|1[02])-31)|(?:[0-9]{2}(?:0[48]|[2468][048]|[13579][26])|(?:0[48]|[2468][048]|[13579][26])00)-02-29)\s+([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9]/; + +const value2react = ( + { type, tagName, properties, children, value = "" }: any, + key: string, + keywords: string = "", +) => { + switch (type) { + case "element": + return React.createElement( + tagName, + { + className: properties.className[0], + key: `${key}line${uniqueKeySelector()}`, + }, + children.map((e: any, i: number) => + value2react(e, `${key}-${i}`, keywords), + ), + ); + case "text": + if (keywords && value.includes(keywords)) { + const afterChildren = []; + const vals = value.split(keywords); + let tmp = vals.shift(); + if (!tmp) { + return 
React.createElement( + "span", + { className: "find-kws" }, + keywords, + ); + } + while (typeof tmp === "string") { + if (tmp !== "") { + afterChildren.push(tmp); + } else { + afterChildren.push( + React.createElement("span", { className: "find-kws" }, keywords), + ); + } + + tmp = vals.shift(); + if (tmp) { + afterChildren.push( + React.createElement("span", { className: "find-kws" }, keywords), + ); + } + } + return afterChildren; + } + return value; + default: + return []; + } +}; + +export type LogVirtualViewProps = { + content: string; + width?: number; + height?: number; + fontSize?: number; + theme?: "light" | "dark"; + language?: string; + focusLine?: number; + keywords?: string; + style?: { [key: string]: string | number }; + listRef?: MutableRefObject; + onScrollBottom?: (event: Event) => void; + revert?: boolean; + startTime?: string; + endTime?: string; +}; + +const LogVirtualView: React.FC = ({ + content, + width = "100%", + height, + fontSize = 12, + theme = getDefaultTheme(), + keywords = "", + language = "dos", + focusLine = 1, + style = {}, + listRef, + onScrollBottom, + revert = false, + startTime, + endTime, +}) => { + const [logs, setLogs] = useState<{ i: number; origin: string }[]>([]); + const total = logs.length; + const timmer = useRef>(); + const el = useRef(null); + const outter = useRef(null); + if (listRef) { + listRef.current = outter.current; + } + const itemRenderer = ({ + index, + style: s, + }: { + index: number; + style: CSSProperties; + }) => { + const { i, origin } = logs[revert ? logs.length - 1 - index : index]; + return ( +
+ + {i + 1} + + {low + .highlight(language, origin) + .value.map((v) => value2react(v, index.toString(), keywords))} +
+ ); + }; + + useEffect(() => { + const originContent = content.split("\n"); + if (timmer.current) { + clearTimeout(timmer.current); + } + timmer.current = setTimeout(() => { + setLogs( + originContent + .map((e, i) => ({ + i, + origin: e, + time: (e?.match(timeReg) || [""])[0], + })) + .filter((e) => { + let bool = e.origin.includes(keywords); + if ( + e.time && + startTime && + !dayjs(e.time).isAfter(dayjs(startTime)) + ) { + bool = false; + } + if (e.time && endTime && !dayjs(e.time).isBefore(dayjs(endTime))) { + bool = false; + } + return bool; + }) + .map((e) => ({ + ...e, + })), + ); + }, 500); + }, [content, keywords, language, startTime, endTime]); + + useEffect(() => { + if (el.current) { + el.current?.scrollTo((focusLine - 1) * (fontSize + 6)); + } + }, [focusLine, fontSize]); + + useEffect(() => { + if (outter.current) { + const scrollFunc = (event: any) => { + const { target } = event; + if ( + target && + target.scrollTop + target.clientHeight === target.scrollHeight + ) { + if (onScrollBottom) { + onScrollBottom(event); + } + } + }; + outter.current.addEventListener("scroll", scrollFunc); + return () => outter?.current?.removeEventListener("scroll", scrollFunc); + } + }, [onScrollBottom]); + + return ( + + {itemRenderer} + + ); +}; + +export default LogVirtualView; diff --git a/dashboard/client/src/components/LogView/darcula.css b/dashboard/client/src/components/LogView/darcula.css new file mode 100644 index 000000000000..8564bf89570d --- /dev/null +++ b/dashboard/client/src/components/LogView/darcula.css @@ -0,0 +1,59 @@ +/* +Dracula Theme v1.2.0 +https://github.com/zenorocha/dracula-theme +Copyright 2015, All rights reserved +Code licensed under the MIT license +http://zenorocha.mit-license.org +@author Éverton Ribeiro +@author Zeno Rocha +*/ +.hljs-dark { + display: block; + overflow-x: auto; + padding: 0.5em; + color: #f8f8f2; +} +.hljs-dark .hljs-number, +.hljs-dark .hljs-keyword, +.hljs-dark .hljs-selector-tag, +.hljs-dark .hljs-literal, 
+.hljs-dark .hljs-section, +.hljs-dark .hljs-link { + color: #8be9fd; +} +.hljs-dark .hljs-function .hljs-keyword { + color: #ff79c6; +} +.hljs-dark .hljs-string, +.hljs-dark .hljs-title, +.hljs-dark .hljs-name, +.hljs-dark .hljs-type, +.hljs-dark .hljs-attribute, +.hljs-dark .hljs-symbol, +.hljs-dark .hljs-bullet, +.hljs-dark .hljs-addition, +.hljs-dark .hljs-variable, +.hljs-dark .hljs-template-tag, +.hljs-dark .hljs-template-variable { + color: #f1fa8c; +} +.hljs-dark .hljs-comment, +.hljs-dark .hljs-quote, +.hljs-dark .hljs-deletion, +.hljs-dark .hljs-meta { + color: #6272a4; +} +.hljs-dark .hljs-keyword, +.hljs-dark .hljs-selector-tag, +.hljs-dark .hljs-literal, +.hljs-dark .hljs-title, +.hljs-dark .hljs-section, +.hljs-dark .hljs-doctag, +.hljs-dark .hljs-type, +.hljs-dark .hljs-name, +.hljs-dark .hljs-strong { + font-weight: bold; +} +.hljs-dark .hljs-emphasis { + font-style: italic; +} diff --git a/dashboard/client/src/components/LogView/github.css b/dashboard/client/src/components/LogView/github.css new file mode 100644 index 000000000000..ca16d3f7393e --- /dev/null +++ b/dashboard/client/src/components/LogView/github.css @@ -0,0 +1,96 @@ +/* +github.com style (c) Vasily Polovnyov +*/ + +.hljs-light { + display: block; + overflow-x: auto; + padding: 0.5em; + color: #333; +} + +.hljs-light .hljs-comment, +.hljs-light .hljs-quote { + color: #998; + font-style: italic; +} + +.hljs-light .hljs-keyword, +.hljs-light .hljs-selector-tag, +.hljs-light .hljs-subst { + color: #333; + font-weight: bold; +} + +.hljs-light .hljs-number, +.hljs-light .hljs-literal, +.hljs-light .hljs-variable, +.hljs-light .hljs-template-variable, +.hljs-light .hljs-tag .hljs-attr { + color: #008080; +} + +.hljs-light .hljs-string, +.hljs-light .hljs-doctag { + color: #d14; +} + +.hljs-light .hljs-title, +.hljs-light .hljs-section, +.hljs-light .hljs-selector-id { + color: #900; + font-weight: bold; +} + +.hljs-light .hljs-subst { + font-weight: normal; +} + +.hljs-light .hljs-type, 
+.hljs-light .hljs-class .hljs-title { + color: #458; + font-weight: bold; +} + +.hljs-light .hljs-tag, +.hljs-light .hljs-name, +.hljs-light .hljs-attribute { + color: #000080; + font-weight: normal; +} + +.hljs-light .hljs-regexp, +.hljs-light .hljs-link { + color: #009926; +} + +.hljs-light .hljs-symbol, +.hljs-light .hljs-bullet { + color: #990073; +} + +.hljs-light .hljs-built_in, +.hljs-light .hljs-builtin-name { + color: #0086b3; +} + +.hljs-light .hljs-meta { + color: #999; + font-weight: bold; +} + +.hljs-light .hljs-deletion { + background: #fdd; +} + +.hljs-light .hljs-addition { + background: #dfd; +} + +.hljs-light .hljs-emphasis { + font-style: italic; +} + +.hljs-light .hljs-strong { + font-weight: bold; +} diff --git a/dashboard/client/src/components/LogView/index.css b/dashboard/client/src/components/LogView/index.css new file mode 100644 index 000000000000..32e5f884f2bc --- /dev/null +++ b/dashboard/client/src/components/LogView/index.css @@ -0,0 +1,3 @@ +span.find-kws { + background-color: #ffd800; +} diff --git a/dashboard/client/src/components/PercentageBar.tsx b/dashboard/client/src/components/PercentageBar.tsx new file mode 100644 index 000000000000..6b2cc48ade68 --- /dev/null +++ b/dashboard/client/src/components/PercentageBar.tsx @@ -0,0 +1,57 @@ +import { makeStyles } from "@material-ui/core"; +import React, { PropsWithChildren } from "react"; + +const useStyle = makeStyles((theme) => ({ + container: { + background: "linear-gradient(45deg, #21CBF3ee 30%, #2196F3ee 90%)", + border: `1px solid #ffffffbb`, + padding: "0 12px", + height: 18, + lineHeight: "18px", + position: "relative", + boxSizing: "content-box", + borderRadius: 4, + }, + displayBar: { + background: theme.palette.background.paper, + position: "absolute", + right: 0, + height: 18, + transition: "0.5s width", + borderRadius: 2, + borderTopLeftRadius: 0, + borderBottomLeftRadius: 0, + border: "2px solid transparent", + boxSizing: "border-box", + }, + text: { + fontSize: 12, + 
zIndex: 2, + position: "relative", + color: theme.palette.text.primary, + width: "100%", + textAlign: "center", + }, +})); + +const PercentageBar = ( + props: PropsWithChildren<{ num: number; total: number }>, +) => { + const { num, total } = props; + const classes = useStyle(); + const per = Math.round((num / total) * 100); + + return ( +
+
+
{props.children}
+
+ ); +}; + +export default PercentageBar; diff --git a/dashboard/client/src/components/SearchComponent.tsx b/dashboard/client/src/components/SearchComponent.tsx new file mode 100644 index 000000000000..02170b13c31f --- /dev/null +++ b/dashboard/client/src/components/SearchComponent.tsx @@ -0,0 +1,87 @@ +import { + InputAdornment, + makeStyles, + MenuItem, + TextField, +} from "@material-ui/core"; +import { SearchOutlined } from "@material-ui/icons"; +import React from "react"; + +const useStyles = makeStyles((theme) => ({ + search: { + margin: theme.spacing(1), + marginTop: 0, + }, +})); + +export const SearchInput = ({ + label, + onChange, + defaultValue, +}: { + label: string; + defaultValue?: string; + onChange?: (value: string) => void; +}) => { + const classes = useStyles(); + + return ( + { + if (onChange) { + onChange(value); + } + }, + defaultValue, + endAdornment: ( + + + + ), + }} + /> + ); +}; + +export const SearchSelect = ({ + label, + onChange, + options, +}: { + label: string; + onChange?: (value: string) => void; + options: (string | [string, string])[]; +}) => { + const classes = useStyles(); + return ( + { + if (onChange) { + onChange(value as string); + } + }, + style: { + width: 100, + }, + }} + > + All + {options.map((e) => + typeof e === "string" ? 
( + {e} + ) : ( + {e[1]} + ), + )} + + ); +}; diff --git a/dashboard/client/src/components/SpeedTools.tsx b/dashboard/client/src/components/SpeedTools.tsx new file mode 100644 index 000000000000..7094a41176a7 --- /dev/null +++ b/dashboard/client/src/components/SpeedTools.tsx @@ -0,0 +1,156 @@ +import { + Grow, + makeStyles, + Paper, + Tab, + Tabs, + TextField, +} from "@material-ui/core"; +import { red } from "@material-ui/core/colors"; +import { Build, Close } from "@material-ui/icons"; +import React, { useState } from "react"; +import { StatusChip } from "./StatusChip"; + +const chunkArray = (myArray: string[], chunk_size: number) => { + const results = []; + + while (myArray.length) { + results.push(myArray.splice(0, chunk_size)); + } + + return results; +}; + +const revertBit = (str: string) => { + return chunkArray(str.split(""), 2) + .reverse() + .map((e) => e.join("")) + .join(""); +}; + +const detectFlag = (str: string, offset: number) => { + const flag = parseInt(str, 16); + const mask = 1 << offset; + + return Number(!!(flag & mask)); +}; + +const useStyle = makeStyles((theme) => ({ + toolContainer: { + background: theme.palette.primary.main, + width: 48, + height: 48, + borderRadius: 48, + position: "fixed", + bottom: 100, + left: 50, + color: theme.palette.primary.contrastText, + }, + icon: { + position: "absolute", + left: 12, + cursor: "pointer", + top: 12, + }, + popover: { + position: "absolute", + left: 50, + bottom: 48, + width: 500, + height: 300, + padding: 6, + border: "1px solid", + borderColor: theme.palette.text.disabled, + }, + close: { + float: "right", + color: theme.palette.error.main, + cursor: "pointer", + }, +})); + +const ObjectIdReader = () => { + const [id, setId] = useState(""); + const tagList = [ + ["Create From Task", 15, 1], + ["Put Object", 14, 0], + ["Return Object", 14, 1], + ] as [string, number, number][]; + + return ( +
+ { + setId(value); + }, + }} + /> +
+ {id.length === 40 ? ( +
+ Job ID: {id.slice(24, 28)}
+ Actor ID: {id.slice(16, 28)}
+ Task ID: {id.slice(0, 28)}
+ Index: {parseInt(revertBit(id.slice(32)), 16)}
+ Flag: {revertBit(id.slice(28, 32))} +
+
+ {tagList + .filter( + ([a, b, c]) => detectFlag(revertBit(id.slice(28, 32)), b) === c, + ) + .map(([name]) => ( + + ))} +
+ ) : ( + + Object ID should be 40 letters long + + )} +
+
+ ); +}; + +const Tools = () => { + const [sel, setSel] = useState("oid_converter"); + const toolMap = { + oid_converter: , + } as { [key: string]: JSX.Element }; + + return ( +
+ setSel(val)}> + Object ID Reader} + /> + + {toolMap[sel]} +
+ ); +}; + +const SpeedTools = () => { + const [show, setShow] = useState(false); + const classes = useStyle(); + + return ( + + setShow(!show)} /> + + + setShow(false)} /> + + + + + ); +}; + +export default SpeedTools; diff --git a/dashboard/client/src/components/StatesCounter.tsx b/dashboard/client/src/components/StatesCounter.tsx new file mode 100644 index 000000000000..b5fc987e5f6c --- /dev/null +++ b/dashboard/client/src/components/StatesCounter.tsx @@ -0,0 +1,31 @@ +import { Grid } from "@material-ui/core"; +import React from "react"; +import { StatusChip } from "./StatusChip"; + +const StateCounter = ({ + type, + list, +}: { + type: string; + list: { state: string }[]; +}) => { + const stateMap = {} as { [state: string]: number }; + list.forEach(({ state }) => { + stateMap[state] = stateMap[state] + 1 || 1; + }); + + return ( + + + + + {Object.entries(stateMap).map(([s, num]) => ( + + + + ))} + + ); +}; + +export default StateCounter; diff --git a/dashboard/client/src/components/StatusChip.tsx b/dashboard/client/src/components/StatusChip.tsx new file mode 100644 index 000000000000..dc9fb11fa705 --- /dev/null +++ b/dashboard/client/src/components/StatusChip.tsx @@ -0,0 +1,90 @@ +import { Color } from "@material-ui/core"; +import { + blue, + blueGrey, + cyan, + green, + grey, + lightBlue, + red, +} from "@material-ui/core/colors"; +import { CSSProperties } from "@material-ui/core/styles/withStyles"; +import React, { ReactNode } from "react"; +import { ActorEnum } from "../type/actor"; + +const colorMap = { + node: { + ALIVE: green, + DEAD: red, + }, + actor: { + [ActorEnum.ALIVE]: green, + [ActorEnum.DEAD]: red, + [ActorEnum.PENDING]: blue, + [ActorEnum.RECONSTRUCTING]: lightBlue, + }, + job: { + INIT: grey, + SUBMITTED: blue, + DISPATCHED: lightBlue, + RUNNING: green, + COMPLETED: cyan, + FINISHED: cyan, + FAILED: red, + }, +} as { + [key: string]: { + [key: string]: Color; + }; +}; + +const typeMap = { + deps: blue, + INFO: cyan, + ERROR: red, +} as { + 
[key: string]: Color; +}; + +export const StatusChip = ({ + type, + status, + suffix, +}: { + type: string; + status: string | ActorEnum | ReactNode; + suffix?: string; +}) => { + const style = { + padding: "2px 8px", + border: "solid 1px", + borderRadius: 4, + fontSize: 12, + margin: 2, + } as CSSProperties; + + let color = blueGrey as Color; + + if (typeMap[type]) { + color = typeMap[type]; + } else if ( + typeof status === "string" && + colorMap[type] && + colorMap[type][status] + ) { + color = colorMap[type][status]; + } + + style.color = color[500]; + style.borderColor = color[500]; + if (color !== blueGrey) { + style.backgroundColor = `${color[500]}20`; + } + + return ( + + {status} + {suffix} + + ); +}; diff --git a/dashboard/client/src/components/TitleCard.tsx b/dashboard/client/src/components/TitleCard.tsx new file mode 100644 index 000000000000..db088f775e60 --- /dev/null +++ b/dashboard/client/src/components/TitleCard.tsx @@ -0,0 +1,34 @@ +import { makeStyles, Paper } from "@material-ui/core"; +import React, { PropsWithChildren, ReactNode } from "react"; + +const useStyles = makeStyles((theme) => ({ + card: { + padding: theme.spacing(2), + paddingTop: theme.spacing(1.5), + margin: [theme.spacing(2), theme.spacing(1)].map((e) => `${e}px`).join(" "), + }, + title: { + fontSize: theme.typography.fontSize + 2, + fontWeight: 500, + color: theme.palette.text.secondary, + marginBottom: theme.spacing(1), + }, + body: { + padding: theme.spacing(0.5), + }, +})); + +const TitleCard = ({ + title, + children, +}: PropsWithChildren<{ title: ReactNode | string }>) => { + const classes = useStyles(); + return ( + +
{title}
+
{children}
+
+ ); +}; + +export default TitleCard; diff --git a/dashboard/client/src/components/WorkerTable.tsx b/dashboard/client/src/components/WorkerTable.tsx new file mode 100644 index 000000000000..aa6bba57b710 --- /dev/null +++ b/dashboard/client/src/components/WorkerTable.tsx @@ -0,0 +1,299 @@ +import { + Button, + Grid, + IconButton, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, +} from "@material-ui/core"; +import { KeyboardArrowDown, KeyboardArrowRight } from "@material-ui/icons"; +import dayjs from "dayjs"; +import React, { + PropsWithChildren, + ReactNode, + useContext, + useEffect, + useState, +} from "react"; +import { Link } from "react-router-dom"; +import { GlobalContext } from "../App"; +import { Actor } from "../type/actor"; +import { CoreWorkerStats, Worker } from "../type/worker"; +import { memoryConverter } from "../util/converter"; +import { longTextCut } from "../util/func"; + +import { useFilter } from "../util/hook"; +import ActorTable from "./ActorTable"; +import PercentageBar from "./PercentageBar"; +import { SearchInput } from "./SearchComponent"; + +export const ExpandableTableRow = ({ + children, + expandComponent, + length, + stateKey = "", + ...otherProps +}: PropsWithChildren<{ + expandComponent: ReactNode; + length: number; + stateKey?: string; +}>) => { + const [isExpanded, setIsExpanded] = React.useState(false); + + useEffect(() => { + if (stateKey.startsWith("ON")) { + setIsExpanded(true); + } else if (stateKey.startsWith("OFF")) { + setIsExpanded(false); + } + }, [stateKey]); + + if (length < 1) { + return ( + + + {children} + + ); + } + + return ( + + + + setIsExpanded(!isExpanded)} + > + {length} + {isExpanded ? 
: } + + + {children} + + {isExpanded && ( + + {expandComponent} + + )} + + ); +}; + +const WorkerDetailTable = ({ + actorMap, + coreWorkerStats, +}: { + actorMap: { [actorId: string]: Actor }; + coreWorkerStats: CoreWorkerStats[]; +}) => { + const actors = {} as { [actorId: string]: Actor }; + (coreWorkerStats || []) + .filter((e) => actorMap[e.actorId]) + .forEach((e) => (actors[e.actorId] = actorMap[e.actorId])); + + if (!Object.values(actors).length) { + return

The Worker Haven't Had Related Actor Yet.

; + } + + return ( + + + + ); +}; + +const RayletWorkerTable = ({ + workers = [], + actorMap, + mini, +}: { + workers: Worker[]; + actorMap: { [actorId: string]: Actor }; + mini?: boolean; +}) => { + const { changeFilter, filterFunc } = useFilter(); + const [key, setKey] = useState(""); + const { nodeMap, ipLogMap } = useContext(GlobalContext); + const open = () => setKey(`ON${Math.random()}`); + const close = () => setKey(`OFF${Math.random()}`); + + return ( + + {!mini && ( +
+ changeFilter("pid", value)} + /> + + +
+ )}{" "} + + + + {[ + "", + "Pid", + "CPU", + "CPU Times", + "Memory", + "CMD Line", + "Create Time", + "Log", + "Ops", + "IP/Hostname", + ].map((col) => ( + + {col} + + ))} + + + + {workers + .filter(filterFunc) + .sort((aWorker, bWorker) => { + const a = + (aWorker.coreWorkerStats || []).filter( + (e) => actorMap[e.actorId], + ).length || 0; + const b = + (bWorker.coreWorkerStats || []).filter( + (e) => actorMap[e.actorId], + ).length || 0; + return b - a; + }) + .map( + ({ + pid, + cpuPercent, + cpuTimes, + memoryInfo, + cmdline, + createTime, + coreWorkerStats = [], + language, + ip, + hostname, + }) => ( + + } + length={ + (coreWorkerStats || []).filter((e) => actorMap[e.actorId]) + .length + } + key={pid} + stateKey={key} + > + {pid} + + + {cpuPercent}% + + + +
+ {Object.entries(cpuTimes || {}).map(([key, val]) => ( +
+ {key}:{val} +
+ ))} +
+
+ +
+ {Object.entries(memoryInfo || {}).map(([key, val]) => ( +
+ {key}:{memoryConverter(val)} +
+ ))} +
+
+ + {cmdline && longTextCut(cmdline.filter((e) => e).join(" "))} + + + {dayjs(createTime * 1000).format("YYYY/MM/DD HH:mm:ss")} + + + + {ipLogMap[ip] && ( + + + Log + + + )} + + + + {language === "JAVA" && ( +
+ {" "} + + +
+ )} +
+ + {ip} +
+ {nodeMap[hostname] ? ( + + {hostname} + + ) : ( + hostname + )} +
+
+ ), + )} +
+
+
+ ); +}; + +export default RayletWorkerTable; diff --git a/dashboard/client/src/logo.svg b/dashboard/client/src/logo.svg new file mode 100644 index 000000000000..70be9ee548c6 --- /dev/null +++ b/dashboard/client/src/logo.svg @@ -0,0 +1,34 @@ + + + + +Ray Logo + + + + + + + + + + diff --git a/dashboard/client/src/pages/actor/index.tsx b/dashboard/client/src/pages/actor/index.tsx new file mode 100644 index 000000000000..cbcd264e26af --- /dev/null +++ b/dashboard/client/src/pages/actor/index.tsx @@ -0,0 +1,36 @@ +import { makeStyles } from "@material-ui/core"; +import React, { useEffect, useState } from "react"; +import ActorTable from "../../components/ActorTable"; +import TitleCard from "../../components/TitleCard"; +import { getActors } from "../../service/actor"; +import { Actor } from "../../type/actor"; + +const useStyles = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + width: "100%", + }, +})); + +const Actors = () => { + const classes = useStyles(); + const [actors, setActors] = useState<{ [actorId: string]: Actor }>({}); + + useEffect(() => { + getActors().then((res) => { + if (res?.data?.data?.actors) { + setActors(res.data.data.actors); + } + }); + }, []); + + return ( +
+ + + +
+ ); +}; + +export default Actors; diff --git a/dashboard/client/src/pages/cmd/CMDResult.tsx b/dashboard/client/src/pages/cmd/CMDResult.tsx new file mode 100644 index 000000000000..ed87c10d8e7c --- /dev/null +++ b/dashboard/client/src/pages/cmd/CMDResult.tsx @@ -0,0 +1,137 @@ +import { + Button, + Grid, + makeStyles, + MenuItem, + Paper, + Select, +} from "@material-ui/core"; +import React, { useCallback, useEffect, useState } from "react"; +import { RouteComponentProps } from "react-router-dom"; +import LogVirtualView from "../../components/LogView/LogVirtualView"; +import TitleCard from "../../components/TitleCard"; +import { getJmap, getJstack, getJstat } from "../../service/util"; + +const useStyles = makeStyles((theme) => ({ + root: { + padding: theme.spacing(4), + width: "100%", + }, + table: { + marginTop: theme.spacing(4), + padding: theme.spacing(2), + }, + pageMeta: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + }, + search: { + margin: theme.spacing(1), + }, +})); + +const CMDResult = ( + props: RouteComponentProps<{ cmd: string; ip: string; pid: string }>, +) => { + const classes = useStyles(); + const { + match: { params }, + } = props; + const { cmd, ip, pid } = params; + const [result, setResult] = useState(); + const [option, setOption] = useState("gcutil"); + const executeJstat = useCallback( + () => + getJstat(ip, pid, option) + .then((rsp) => { + if (rsp.data.result) { + setResult(rsp.data.data.output); + } else { + setResult(rsp.data.msg); + } + }) + .catch((err) => setResult(err.toString())), + [ip, pid, option], + ); + + useEffect(() => { + switch (cmd) { + case "jstack": + getJstack(ip, pid) + .then((rsp) => { + if (rsp.data.result) { + setResult(rsp.data.data.output); + } else { + setResult(rsp.data.msg); + } + }) + .catch((err) => setResult(err.toString())); + break; + case "jmap": + getJmap(ip, pid) + .then((rsp) => { + if (rsp.data.result) { + setResult(rsp.data.data.output); + } else { + setResult(rsp.data.msg); + } + }) 
+ .catch((err) => setResult(err.toString())); + break; + case "jstat": + executeJstat(); + break; + default: + setResult(`Command ${cmd} is not supported.`); + break; + } + }, [cmd, executeJstat, ip, pid]); + + return ( +
+ + {cmd === "jstat" && ( + + + + + + + + + + + )} + + + + +
+ ); +}; + +export default CMDResult; diff --git a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index 0ffbce7f5d5f..d7eeaf936b45 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -1,4 +1,5 @@ import { + Button, createStyles, makeStyles, Tab, @@ -8,6 +9,7 @@ import { } from "@material-ui/core"; import React, { useCallback, useEffect, useRef } from "react"; import { useDispatch, useSelector } from "react-redux"; +import { useHistory } from "react-router-dom"; import { getActorGroups, getNodeInfo, getTuneAvailability } from "../../api"; import { StoreState } from "../../store"; import LastUpdated from "./LastUpdated"; @@ -59,6 +61,7 @@ const Dashboard: React.FC = () => { const tuneAvailability = useSelector(tuneAvailabilitySelector); const tab = useSelector(tabSelector); const classes = useDashboardStyles(); + const history = useHistory(); // Polling Function const refreshInfo = useCallback(async () => { @@ -103,6 +106,9 @@ const Dashboard: React.FC = () => { return (
Ray Dashboard + { + return ( +
+
+ + + + 404 NOT FOUND +

+ We can't provide the page you wanted yet, better try with another path + next time. +

+
+
+ ); +}; + +export default Error404; diff --git a/dashboard/client/src/pages/exception/Loading.tsx b/dashboard/client/src/pages/exception/Loading.tsx new file mode 100644 index 000000000000..24140c4dc0de --- /dev/null +++ b/dashboard/client/src/pages/exception/Loading.tsx @@ -0,0 +1,21 @@ +import React from "react"; +import Logo from "../../logo.svg"; + +export default () => { + return ( +
+
+ Loading +
+ Loading... +
+
+ ); +}; diff --git a/dashboard/client/src/pages/index/Index.tsx b/dashboard/client/src/pages/index/Index.tsx new file mode 100644 index 000000000000..9612164499f4 --- /dev/null +++ b/dashboard/client/src/pages/index/Index.tsx @@ -0,0 +1,110 @@ +import { + makeStyles, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, +} from "@material-ui/core"; +import React, { useEffect, useState } from "react"; +import { version } from "../../../package.json"; +import TitleCard from "../../components/TitleCard"; +import { getRayConfig } from "../../service/cluster"; +import { getNodeList } from "../../service/node"; +import { RayConfig } from "../../type/config"; +import { NodeDetail } from "../../type/node"; +import { memoryConverter } from "../../util/converter"; + +const useStyle = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + }, + label: { + fontWeight: "bold", + }, +})); + +const getVal = (key: string, value: any) => { + if (key === "containerMemory") { + return memoryConverter(value * 1024 * 1024); + } + return JSON.stringify(value); +}; + +const useIndex = () => { + const [rayConfig, setConfig] = useState(); + const [nodes, setNodes] = useState([]); + useEffect(() => { + getRayConfig().then((res) => { + if (res?.data?.data?.config) { + setConfig(res.data.data.config); + } + }); + }, []); + useEffect(() => { + getNodeList().then((res) => { + if (res?.data?.data?.summary) { + setNodes(res.data.data.summary); + } + }); + }, []); + + return { rayConfig, nodes }; +}; + +const Index = () => { + const { rayConfig } = useIndex(); + const classes = useStyle(); + + return ( +
+ +

Dashboard Frontend Version: {version}

+ {rayConfig?.imageUrl && ( +

+ Image Url:{" "} + + {rayConfig.imageUrl} + +

+ )} + {rayConfig?.sourceCodeLink && ( +

+ Source Code:{" "} + + {rayConfig.sourceCodeLink} + +

+ )} +
+ {rayConfig && ( + + + + Key + Value + + + {Object.entries(rayConfig).map(([key, value]) => ( + + {key} + {getVal(key, value)} + + ))} + + + + )} +
+ ); +}; + +export default Index; diff --git a/dashboard/client/src/pages/job/JobDetail.tsx b/dashboard/client/src/pages/job/JobDetail.tsx new file mode 100644 index 000000000000..b720b9c057de --- /dev/null +++ b/dashboard/client/src/pages/job/JobDetail.tsx @@ -0,0 +1,246 @@ +import { + Grid, + makeStyles, + Switch, + Tab, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Tabs, +} from "@material-ui/core"; +import React from "react"; +import { Link, RouteComponentProps } from "react-router-dom"; +import ActorTable from "../../components/ActorTable"; +import Loading from "../../components/Loading"; +import { StatusChip } from "../../components/StatusChip"; +import TitleCard from "../../components/TitleCard"; +import RayletWorkerTable from "../../components/WorkerTable"; +import { longTextCut } from "../../util/func"; +import { useJobDetail } from "./hook/useJobDetail"; + +const useStyle = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + }, + paper: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + marginBottom: theme.spacing(2), + }, + label: { + fontWeight: "bold", + }, + pageMeta: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + }, + tab: { + marginBottom: theme.spacing(2), + }, + dependenciesChip: { + margin: theme.spacing(0.5), + wordBreak: "break-all", + }, + alert: { + color: theme.palette.error.main, + }, +})); + +const JobDetailPage = (props: RouteComponentProps<{ id: string }>) => { + const classes = useStyle(); + const { + actorMap, + jobInfo, + job, + msg, + selectedTab, + handleChange, + handleSwitchChange, + params, + refreshing, + ipLogMap, + } = useJobDetail(props); + + if (!job || !jobInfo) { + return ( +
+ + + +
+ Auto Refresh: + +
+ Request Status: {msg}
+
+
+ ); + } + + return ( +
+ + +
+ Auto Refresh: + +
+ Request Status: {msg}
+
+ + + + + + + + {selectedTab === "info" && ( + + + Driver IP:{" "} + {jobInfo.driverIpAddress} + + {ipLogMap[jobInfo.driverIpAddress] && ( + + Driver Log:{" "} + + Log + + + )} + + Driver Pid:{" "} + {jobInfo.driverPid} + + {jobInfo.eventUrl && ( + + Event Link:{" "} + + Event Log + + + )} + {jobInfo.failErrorMessage && ( + + Fail Error:{" "} + + {jobInfo.failErrorMessage} + + + )} + + )} + {jobInfo?.dependencies && selectedTab === "dep" && ( +
+ {jobInfo?.dependencies?.python && ( + +
+ {jobInfo.dependencies.python.map((e) => ( + + ))} +
+
+ )} + {jobInfo?.dependencies?.java && ( + + + + + + {["Name", "Version", "URL"].map((col) => ( + + {col} + + ))} + + + + {jobInfo.dependencies.java.map( + ({ name, version, url }) => ( + + {name} + {version} + + + {url} + + + + ), + )} + +
+
+
+ )} +
+ )} + {selectedTab === "worker" && ( +
+ + + +
+ )} + {selectedTab === "actor" && ( +
+ + + +
+ )} +
+
+ ); +}; + +export default JobDetailPage; diff --git a/dashboard/client/src/pages/job/hook/useJobDetail.ts b/dashboard/client/src/pages/job/hook/useJobDetail.ts new file mode 100644 index 000000000000..695fca760931 --- /dev/null +++ b/dashboard/client/src/pages/job/hook/useJobDetail.ts @@ -0,0 +1,94 @@
+import { useContext, useEffect, useState } from "react";
+import { RouteComponentProps } from "react-router-dom";
+import { GlobalContext } from "../../../App";
+import { getJobDetail } from "../../../service/job";
+import { JobDetail } from "../../../type/job";
+
+/** Delay between the end of one job-detail request and the next one. */
+const REFRESH_INTERVAL_MS = 4000;
+
+/**
+ * Fetches the detail of the job named by the `id` route parameter and returns
+ * it with the tab and auto-refresh state of the job detail page.
+ *
+ * While auto refresh is on, the job is requested again 4 seconds after each
+ * response. Polling ends when auto refresh is switched off, when the response
+ * carries `result === false`, or when the effect is cleaned up.
+ */
+export const useJobDetail = (props: RouteComponentProps<{ id: string }>) => {
+  const {
+    match: { params },
+  } = props;
+  const [job, setJob] = useState<JobDetail>();
+  const [msg, setMsg] = useState("Loading the job detail");
+  const [refreshing, setRefresh] = useState(true);
+  const [selectedTab, setTab] = useState("info");
+  const { ipLogMap } = useContext(GlobalContext);
+  const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => {
+    setTab(newValue);
+  };
+  const handleSwitchChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    setRefresh(event.target.checked);
+  };
+
+  useEffect(() => {
+    // Owned by this effect run: once the cleanup has run, a late response can
+    // neither update state nor start a second polling loop.
+    let cancelled = false;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+
+    const poll = async () => {
+      try {
+        const rsp = await getJobDetail(params.id);
+        if (cancelled) {
+          return;
+        }
+        if (rsp.data?.data?.detail) {
+          setJob(rsp.data.data.detail);
+        }
+        if (rsp.data?.msg) {
+          setMsg(rsp.data.msg);
+        }
+        if (rsp.data.result === false) {
+          setMsg("Job Query Error Please Check JobId");
+          setJob(undefined);
+          setRefresh(false);
+          return;
+        }
+      } catch (e) {
+        if (cancelled) {
+          return;
+        }
+        // A rejected request must not end the polling loop silently.
+        setMsg("Failed to fetch the job detail, retrying...");
+      }
+      timer = setTimeout(poll, REFRESH_INTERVAL_MS);
+    };
+
+    if (refreshing) {
+      poll();
+    }
+    return () => {
+      cancelled = true;
+      if (timer) {
+        clearTimeout(timer);
+      }
+    };
+  }, [refreshing, params.id]);
+
+  const { jobInfo } = job || {};
+  const actorMap = job?.jobActors;
+
+  return {
+    actorMap,
+    jobInfo,
+    job,
+    msg,
+    selectedTab,
+    handleChange,
+    handleSwitchChange,
+    params,
+    refreshing,
+    ipLogMap,
+  };
+};
diff --git a/dashboard/client/src/pages/job/hook/useJobList.ts b/dashboard/client/src/pages/job/hook/useJobList.ts new file mode 100644 index 000000000000..04f97532f75c --- /dev/null +++ b/dashboard/client/src/pages/job/hook/useJobList.ts @@ -0,0 +1,89 @@
+import { useEffect, useState } from "react";
+import { getJobList } from "../../../service/job";
+import { Job } from "../../../type/job";
+
+/** Job fields the list can be filtered by. */
+type JobFilterKey = "jobId" | "name" | "language" | "state" | "namespaceId";
+
+/** Delay between the end of one job-list request and the next one. */
+const REFRESH_INTERVAL_MS = 4000;
+
+/**
+ * Polls the job list and returns it with the filter, pagination and
+ * auto-refresh state of the job list page.
+ *
+ * While auto refresh is on, the list is requested again 4 seconds after each
+ * response; switching auto refresh back on starts polling again.
+ */
+export const useJobList = () => {
+  const [jobList, setList] = useState<Job[]>([]);
+  const [page, setPage] = useState({ pageSize: 10, pageNo: 1 });
+  const [msg, setMsg] = useState("Loading the job list...");
+  const [isRefreshing, setRefresh] = useState(true);
+  const [filter, setFilter] = useState<{ key: JobFilterKey; val: string }[]>(
+    [],
+  );
+  const changeFilter = (key: JobFilterKey, val: string) => {
+    // Produce a new array and entry; the previous state is never edited.
+    setFilter((prev) =>
+      prev.some((e) => e.key === key)
+        ? prev.map((e) => (e.key === key ? { key, val } : e))
+        : [...prev, { key, val }],
+    );
+  };
+  const onSwitchChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    setRefresh(event.target.checked);
+  };
+
+  useEffect(() => {
+    // Owned by this effect run: once the cleanup has run, a late response can
+    // neither update state nor start a second polling loop.
+    let cancelled = false;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+
+    const poll = async () => {
+      try {
+        const rsp = await getJobList();
+        if (cancelled) {
+          return;
+        }
+        const summary = rsp?.data?.data?.summary;
+        if (summary) {
+          // Newest first; sort a copy so the response object stays untouched.
+          setList([...summary].sort((a, b) => b.timestamp - a.timestamp));
+          setMsg(rsp.data.msg || "");
+        }
+      } catch (e) {
+        if (cancelled) {
+          return;
+        }
+        // A rejected request must not end the polling loop silently.
+        setMsg("Failed to fetch the job list, retrying...");
+      }
+      timer = setTimeout(poll, REFRESH_INTERVAL_MS);
+    };
+
+    if (isRefreshing) {
+      poll();
+    }
+    return () => {
+      cancelled = true;
+      if (timer) {
+        clearTimeout(timer);
+      }
+    };
+  }, [isRefreshing]);
+
+  return {
+    jobList: jobList.filter((node) =>
+      filter.every((f) => node[f.key] && node[f.key].includes(f.val)),
+    ),
+    msg,
+    isRefreshing,
+    onSwitchChange,
+    changeFilter,
+    page,
+    originalJobs: jobList,
+    setPage: (key: string, val: number) => setPage({ ...page, [key]: val }),
+  };
+};
diff --git a/dashboard/client/src/pages/job/index.tsx b/dashboard/client/src/pages/job/index.tsx new file mode 100644 index 000000000000..8d2a4aaa4c96 --- /dev/null +++ b/dashboard/client/src/pages/job/index.tsx @@ -0,0 +1,129 @@ +import { + Switch, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, +} from "@material-ui/core"; +import { makeStyles } from "@material-ui/core/styles"; +import Pagination from "@material-ui/lab/Pagination"; +import dayjs from "dayjs"; +import React from "react"; +import { Link } from "react-router-dom"; +import Loading from "../../components/Loading"; +import { SearchInput, SearchSelect } from "../../components/SearchComponent"; +import TitleCard from "../../components/TitleCard"; +import { useJobList } from "./hook/useJobList"; + +const useStyles = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + width: "100%", + }, +})); + +const columns = ["ID", "DriverIpAddress", "DriverPid", "IsDead", "Timestamp"]; + +const JobList = () => { + const classes = useStyles(); + const { + msg, + isRefreshing, + onSwitchChange, + jobList, + changeFilter, + page, + setPage, + } = useJobList(); + + return ( +
+ + + Auto Refresh: + +
+ Request Status: {msg} +
+ + + changeFilter("jobId", value)} + /> + changeFilter("language", value)} + options={["JAVA", "PYTHON"]} + /> + + setPage("pageSize", Math.min(Number(value), 500) || 10) + } + /> +
+ setPage("pageNo", pageNo)} + /> +
+ + + + {columns.map((col) => ( + + {col} + + ))} + + + + {jobList + .slice( + (page.pageNo - 1) * page.pageSize, + page.pageNo * page.pageSize, + ) + .map( + ({ + jobId = "", + driverIpAddress, + isDead, + driverPid, + state, + timestamp, + namespaceId, + }) => ( + + + {jobId} + + {driverIpAddress} + {driverPid} + + {isDead ? "true" : "false"} + + + {dayjs(timestamp * 1000).format("YYYY/MM/DD HH:mm:ss")} + + {namespaceId} + + ), + )} + +
+
+
+
+ ); +}; + +export default JobList; diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx new file mode 100644 index 000000000000..b484a29db646 --- /dev/null +++ b/dashboard/client/src/pages/layout/index.tsx @@ -0,0 +1,167 @@ +import { IconButton, Tooltip } from "@material-ui/core"; +import Drawer from "@material-ui/core/Drawer"; +import List from "@material-ui/core/List"; +import ListItem from "@material-ui/core/ListItem"; +import ListItemText from "@material-ui/core/ListItemText"; +import { makeStyles } from "@material-ui/core/styles"; +import Typography from "@material-ui/core/Typography"; +import { NightsStay, VerticalAlignTop, WbSunny } from "@material-ui/icons"; +import classnames from "classnames"; +import React, { PropsWithChildren } from "react"; +import { RouteComponentProps } from "react-router-dom"; + +import SpeedTools from "../../components/SpeedTools"; +import Logo from "../../logo.svg"; + +const drawerWidth = 200; + +const useStyles = makeStyles((theme) => ({ + root: { + display: "flex", + "& a": { + color: theme.palette.primary.main, + }, + }, + drawer: { + width: drawerWidth, + flexShrink: 0, + background: theme.palette.background.paper, + }, + drawerPaper: { + width: drawerWidth, + border: "none", + background: theme.palette.background.paper, + boxShadow: theme.shadows[1], + }, + title: { + padding: theme.spacing(2), + textAlign: "center", + lineHeight: "36px", + }, + divider: { + background: "rgba(255, 255, 255, .12)", + }, + menuItem: { + cursor: "pointer", + "&:hover": { + background: theme.palette.primary.main, + }, + }, + selected: { + background: `linear-gradient(45deg, ${theme.palette.primary.main} 30%, ${theme.palette.secondary.main} 90%)`, + }, + child: { + flex: 1, + }, +})); + +const BasicLayout = ( + props: PropsWithChildren< + { setTheme: (theme: string) => void; theme: string } & RouteComponentProps + >, +) => { + const classes = useStyles(); + const { location, history, children, 
setTheme, theme } = props; + + return ( +
+ + + Ray
Ray Dashboard +
+ + history.push("/summary")} + > + SUMMARY + + history.push("/node")} + > + NODES + + history.push("/job")} + > + JOBS + + history.push("/actors")} + > + ACTORS + + history.push("/log")} + > + LOGS + + history.push("/")} + > + BACK TO LEGACY + + + { + window.scrollTo(0, 0); + }} + > + + + + + { + setTheme(theme === "dark" ? "light" : "dark"); + }} + > + + {theme === "dark" ? : } + + + + + +
+
{children}
+
+ ); +}; + +export default BasicLayout; diff --git a/dashboard/client/src/pages/log/Logs.tsx b/dashboard/client/src/pages/log/Logs.tsx new file mode 100644 index 000000000000..12218d52a0fa --- /dev/null +++ b/dashboard/client/src/pages/log/Logs.tsx @@ -0,0 +1,306 @@ +import { + Button, + InputAdornment, + LinearProgress, + List, + ListItem, + makeStyles, + Paper, + Switch, + TextField, +} from "@material-ui/core"; +import { SearchOutlined } from "@material-ui/icons"; +import React, { useEffect, useRef, useState } from "react"; +import { RouteComponentProps } from "react-router-dom"; +import LogVirtualView from "../../components/LogView/LogVirtualView"; +import { SearchInput } from "../../components/SearchComponent"; +import TitleCard from "../../components/TitleCard"; +import { getLogDetail } from "../../service/log"; + +const useStyles = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + width: "100%", + }, + table: { + marginTop: theme.spacing(4), + padding: theme.spacing(2), + }, + pageMeta: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + }, + search: { + margin: theme.spacing(1), + }, +})); + +type LogsProps = RouteComponentProps<{ host?: string; path?: string }> & { + theme?: "dark" | "light"; +}; + +const useLogs = (props: LogsProps) => { + const { + match: { params }, + location: { search: urlSearch }, + theme, + } = props; + const { host, path } = params; + const searchMap = new URLSearchParams(urlSearch); + const urlFileName = searchMap.get("fileName"); + const el = useRef(null); + const [origin, setOrigin] = useState(); + const [search, setSearch] = useState<{ + keywords?: string; + lineNumber?: string; + fontSize?: number; + revert?: boolean; + }>(); + const [fileName, setFileName] = useState(searchMap.get("fileName") || ""); + const [log, setLogs] = useState< + undefined | string | { [key: string]: string }[] + >(); + const [startTime, setStart] = useState(); + const [endTime, setEnd] = useState(); + + useEffect(() => { + 
setFileName(urlFileName || ""); + }, [urlFileName]); + + useEffect(() => { + let url = "log_index"; + setLogs("Loading..."); + if (host) { + url = decodeURIComponent(host); + setOrigin(new URL(url).origin); + if (path) { + url += decodeURIComponent(path); + } + } else { + setOrigin(undefined); + } + getLogDetail(url) + .then((res) => { + if (res) { + setLogs(res); + } else { + setLogs("(null)"); + } + }) + .catch(() => { + setLogs("Failed to load"); + }); + }, [host, path]); + + return { + log, + origin, + host, + path, + el, + search, + setSearch, + theme, + fileName, + setFileName, + startTime, + setStart, + endTime, + setEnd, + }; +}; + +const Logs = (props: LogsProps) => { + const classes = useStyles(); + const { + log, + origin, + path, + el, + search, + setSearch, + theme, + fileName, + setFileName, + startTime, + setStart, + endTime, + setEnd, + } = useLogs(props); + let href = "#/log/"; + + if (origin) { + if (path) { + const after = decodeURIComponent(path).split("/"); + after.pop(); + if (after.length > 1) { + href += encodeURIComponent(origin); + href += "/"; + href += encodeURIComponent(after.join("/")); + } + } + } + + return ( +
+ + + {!origin &&

Please choose an url to get log path

} + {origin && ( +

+ Now Path: {origin} + {decodeURIComponent(path || "")} +

+ )} + {origin && ( +
+ + {typeof log === "object" && ( + { + setFileName(val); + }} + /> + )} +
+ )} +
+ + {typeof log === "object" && ( + + {log + .filter((e) => !fileName || e?.name?.includes(fileName)) + .map((e: { [key: string]: string }) => ( + + + {e.name} + + + ))} + + )} + {typeof log === "string" && log !== "Loading..." && ( +
+
+ { + setSearch({ ...search, keywords: value }); + }, + type: "", + endAdornment: ( + + + + ), + }} + /> + { + setSearch({ ...search, lineNumber: value }); + }, + type: "", + endAdornment: ( + + + + ), + }} + /> + { + setSearch({ ...search, fontSize: Number(value) }); + }, + type: "", + }} + /> + { + setStart(val.target.value); + }} + InputLabelProps={{ + shrink: true, + }} + /> + { + setEnd(val.target.value); + }} + InputLabelProps={{ + shrink: true, + }} + /> +
+ Reverse:{" "} + setSearch({ ...search, revert: v })} + /> + +
+
+ +
+ )} + {log === "Loading..." && ( +
+
+ +
+ )} +
+
+
+ ); +}; + +export default Logs; diff --git a/dashboard/client/src/pages/node/NodeDetail.tsx b/dashboard/client/src/pages/node/NodeDetail.tsx new file mode 100644 index 000000000000..6f5187bdb822 --- /dev/null +++ b/dashboard/client/src/pages/node/NodeDetail.tsx @@ -0,0 +1,287 @@ +import { + Grid, + makeStyles, + Switch, + Tab, + TableContainer, + Tabs, +} from "@material-ui/core"; +import dayjs from "dayjs"; +import React from "react"; +import { Link, RouteComponentProps } from "react-router-dom"; +import ActorTable from "../../components/ActorTable"; +import Loading from "../../components/Loading"; +import PercentageBar from "../../components/PercentageBar"; +import { StatusChip } from "../../components/StatusChip"; +import TitleCard from "../../components/TitleCard"; +import RayletWorkerTable from "../../components/WorkerTable"; +import { ViewMeasures } from "../../type/raylet"; +import { memoryConverter } from "../../util/converter"; +import { useNodeDetail } from "./hook/useNodeDetail"; + +const useStyle = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + }, + paper: { + padding: theme.spacing(2), + marginTop: theme.spacing(2), + marginBottom: theme.spacing(2), + }, + label: { + fontWeight: "bold", + }, + tab: { + marginBottom: theme.spacing(2), + }, +})); + +const showMeasureKeys = [ + "local_total_resource", + "local_available_resource", + "actor_stats", + "task_dependency_manager_stats", + "reconstruction_policy_stats", + "scheduling_queue_stats", + "object_manager_stats", +]; + +const ViewDataDisplayer = ({ view }: { view?: ViewMeasures }) => { + if (!view) { + return null; + } + const { tags = "", ...otherProps } = view; + + return ( + + {tags.split(",").pop()?.split(":").slice(1).join(":")}= + {Object.keys(otherProps).length > 0 ? 
( + JSON.stringify(Object.values(otherProps).pop()) + ) : ( + null + )} + + ); +}; + +const NodeDetailPage = (props: RouteComponentProps<{ id: string }>) => { + const classes = useStyle(); + const { + params, + selectedTab, + nodeDetail, + msg, + isRefreshing, + onRefreshChange, + raylet, + handleChange, + } = useNodeDetail(props); + + return ( +
+ + + +
+ Auto Refresh: + +
+ Request Status: {msg} +
+ + + + + + + + {nodeDetail && selectedTab === "info" && ( +
+ + +
Hostname
{" "} + {nodeDetail.hostname} +
+ +
IP
{nodeDetail.ip} +
+
+ + +
CPU (Logic/Physic)
{" "} + {nodeDetail.cpus[0]}/ {nodeDetail.cpus[1]} +
+ +
Load (1/5/15min)
{" "} + {nodeDetail?.loadAvg[0] && + nodeDetail.loadAvg[0] + .map((e) => Number(e).toFixed(2)) + .join("/")} +
+
+ + +
Load per CPU (1/5/15min)
{" "} + {nodeDetail?.loadAvg[1] && + nodeDetail.loadAvg[1] + .map((e) => Number(e).toFixed(2)) + .join("/")} +
+ +
Boot Time
{" "} + {dayjs(nodeDetail.bootTime * 1000).format( + "YYYY/MM/DD HH:mm:ss", + )} +
+
+ + +
Sent Tps
{" "} + {memoryConverter(nodeDetail?.net[0])}/s +
+ +
Recieved Tps
{" "} + {memoryConverter(nodeDetail?.net[1])}/s +
+
+ + +
Memory
{" "} + {nodeDetail?.mem && ( + + {memoryConverter(nodeDetail?.mem[0] - nodeDetail?.mem[1])}/ + {memoryConverter(nodeDetail?.mem[0])}({nodeDetail?.mem[2]}%) + + )} +
+ +
CPU
{" "} + + {nodeDetail.cpu}% + +
+
+ + {nodeDetail?.disk && + Object.entries(nodeDetail?.disk).map(([path, obj]) => ( + +
Disk ({path})
{" "} + {obj && ( + + {memoryConverter(obj.used)}/{memoryConverter(obj.total)} + ({obj.percent}%, {memoryConverter(obj.free)} free) + + )} +
+ ))} +
+ + +
Logs
{" "} + + log + +
+
+
+ )} + {raylet && Object.keys(raylet).length > 0 && selectedTab === "raylet" && ( + +
+ + +
Command
+
+
+ {nodeDetail?.cmdline.join(" ")} +
+
+
+ + +
Pid
{raylet?.pid} +
+ +
Workers Num
{" "} + {raylet?.numWorkers} +
+ +
Node Manager Port
{" "} + {raylet?.nodeManagerPort} +
+
+ {showMeasureKeys + .map((e) => raylet.viewData.find((view) => view.viewName === e)) + .map((e) => + e ? ( + +

+ {e.viewName + .split("_") + .map((e) => e[0].toUpperCase() + e.slice(1)) + .join(" ")} +

+ + {e.measures.map((e) => ( + + ))} + +
+ ) : null, + )} +
+
+ )} + {nodeDetail?.workers && selectedTab === "worker" && ( + + + + + + )} + {nodeDetail?.actors && selectedTab === "actor" && ( + + + + + + )} +
+
+ ); +}; + +export default NodeDetailPage; diff --git a/dashboard/client/src/pages/node/hook/useNodeDetail.ts b/dashboard/client/src/pages/node/hook/useNodeDetail.ts new file mode 100644 index 000000000000..1ca3570a20ff --- /dev/null +++ b/dashboard/client/src/pages/node/hook/useNodeDetail.ts @@ -0,0 +1,92 @@
+import { useContext, useEffect, useState } from "react";
+import { RouteComponentProps } from "react-router-dom";
+import { GlobalContext } from "../../../App";
+import { getNodeDetail } from "../../../service/node";
+import { NodeDetailExtend } from "../../../type/node";
+
+/** Delay between the end of one node-detail request and the next one. */
+const REFRESH_INTERVAL_MS = 4000;
+
+/**
+ * Fetches the detail of the node named by the `id` route parameter and
+ * returns it with the tab and auto-refresh state of the node detail page.
+ *
+ * While auto refresh is on, the node is requested again 4 seconds after each
+ * response. Polling ends when auto refresh is switched off, when the response
+ * carries `result === false`, or when the effect is cleaned up.
+ */
+export const useNodeDetail = (props: RouteComponentProps<{ id: string }>) => {
+  const {
+    match: { params },
+  } = props;
+  const [selectedTab, setTab] = useState("info");
+  const [nodeDetail, setNode] = useState<NodeDetailExtend>();
+  const [msg, setMsg] = useState("Loading the node infos...");
+  const { namespaceMap } = useContext(GlobalContext);
+  const [isRefreshing, setRefresh] = useState(true);
+  const onRefreshChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    setRefresh(event.target.checked);
+  };
+  const handleChange = (event: React.ChangeEvent<{}>, newValue: string) => {
+    setTab(newValue);
+  };
+
+  useEffect(() => {
+    // Owned by this effect run: once the cleanup has run, a late response can
+    // neither update state nor start a second polling loop.
+    let cancelled = false;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+
+    const poll = async () => {
+      try {
+        const { data } = await getNodeDetail(params.id);
+        if (cancelled) {
+          return;
+        }
+        const { data: rspData, msg: rspMsg, result } = data;
+        if (rspData?.detail) {
+          setNode(rspData.detail);
+        }
+        if (rspMsg) {
+          setMsg(rspMsg);
+        }
+        if (result === false) {
+          setMsg("Node Query Error Please Check Node Name");
+          setRefresh(false);
+          return;
+        }
+      } catch (e) {
+        if (cancelled) {
+          return;
+        }
+        // A rejected request must not end the polling loop silently.
+        setMsg("Failed to fetch the node detail, retrying...");
+      }
+      timer = setTimeout(poll, REFRESH_INTERVAL_MS);
+    };
+
+    if (isRefreshing) {
+      poll();
+    }
+    return () => {
+      cancelled = true;
+      if (timer) {
+        clearTimeout(timer);
+      }
+    };
+  }, [isRefreshing, params.id]);
+
+  const raylet = nodeDetail?.raylet;
+
+  return {
+    params,
+    selectedTab,
+    nodeDetail,
+    msg,
+    isRefreshing,
+    onRefreshChange,
+    raylet,
+    handleChange,
+    namespaceMap,
+  };
+};
diff 
--git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts new file mode 100644 index 000000000000..96a3339ba4e8 --- /dev/null +++ b/dashboard/client/src/pages/node/hook/useNodeList.ts @@ -0,0 +1,98 @@
+import { useEffect, useState } from "react";
+import { getNodeList } from "../../../service/node";
+import { NodeDetail } from "../../../type/node";
+import { useSorter } from "../../../util/hook";
+
+/** Node fields the list can be filtered by. */
+type NodeFilterKey = "hostname" | "ip" | "state";
+
+/** Delay between the end of one node-list request and the next one. */
+const REFRESH_INTERVAL_MS = 4000;
+
+/**
+ * Polls the node list and returns it sorted and filtered, with the
+ * pagination, display-mode and auto-refresh state of the node list page.
+ *
+ * While auto refresh is on, the list is requested again 4 seconds after each
+ * response.
+ */
+export const useNodeList = () => {
+  const [nodeList, setList] = useState<NodeDetail[]>([]);
+  const [msg, setMsg] = useState("Loading the nodes infos...");
+  const [isRefreshing, setRefresh] = useState(true);
+  const [mode, setMode] = useState("table");
+  const [filter, setFilter] = useState<{ key: NodeFilterKey; val: string }[]>(
+    [],
+  );
+  const [page, setPage] = useState({ pageSize: 10, pageNo: 1 });
+  const { sorterFunc, setOrderDesc, setSortKey, sorterKey } = useSorter("cpu");
+  const changeFilter = (key: NodeFilterKey, val: string) => {
+    // Produce a new array and entry; the previous state is never edited.
+    setFilter((prev) =>
+      prev.some((e) => e.key === key)
+        ? prev.map((e) => (e.key === key ? { key, val } : e))
+        : [...prev, { key, val }],
+    );
+  };
+  const onSwitchChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    setRefresh(event.target.checked);
+  };
+
+  useEffect(() => {
+    // Owned by this effect run: once the cleanup has run, a late response can
+    // neither update state nor start a second polling loop.
+    let cancelled = false;
+    let timer: ReturnType<typeof setTimeout> | undefined;
+
+    const poll = async () => {
+      try {
+        const { data } = await getNodeList();
+        if (cancelled) {
+          return;
+        }
+        const { data: rspData, msg: rspMsg } = data;
+        setList(rspData?.summary || []);
+        setMsg(rspMsg || "");
+      } catch (e) {
+        if (cancelled) {
+          return;
+        }
+        // A rejected request must not end the polling loop silently.
+        setMsg("Failed to fetch the node list, retrying...");
+      }
+      timer = setTimeout(poll, REFRESH_INTERVAL_MS);
+    };
+
+    if (isRefreshing) {
+      poll();
+    }
+    return () => {
+      cancelled = true;
+      if (timer) {
+        clearTimeout(timer);
+      }
+    };
+  }, [isRefreshing]);
+
+  return {
+    nodeList: nodeList
+      .map((e) => ({ ...e, state: e.raylet.state }))
+      .sort((a, b) => (a.raylet.nodeId > b.raylet.nodeId ? 1 : -1))
+      .sort(sorterFunc)
+      .filter((node) =>
+        filter.every((f) => node[f.key] && node[f.key].includes(f.val)),
+      ),
+    msg,
+    isRefreshing,
+    onSwitchChange,
+    changeFilter,
+    page,
+    originalNodes: nodeList,
+    setPage: (key: string, val: number) => setPage({ ...page, [key]: val }),
+    sorterKey,
+    setSortKey,
+    setOrderDesc,
+    mode,
+    setMode,
+  };
+};
diff --git a/dashboard/client/src/pages/node/index.tsx b/dashboard/client/src/pages/node/index.tsx new file mode 100644 index 000000000000..3713fdc15748 --- /dev/null +++ b/dashboard/client/src/pages/node/index.tsx @@ -0,0 +1,392 @@ +import { + Button, + ButtonGroup, + Grid, + Paper, + Switch, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Tooltip, +} from "@material-ui/core"; +import { makeStyles } from "@material-ui/core/styles"; +import Pagination from "@material-ui/lab/Pagination"; +import dayjs from "dayjs"; +import React from "react"; +import { Link } from "react-router-dom"; +import Loading from "../../components/Loading"; +import PercentageBar from "../../components/PercentageBar"; +import { SearchInput, SearchSelect } from "../../components/SearchComponent"; +import StateCounter from "../../components/StatesCounter"; +import { StatusChip } from "../../components/StatusChip"; +import TitleCard from "../../components/TitleCard"; +import { NodeDetail } from "../../type/node"; +import { memoryConverter } from "../../util/converter"; +import { useNodeList } from "./hook/useNodeList"; + +const useStyles = makeStyles((theme) => ({ + root: { + padding: theme.spacing(2), + width: "100%", + position: "relative", + }, +})); + +const columns = [ + "State", + "ID", + "Host", + "IP", + "CPU Usage", + "Memory", + "Disk(root)", + "Sent", + "Received", + "BRPC Port", + "Time Info", + "Log", +]; + +export const brpcLinkChanger = (href: string) => { + const { location } = window; + const { pathname } = location; + const pathArr = pathname.split("/"); + if (pathArr.some((e) => 
e.split(".").length > 1)) { + const index = pathArr.findIndex((e) => e.includes(".")); + const resultArr = pathArr.slice(0, index); + resultArr.push(href); + return `${location.protocol}//${location.host}${resultArr.join("/")}`; + } + + return `http://${href}`; +}; + +export const NodeCard = (props: { node: NodeDetail }) => { + const { node } = props; + + if (!node) { + return null; + } + + const { raylet, hostname, ip, cpu, mem, net, disk, logUrl } = node; + const { nodeId, state, brpcPort } = raylet; + + return ( + +

+ {nodeId}{" "} +

+

+ + + + + + {hostname}({ip}) + + {net && net[0] >= 0 && ( + + Sent{" "} + {memoryConverter(net[0])}/s{" "} + Received{" "} + {memoryConverter(net[1])}/s + + )} + +

+ + {cpu >= 0 && ( + + CPU + + {cpu}% + + + )} + {mem && ( + + Memory + + {memoryConverter(mem[0] - mem[1])}/{memoryConverter(mem[0])}( + {mem[2]}%) + + + )} + {disk && disk["/"] && ( + + Disk('/') + + {memoryConverter(disk["/"].used)}/ + {memoryConverter(disk["/"].total)}({disk["/"].percent}%) + + + )} + + + + + + + + + +
+ ); +}; + +const Nodes = () => { + const classes = useStyles(); + const { + msg, + isRefreshing, + onSwitchChange, + nodeList, + changeFilter, + page, + setPage, + setSortKey, + setOrderDesc, + mode, + setMode, + } = useNodeList(); + + return ( +
+ + + Auto Refresh: + +
+ Request Status: {msg} +
+ + + + + + + changeFilter("hostname", value.trim())} + /> + + + changeFilter("ip", value.trim())} + /> + + + changeFilter("state", value.trim())} + options={["ALIVE", "DEAD"]} + /> + + + + setPage("pageSize", Math.min(Number(value), 500) || 10) + } + /> + + + setSortKey(val)} + /> + + + + Reverse: + setOrderDesc(checked)} /> + + + + + + + + + +
+ setPage("pageNo", pageNo)} + /> +
+ {mode === "table" && ( + + + + + {columns.map((col) => ( + + {col} + + ))} + + + + {nodeList + .slice( + (page.pageNo - 1) * page.pageSize, + page.pageNo * page.pageSize, + ) + .map( + ( + { + hostname = "", + ip = "", + cpu = 0, + mem = [], + disk, + net = [0, 0], + raylet, + logUrl, + }: NodeDetail, + i, + ) => ( + + + + + + + + {raylet.nodeId.slice(0, 5)} + + + + {hostname} + {ip} + + + {cpu}% + + + + + {memoryConverter(mem[0] - mem[1])}/ + {memoryConverter(mem[0])}({mem[2]}%) + + + + {disk && disk["/"] && ( + + {memoryConverter(disk["/"].used)}/ + {memoryConverter(disk["/"].total)}( + {disk["/"].percent}%) + + )} + + + {memoryConverter(net[0])}/s + + + {memoryConverter(net[1])}/s + + + {raylet.brpcPort && ( + + {raylet.brpcPort} + + )} + + + {!!raylet.startTime && ( +

+ Start Time:{" "} + {dayjs(raylet.startTime * 1000).format( + "YYYY/MM/DD HH:mm:ss", + )} +

+ )} + {!!raylet.terminateTime && ( +

+ End Time:{" "} + {dayjs(raylet.terminateTime * 1000).format( + "YYYY/MM/DD HH:mm:ss", + )} +

+ )} +
+ + + Log + + +
+ ), + )} +
+
+
+ )} + {mode === "card" && ( + + {nodeList + .slice( + (page.pageNo - 1) * page.pageSize, + page.pageNo * page.pageSize, + ) + .map((e) => ( + + + + ))} + + )} +
+
+ ); +}; + +export default Nodes; diff --git a/dashboard/client/src/service/actor.ts b/dashboard/client/src/service/actor.ts new file mode 100644 index 000000000000..425fd62a44de --- /dev/null +++ b/dashboard/client/src/service/actor.ts @@ -0,0 +1,14 @@ +import axios from "axios"; +import { Actor } from "../type/actor"; + +export const getActors = () => { + return axios.get<{ + result: boolean; + message: string; + data: { + actors: { + [actorId: string]: Actor; + }; + }; + }>("logical/actors"); +}; diff --git a/dashboard/client/src/service/cluster.ts b/dashboard/client/src/service/cluster.ts new file mode 100644 index 000000000000..9bf53e76dbb9 --- /dev/null +++ b/dashboard/client/src/service/cluster.ts @@ -0,0 +1,6 @@ +import axios from "axios"; +import { RayConfigRsp } from "../type/config"; + +export const getRayConfig = () => { + return axios.get("api/ray_config"); +}; diff --git a/dashboard/client/src/service/job.ts b/dashboard/client/src/service/job.ts new file mode 100644 index 000000000000..fc5d5452db68 --- /dev/null +++ b/dashboard/client/src/service/job.ts @@ -0,0 +1,10 @@ +import axios from "axios"; +import { JobDetailRsp, JobListRsp } from "../type/job"; + +export const getJobList = () => { + return axios.get("jobs?view=summary"); +}; + +export const getJobDetail = (id: string) => { + return axios.get(`jobs/${id}`); +}; diff --git a/dashboard/client/src/service/log.ts b/dashboard/client/src/service/log.ts new file mode 100644 index 000000000000..b485b12f1684 --- /dev/null +++ b/dashboard/client/src/service/log.ts @@ -0,0 +1,48 @@
+import axios from "axios";
+
+/**
+ * Fetches a log resource and returns its text or, when the response is an
+ * HTML listing, the entries of that listing.
+ *
+ * @param url `"log_index"` to request the log index, otherwise the URL of a
+ *   log directory or file, which is requested through `log_proxy`.
+ * @returns One `{ name, href }` per `<li>` of an HTML response, otherwise the
+ *   response body.
+ */
+export const getLogDetail = async (url: string) => {
+  if (window.location.pathname !== "/" && url !== "log_index") {
+    const pathArr = window.location.pathname.split("/");
+    if (pathArr.length > 1) {
+      // When the page path contains a segment with ":" (NOTE(review):
+      // presumably a host:port of a proxied dashboard — confirm), address the
+      // log host below the path that precedes that segment.
+      const idx = pathArr.findIndex((e) => e.includes(":"));
+      if (idx > -1) {
+        const afterArr = pathArr.slice(0, idx);
+        afterArr.push(url.replace(/https?:\/\//, ""));
+        url = afterArr.join("/");
+      }
+    }
+  }
+  const rsp = await axios.get(
+    url === "log_index" ? url : `log_proxy?url=${encodeURIComponent(url)}`,
+  );
+  if (rsp.headers["content-type"]?.includes("html")) {
+    // Parse into a separate, inert document: assigning fetched markup to
+    // `innerHTML` of an element of the live document lets that markup load
+    // resources and run inline handlers such as `<img onerror>`.
+    const doc = new DOMParser().parseFromString(
+      String(rsp.data),
+      "text/html",
+    );
+    return Array.from(doc.getElementsByTagName("li")).map((li) => {
+      const name = li.textContent || "";
+      // Items without a child element get an empty href.
+      const a = li.children[0] as HTMLAnchorElement | undefined;
+      const href = a ? (name.includes("http") ? a.href : a.pathname) : "";
+      return { name, href } as { [key: string]: string };
+    });
+  }
+
+  return rsp.data as string;
+};
diff --git a/dashboard/client/src/service/node.ts b/dashboard/client/src/service/node.ts new file mode 100644 index 000000000000..5eac1dc9cafb --- /dev/null +++ b/dashboard/client/src/service/node.ts @@ -0,0 +1,10 @@ +import axios from "axios"; +import { NodeDetailRsp, NodeListRsp } from "../type/node"; + +export const getNodeList = async () => { + return await axios.get("nodes?view=summary"); +}; + +export const getNodeDetail = async (id: string) => { + return await axios.get(`nodes/${id}`); +}; diff --git a/dashboard/client/src/service/util.ts b/dashboard/client/src/service/util.ts new file mode 100644 index 000000000000..966c82db2919 --- /dev/null +++ b/dashboard/client/src/service/util.ts @@ -0,0 +1,52 @@ +import axios from "axios"; + +type CMDRsp = { + result: boolean; + msg: string; + data: { + output: string; + }; +}; + +export const getJstack = (ip: string, pid: string) => { + return axios.get("utils/jstack", { + params: { + ip, + pid, + }, + }); +}; + +export const getJmap = (ip: string, pid: string) => { + return axios.get("utils/jmap", { + params: { + ip, + pid, + }, + }); +}; + +export const getJstat = (ip: string, pid: string, options: string) => { + return axios.get("utils/jstat", { + params: { + ip, + pid, + options, + }, + }); +}; + +type NamespacesRsp = { + result: boolean; + msg: string; + data: { + namespaces: { + namespaceId: string; + hostNameList: 
string[]; + }[]; + }; +}; + +export const getNamespaces = () => { + return axios.get("namespaces"); +}; diff --git a/dashboard/client/src/theme.ts b/dashboard/client/src/theme.ts new file mode 100644 index 000000000000..f83d58b5ad46 --- /dev/null +++ b/dashboard/client/src/theme.ts @@ -0,0 +1,61 @@ +import { blue, blueGrey, grey, lightBlue } from "@material-ui/core/colors"; +import { createMuiTheme } from "@material-ui/core/styles"; + +const basicTheme = { + typography: { + fontSize: 12, + fontFamily: [ + "-apple-system", + "BlinkMacSystemFont", + '"Segoe UI"', + "Roboto", + '"Helvetica Neue"', + "Arial", + "sans-serif", + '"Apple Color Emoji"', + '"Segoe UI Emoji"', + '"Segoe UI Symbol"', + ].join(","), + }, + props: { + MuiPaper: { + elevation: 0, + }, + }, +}; + +export const lightTheme = createMuiTheme({ + ...basicTheme, + palette: { + primary: blue, + secondary: lightBlue, + text: { + primary: grey[900], + secondary: grey[800], + disabled: grey[400], + hint: grey[300], + }, + background: { + paper: "#fff", + default: blueGrey[50], + }, + }, +}); + +export const darkTheme = createMuiTheme({ + ...basicTheme, + palette: { + primary: blue, + secondary: lightBlue, + text: { + primary: blueGrey[50], + secondary: blueGrey[100], + disabled: blueGrey[200], + hint: blueGrey[300], + }, + background: { + paper: grey[800], + default: grey[900], + }, + }, +}); diff --git a/dashboard/client/src/type/actor.ts b/dashboard/client/src/type/actor.ts new file mode 100644 index 000000000000..8a00c0e41269 --- /dev/null +++ b/dashboard/client/src/type/actor.ts @@ -0,0 +1,94 @@ +export enum ActorEnum { + ALIVE = "ALIVE", + PENDING = "PENDING", + RECONSTRUCTING = "RECONSTRUCTING", + DEAD = "DEAD", +} + +export type Address = { + rayletId: string; + ipAddress: string; + port: number; + workerId: string; +}; + +export type TaskSpec = { + actorCreationTaskSpec: { + actorId: string; + dynamicWorkerOptions: string[]; + extensionData: string; + isAsyncio: boolean; + isDetached: boolean; + 
maxActorRestarts: boolean; + maxConcurrency: number; + name: string; + }; + args: { + data: string; + metadata: string; + nestedInlinedIds: string[]; + objectIds: string[]; + }[]; + callerAddress: { + ipAddress: string; + port: number; + rayletId: string; + workerId: string; + }; + callerId: string; + functionDescriptor: { + javaFunctionDescriptor: { + className: string; + functionName: string; + signature: string; + }; + pythonFunctionDescriptor: { + className: string; + functionName: string; + signature: string; + }; + }; + jobId: string; + language: string; + maxRetries: number; + numReturns: string; + parentCounter: string; + parentTaskId: string; + requiredPlacementResources: { + [key: string]: number; + }; + requiredResources: { + [key: string]: number; + }; + sourceActorId: string; + taskId: string; + type: string; +}; + +export type Actor = { + actorId: string; + children: { [key: string]: Actor }; + taskSpec: TaskSpec; + ipAddress: string; + isDirectCall: boolean; + jobId: string; + numExecutedTasks: number; + numLocalObjects: number; + numObjectIdsInScope: number; + state: ActorEnum | string; // PENDING, ALIVE, RECONSTRUCTING, DEAD + taskQueueLength: number; + usedObjectStoreMemory: number; + usedResources: { [key: string]: string | number }; + timestamp: number; + actorTitle: string; + averageTaskExecutionSpeed: number; + nodeId: string; + pid: number; + ownerAddress: Address; + address: Address; + maxReconstructions: string; + remainingReconstructions: string; + isDetached: false; + name: string; + numRestarts: string; +}; diff --git a/dashboard/client/src/type/config.d.ts b/dashboard/client/src/type/config.d.ts new file mode 100644 index 000000000000..40a34a25fcd5 --- /dev/null +++ b/dashboard/client/src/type/config.d.ts @@ -0,0 +1,22 @@ +export type RayConfig = { + userName: string; + workNodeNumber: number; + headNodeNumber: number; + containerVcores: number; + containerMemory: number; + clusterName: string; + supremeFo: boolean; + jobManagerPort: 
number; + externalRedisAddresses: string; + envParams: string; + sourceCodeLink: string; + imageUrl: string; +}; + +export type RayConfigRsp = { + result: boolean; + msg: string; + data: { + config: RayConfig; + }; +}; diff --git a/dashboard/client/src/type/event.d.ts b/dashboard/client/src/type/event.d.ts new file mode 100644 index 000000000000..4f586f9a04d5 --- /dev/null +++ b/dashboard/client/src/type/event.d.ts @@ -0,0 +1,31 @@ +export type Event = { + eventId: string; + jobId: string; + nodeId: string; + sourceType: string; + sourceHostname: string; + sourcePid: number; + label: string; + message: string; + timestamp: number; + severity: string; +}; + +export type EventRsp = { + result: boolean; + msg: string; + data: { + jobId: string; + events: Event[]; + }; +}; + +export type EventGlobalRsp = { + result: boolean; + msg: string; + data: { + events: { + global: Event[]; + }; + }; +}; diff --git a/dashboard/client/src/type/job.d.ts b/dashboard/client/src/type/job.d.ts new file mode 100644 index 000000000000..c5ca4dce874c --- /dev/null +++ b/dashboard/client/src/type/job.d.ts @@ -0,0 +1,70 @@ +import { Actor } from "./actor"; +import { Worker } from "./worker"; + +export type Job = { + jobId: string; + name: string; + owner: string; + language: string; + driverEntry: string; + state: string; + timestamp: number; + namespaceId: string; + driverPid: number; + driverIpAddress: string; + isDead: boolean; +}; + +export type PythonDependenciey = string; + +export type JavaDependency = { + name: string; + version: string; + md5: string; + url: string; +}; + +export type JobInfo = { + url: string; + driverArgs: string; + customConfig: { + [k: string]: string; + }; + jvmOptions: string; + dependencies: { + python: PythonDependenciey[]; + java: JavaDependency[]; + }; + driverStarted: boolean; + submitTime: string; + startTime: null | string | number; + endTime: null | string | number; + driverIpAddress: string; + driverHostname: string; + driverPid: number; + eventUrl: 
string; + failErrorMessage: string; + driverCmdline: string; +} & Job; + +export type JobDetail = { + jobInfo: JobInfo; + jobActors: { [id: string]: Actor }; + jobWorkers: Worker[]; +}; + +export type JobDetailRsp = { + data: { + detail: JobDetail; + }; + msg: string; + result: boolean; +}; + +export type JobListRsp = { + data: { + summary: Job[]; + }; + msg: string; + result: boolean; +}; diff --git a/dashboard/client/src/type/node.d.ts b/dashboard/client/src/type/node.d.ts new file mode 100644 index 000000000000..12106d9adab0 --- /dev/null +++ b/dashboard/client/src/type/node.d.ts @@ -0,0 +1,62 @@ +import { Actor } from "./actor"; +import { Raylet } from "./raylet"; +import { Worker } from "./worker"; + +export type NodeDetail = { + now: number; + hostname: string; + ip: string; + cpu: number; // cpu usage + cpus: number[]; // Logic CPU Count, Physical CPU Count + mem: number[]; // total memory, free memory, memory used ratio + bootTime: number; // start time + loadAvg: number[][]; // recent 1,5,15 minitues system load,load per cpu http://man7.org/linux/man-pages/man3/getloadavg.3.html + disk: { + // disk used on root + "/": { + total: number; + used: number; + free: number; + percent: number; + }; + // disk used on tmp + "/tmp": { + total: number; + used: number; + free: number; + percent: number; + }; + }; + net: number[]; // sent tps, received tps + raylet: Raylet; + logCounts: number; + errorCounts: number; + actors: { [id: string]: Actor }; + cmdline: string[]; + state: string; + logUrl: string; +}; + +export type NodeListRsp = { + data: { + summary: NodeDetail[]; + }; + result: boolean; + msg: string; +}; + +export type NodeDetailExtend = { + workers: Worker[]; + raylet: Raylet; + actors: { + [actorId: string]: Actor; + }; +} & NodeDetail; + +export type NodeDetailRsp = { + data: { + detail: NodeDetailExtend; + }; + msg: string; + result: boolean; +}; diff --git a/dashboard/client/src/type/raylet.d.ts b/dashboard/client/src/type/raylet.d.ts new file mode 
100644 index 000000000000..459b4c2b9086 --- /dev/null +++ b/dashboard/client/src/type/raylet.d.ts @@ -0,0 +1,28 @@ +export type ViewMeasures = { + tags: string; + int_value?: number; + double_value?: number; + distribution_min?: number; + distribution_mean?: number; + distribution_max?: number; + distribution_count?: number; + distribution_bucket_boundaries?: number[]; + distribution_bucket_counts?: number[]; +}; + +export type ViewData = { + viewName: string; + measures: ViewMeasures[]; +}; + +export type Raylet = { + viewData: ViewData[]; + numWorkers: number; + pid: number; + nodeId: string; + nodeManagerPort: number; + brpcPort: pid; + state: string; + startTime: number; + terminateTime: number; +}; diff --git a/dashboard/client/src/type/worker.d.ts b/dashboard/client/src/type/worker.d.ts new file mode 100644 index 000000000000..cf35bfa018dd --- /dev/null +++ b/dashboard/client/src/type/worker.d.ts @@ -0,0 +1,36 @@ +export type CoreWorkerStats = { + currentTaskFuncDesc: string; + ipAddress: string; + port: string; + actorId: string; + usedResources: { [key: string]: number }; + numExecutedTasks: number; + workerId: string; + actorTitle: string; + jobId: string; +}; + +export type Worker = { + createTime: number; + cpuPercent: number; + cmdline: string[]; + memoryInfo: { + rss: number; // aka “Resident Set Size”, this is the non-swapped physical memory a process has used. On UNIX it matches “top“‘s RES column). On Windows this is an alias for wset field and it matches “Mem Usage” column of taskmgr.exe. + vms: number; // aka “Virtual Memory Size”, this is the total amount of virtual memory used by the process. On UNIX it matches “top“‘s VIRT column. On Windows this is an alias for pagefile field and it matches “Mem Usage” “VM Size” column of taskmgr.exe. + pfaults: number; // number of page faults. + pageins: number; // number of actual pageins. 
+ [key: string]: number; + }; + cpuTimes: { + user: number; + system: number; + childrenUser: number; + childrenUystem: number; + iowait?: number; + }; + pid: number; + coreWorkerStats: CoreWorkerStats[]; + language: string; + hostname: string; + ip: hostname; +}; diff --git a/dashboard/client/src/util/converter.ts b/dashboard/client/src/util/converter.ts new file mode 100644 index 000000000000..427ae86b78f3 --- /dev/null +++ b/dashboard/client/src/util/converter.ts @@ -0,0 +1,27 @@ +export const memoryConverter = (bytes: number) => { + if (bytes < 1024) { + return `${bytes}KB`; + } + + if (bytes < 1024 ** 2) { + return `${(bytes / 1024 ** 1).toFixed(2)}KB`; + } + + if (bytes < 1024 ** 3) { + return `${(bytes / 1024 ** 2).toFixed(2)}MB`; + } + + if (bytes < 1024 ** 4) { + return `${(bytes / 1024 ** 3).toFixed(2)}GB`; + } + + if (bytes < 1024 ** 5) { + return `${(bytes / 1024 ** 4).toFixed(2)}TB`; + } + + if (bytes < 1024 ** 6) { + return `${(bytes / 1024 ** 5).toFixed(2)}TB`; + } + + return ""; +}; diff --git a/dashboard/client/src/util/func.tsx b/dashboard/client/src/util/func.tsx new file mode 100644 index 000000000000..c07ef70fe85b --- /dev/null +++ b/dashboard/client/src/util/func.tsx @@ -0,0 +1,28 @@ +import { Tooltip } from "@material-ui/core"; +import React, { CSSProperties } from "react"; + +export const longTextCut = (text: string = "", len: number = 28) => ( + + {text.length > len ? text.slice(0, len) + "..." : text} + +); + +export const jsonFormat = (str: string | object) => { + const preStyle = { + textAlign: "left", + wordBreak: "break-all", + whiteSpace: "pre-wrap", + } as CSSProperties; + if (typeof str === "object") { + return
{JSON.stringify(str, null, 2)}
; + } + try { + const j = JSON.parse(str); + if (typeof j !== "object") { + return JSON.stringify(j); + } + return
{JSON.stringify(j, null, 2)}
; + } catch (e) { + return str; + } +}; diff --git a/dashboard/client/src/util/hook.ts b/dashboard/client/src/util/hook.ts new file mode 100644 index 000000000000..3c6f61b06ef8 --- /dev/null +++ b/dashboard/client/src/util/hook.ts @@ -0,0 +1,63 @@ +import { get } from "lodash"; +import { useState } from "react"; + +export const useFilter = () => { + const [filters, setFilters] = useState<{ key: KeyType; val: string }[]>([]); + const changeFilter = (key: KeyType, val: string) => { + const f = filters.find((e) => e.key === key); + if (f) { + f.val = val; + } else { + filters.push({ key, val }); + } + setFilters([...filters]); + }; + const filterFunc = (instance: { [key: string]: any }) => { + return filters.every( + (f) => !f.val || get(instance, f.key, "").toString().includes(f.val), + ); + }; + + return { + changeFilter, + filterFunc, + }; +}; + +export const useSorter = (initialSortKey?: string) => { + const [sorter, setSorter] = useState({ + key: initialSortKey || "", + desc: false, + }); + + const sorterFunc = ( + instanceA: { [key: string]: any }, + instanceB: { [key: string]: any }, + ) => { + if (!sorter.key) { + return 0; + } + + let [b, a] = [instanceA, instanceB]; + if (sorter.desc) { + [a, b] = [instanceA, instanceB]; + } + + if (!get(a, sorter.key)) { + return -1; + } + + if (!get(b, sorter.key)) { + return 1; + } + + return get(a, sorter.key) > get(b, sorter.key) ? 
1 : -1; + }; + + return { + sorterFunc, + setSortKey: (key: string) => setSorter({ ...sorter, key }), + setOrderDesc: (desc: boolean) => setSorter({ ...sorter, desc }), + sorterKey: sorter.key, + }; +}; diff --git a/dashboard/client/src/util/localData.ts b/dashboard/client/src/util/localData.ts new file mode 100644 index 000000000000..0066c4788b95 --- /dev/null +++ b/dashboard/client/src/util/localData.ts @@ -0,0 +1,12 @@ +export const getLocalStorage = (key: string) => { + const data = window.localStorage.getItem(key); + try { + return JSON.parse(data || "") as T; + } catch { + return data; + } +}; + +export const setLocalStorage = (key: string, value: any) => { + return window.localStorage.setItem(key, JSON.stringify(value)); +}; From 0f3a3e14aafb5c339c72fb0536e46630d9301ad7 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 29 Jan 2021 20:24:09 +0800 Subject: [PATCH 096/245] Only delete local object in CoreWorkerPlasmaStoreProvider:::WarmupStore (#13788) --- src/ray/core_worker/store_provider/plasma_store_provider.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ray/core_worker/store_provider/plasma_store_provider.cc b/src/ray/core_worker/store_provider/plasma_store_provider.cc index 831f2629a9b1..a8f116287228 100644 --- a/src/ray/core_worker/store_provider/plasma_store_provider.cc +++ b/src/ray/core_worker/store_provider/plasma_store_provider.cc @@ -429,7 +429,7 @@ Status CoreWorkerPlasmaStoreProvider::WarmupStore() { RAY_RETURN_NOT_OK(Create(nullptr, 8, object_id, rpc::Address(), &data)); RAY_RETURN_NOT_OK(Seal(object_id)); RAY_RETURN_NOT_OK(Release(object_id)); - RAY_RETURN_NOT_OK(Delete({object_id}, false)); + RAY_RETURN_NOT_OK(Delete({object_id}, true)); return Status::OK(); } From 9a413144b1b89b31dca9a0ecf97435b42d43256a Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Fri, 29 Jan 2021 17:14:46 +0100 Subject: [PATCH 097/245] [tune] dynamic global checkpointing interval (#13736) * Add scalability tests * Move experiment checkpointing 
into a manager class * Dynamic global checkpointing * Actually write checkpoints * Remove debug message * Pass `force` * Pre-review * Revert scalability commits * Revert scalability commits * Apply suggestions from code review --- python/ray/tune/tests/test_trial_runner_3.py | 19 ++ python/ray/tune/trial_runner.py | 195 ++++++++++++++----- python/ray/tune/utils/util.py | 6 +- 3 files changed, 169 insertions(+), 51 deletions(-) diff --git a/python/ray/tune/tests/test_trial_runner_3.py b/python/ray/tune/tests/test_trial_runner_3.py index ab10112d47d4..b0c4a7063546 100644 --- a/python/ray/tune/tests/test_trial_runner_3.py +++ b/python/ray/tune/tests/test_trial_runner_3.py @@ -695,6 +695,25 @@ def num_checkpoints(trial): self.assertTrue(trials[0].has_checkpoint()) self.assertEqual(num_checkpoints(trials[0]), 2) + @patch("ray.tune.syncer.CLOUD_SYNC_PERIOD", 0) + def testCheckpointAutoPeriod(self): + # This makes checkpointing take 2 seconds. + def sync_up(source, target): + time.sleep(2) + return True + + runner = TrialRunner( + local_checkpoint_dir=self.tmpdir, + checkpoint_period="auto", + sync_to_cloud=sync_up, + remote_checkpoint_dir="fake") + runner.add_trial(Trial("__fake", config={"user_checkpoint_freq": 1})) + + runner.step() # Run one step, this will trigger checkpointing + + self.assertGreaterEqual(runner._checkpoint_manager._checkpoint_period, + 38.) 
+ class SearchAlgorithmTest(unittest.TestCase): @classmethod diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index c487190f7f66..d8b45b19bc7f 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union import click from datetime import datetime @@ -16,11 +16,11 @@ from ray.tune.ray_trial_executor import RayTrialExecutor from ray.tune.result import (DEFAULT_METRIC, TIME_THIS_ITER_S, RESULT_DUPLICATE, SHOULD_CHECKPOINT) -from ray.tune.syncer import get_cloud_syncer +from ray.tune.syncer import CloudSyncer, get_cloud_syncer from ray.tune.trial import Checkpoint, Trial from ray.tune.schedulers import FIFOScheduler, TrialScheduler -from ray.tune.suggest import BasicVariantGenerator -from ray.tune.utils import warn_if_slow, flatten_dict, env_integer +from ray.tune.suggest import BasicVariantGenerator, SearchAlgorithm +from ray.tune.utils import warn_if_slow, flatten_dict from ray.tune.utils.log import Verbosity, has_verbosity from ray.tune.utils.placement_groups import TUNE_MAX_PENDING_TRIALS_PG from ray.tune.utils.serialization import TuneFunctionDecoder, \ @@ -42,6 +42,106 @@ def _find_newest_ckpt(ckpt_dir): return max(full_paths) +class _ExperimentCheckpointManager: + """Helper class for managing experiment-level checkpoints. + + This class implements the ``checkpoint()`` method used to checkpoint + experiment state. When called, this will serialize and write to disk + the state of the trial runner, trial executor, and search algorithm, to + a specified checkpoint file. + + The checkpoint period is automatically adjusted to + ``max(10, time_per_checkpoint * 19)``. This means that at most 5% of the + time (1/20) will be used for writing checkpoints, while 95% of the time + (19/20) will be used to handle the rest of the training loop. 
+ + """ + + def __init__(self, checkpoint_dir: str, + checkpoint_period: Union[int, float, str], start_time: float, + session_str: str, syncer: CloudSyncer): + self._checkpoint_dir = checkpoint_dir + self._auto_checkpoint_enabled = checkpoint_period == "auto" + if self._auto_checkpoint_enabled: + self._checkpoint_period = 10. # Initial value + else: + self._checkpoint_period = float(checkpoint_period) + + self._start_time = start_time + self._session_str = session_str + + self._syncer = syncer + + self._last_checkpoint_time = 0. + + @property + def auto_checkpoint_enabled(self): + return self._auto_checkpoint_enabled + + def checkpoint(self, + checkpoint_file: str, + trial_runner: "TrialRunner", + trial_executor: RayTrialExecutor, + search_alg: SearchAlgorithm, + force=False): + """Saves execution state to `self._local_checkpoint_dir`. + + Overwrites the current session checkpoint, which starts when self + is instantiated. Throttle depends on self._checkpoint_period. + + Also automatically saves the search algorithm to the local + checkpoint dir. + + Args: + force (bool): Forces a checkpoint despite checkpoint_period. 
+ """ + if not self._checkpoint_dir: + return + + now = time.time() + if now - self._last_checkpoint_time < self._checkpoint_period and ( + not force): + return + + def _serialize_and_write(): + runner_state = { + "checkpoints": list(trial_executor.get_checkpoints().values()), + "runner_data": trial_runner.__getstate__(), + "stats": { + "start_time": self._start_time, + "timestamp": self._last_checkpoint_time + } + } + tmp_file_name = os.path.join(self._checkpoint_dir, + ".tmp_checkpoint") + with open(tmp_file_name, "w") as f: + json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) + + os.replace(tmp_file_name, checkpoint_file) + search_alg.save_to_dir( + self._checkpoint_dir, session_str=self._session_str) + + checkpoint_time_start = time.monotonic() + _serialize_and_write() + if force: + self._syncer.sync_up() + else: + self._syncer.sync_up_if_needed() + checkpoint_time_taken = time.monotonic() - checkpoint_time_start + + if self._auto_checkpoint_enabled: + # Multiplying this time by 19 means we spend ~5% of the time + # writing global checkpoints and 95% of the time processing trials + self._checkpoint_period = max(10., checkpoint_time_taken * 19) + logger.debug(f"Global experiment checkpointing took " + f"{checkpoint_time_taken:.2f} seconds. " + f"Adjusting checkpoint period to " + f"{self._checkpoint_period:.2f} seconds.") + + self._last_checkpoint_time = time.time() + return self._checkpoint_dir + + class TrialRunner: """A TrialRunner implements the event loop for scheduling trials on Ray. @@ -82,8 +182,10 @@ class TrialRunner: If fail_fast='raise' provided, Tune will automatically raise the exception received by the Trainable. fail_fast='raise' can easily leak resources and should be used with caution. - checkpoint_period (int): Trial runner checkpoint periodicity in - seconds. Defaults to 10. + checkpoint_period (int|str): Trial runner checkpoint periodicity in + seconds. 
Defaults to ``"auto"``, which adjusts checkpointing + time so that at most 5% of the time is spent on writing + checkpoints. trial_executor (TrialExecutor): Defaults to RayTrialExecutor. callbacks (list): List of callbacks that will be called at different times in the training loop. Must be instances of the @@ -183,9 +285,7 @@ def __init__(self, self._start_time = time.time() self._last_checkpoint_time = -float("inf") - if checkpoint_period is None: - checkpoint_period = env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10) - self._checkpoint_period = checkpoint_period + self._session_str = datetime.fromtimestamp( self._start_time).strftime("%Y-%m-%d_%H-%M-%S") self.checkpoint_file = None @@ -196,6 +296,20 @@ def __init__(self, self._callbacks = CallbackList(callbacks or []) + if checkpoint_period is None: + checkpoint_period = os.getenv("TUNE_GLOBAL_CHECKPOINT_S", "auto") + + self._checkpoint_period = checkpoint_period + self._checkpoint_manager = self._create_checkpoint_manager() + + def _create_checkpoint_manager(self): + return _ExperimentCheckpointManager( + checkpoint_dir=self._local_checkpoint_dir, + checkpoint_period=self._checkpoint_period, + start_time=self._start_time, + session_str=self._session_str, + syncer=self._syncer) + @property def resumed(self): return self._resumed @@ -269,36 +383,23 @@ def checkpoint(self, force=False): Args: force (bool): Forces a checkpoint despite checkpoint_period. 
""" - if not self._local_checkpoint_dir: - return - now = time.time() - if now - self._last_checkpoint_time < self._checkpoint_period and ( - not force): - return - self._last_checkpoint_time = now - runner_state = { - "checkpoints": list( - self.trial_executor.get_checkpoints().values()), - "runner_data": self.__getstate__(), - "stats": { - "start_time": self._start_time, - "timestamp": self._last_checkpoint_time - } - } - tmp_file_name = os.path.join(self._local_checkpoint_dir, - ".tmp_checkpoint") - with open(tmp_file_name, "w") as f: - json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) - - os.replace(tmp_file_name, self.checkpoint_file) - self._search_alg.save_to_dir( - self._local_checkpoint_dir, session_str=self._session_str) - - if force: - self._syncer.sync_up() - else: - self._syncer.sync_up_if_needed() - return self._local_checkpoint_dir + with warn_if_slow( + "experiment_checkpoint", + message="Checkpointing the experiment state took " + "{duration:.3f} s, which may be a performance " + "bottleneck. Please ensure the " + "`TUNE_GLOBAL_CHECKPOINT_S` environment variable is " + "something significantly higher than this duration " + "to ensure compute time is mostly spent on the main " + "training loop.", + disable=self._checkpoint_manager.auto_checkpoint_enabled): + + self._checkpoint_manager.checkpoint( + checkpoint_file=self.checkpoint_file, + trial_runner=self, + trial_executor=self.trial_executor, + search_alg=self._search_alg, + force=force) def resume(self, run_errored_only=False): """Resumes all checkpointed trials from previous run. @@ -406,16 +507,7 @@ def _start_trial(trial: Trial) -> bool: self._stop_experiment_if_needed() try: - with warn_if_slow( - "experiment_checkpoint", - message="Checkpointing the experiment state took " - "{duration:.3f} s, which may be a performance " - "bottleneck. 
Please ensure the " - "`TUNE_GLOBAL_CHECKPOINT_S` environment variable is " - "something significantly higher than this duration " - "to ensure compute time is mostly spent on the main " - "training loop."): - self.checkpoint() + self.checkpoint() except Exception as e: logger.warning(f"Trial Runner checkpointing failed: {str(e)}") self._iteration += 1 @@ -1028,7 +1120,8 @@ def __getstate__(self): for k in [ "_trials", "_stop_queue", "_server", "_search_alg", "_scheduler_alg", "_pending_trial_queue_times", - "trial_executor", "_syncer", "_callbacks" + "trial_executor", "_syncer", "_callbacks", + "_checkpoint_manager" ]: del state[k] state["launch_web_server"] = bool(self._server) @@ -1045,5 +1138,7 @@ def __setstate__(self, state): self.__dict__.setdefault("_start_time", start_time) self.__dict__.update(state) + self._checkpoint_manager = self._create_checkpoint_manager() + if launch_web_server: self._server = TuneServer(self, self._server_port) diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py index 47a6b648eb1a..688261fdb2c0 100644 --- a/python/ray/tune/utils/util.py +++ b/python/ray/tune/utils/util.py @@ -133,11 +133,13 @@ class warn_if_slow: def __init__(self, name: str, threshold: Optional[float] = None, - message: Optional[str] = None): + message: Optional[str] = None, + disable: bool = False): self.name = name self.threshold = threshold or self.DEFAULT_THRESHOLD self.message = message or self.DEFAULT_MESSAGE self.too_slow = False + self.disable = disable def __enter__(self): self.start = time.time() @@ -145,6 +147,8 @@ def __enter__(self): def __exit__(self, type, value, traceback): now = time.time() + if self.disable: + return if now - self.start > self.threshold and now - START_OF_TIME > 60.0: self.too_slow = True duration = now - self.start From 4d6817c6832f64ae7340fb62989eb28b7c1ff3d1 Mon Sep 17 00:00:00 2001 From: Ameer Haj Ali Date: Fri, 29 Jan 2021 19:41:56 +0200 Subject: [PATCH 098/245] [autoscaler] Better validation for 
min_workers and max_workers (#13779) * prepare for head node * move command runner interface outside _private * remove space * Eric * flake * min_workers in multi node type * fixing edge cases * eric not idle * fix target_workers to consider min_workers of node types * idle timeout * minor * minor fix * test * lint * eric v2 * eric 3 * min_workers constraint before bin packing * Update resource_demand_scheduler.py * Revert "Update resource_demand_scheduler.py" This reverts commit 818a63a2c86d8437b3ef21c5035d701c1d1127b5. * reducing diff * make get_nodes_to_launch return a dict * merge * weird merge fix * auto fill instance types for AWS * Alex/Eric * Update doc/source/cluster/autoscaling.rst * merge autofill and input from user * logger.exception * make the yaml use the default autofill * docs Eric * remove test_autoscaler_yaml from windows tests * lets try changing the test a bit * return test * lets see * edward * Limit max launch concurrency * commenting frac TODO * move to resource demand scheduler * use STATUS UP TO DATE * Eric * make logger of gc freed refs debug instead of info * add cluster name to docker mount prefix directory * grrR * fix tests * moving docker directory to sdk * move the import to prevent circular dependency * smallf fix * ian * fix max launch concurrency bug to assume failing nodes as pending and consider only load_metric's connected nodes as running * small fix * deflake test_joblib * lint * placement groups bypass * remove space * Eric * first ocmmit * lint * exmaple * documentation * hmm * file path fix * fix test * some format issue in docs * modified docs * joblib strikes again on windows * add ability to not start autoscaler/monitor * a * remove worker_default * Remove default pod type from operator * Remove worker_default_node_type from rewrite_legacy_yaml_to_availble_node_types * deprecate useless fields * fix error msg * validate sum min_workers < max_workers * 1 more edge case test * lint Co-authored-by: Ameer Haj Ali 
Co-authored-by: Alex Wu Co-authored-by: Alex Wu Co-authored-by: Eric Liang Co-authored-by: Ameer Haj Ali Co-authored-by: root Co-authored-by: Dmitri Gekhtman --- python/ray/autoscaler/_private/util.py | 8 ++++++++ python/ray/tests/test_autoscaler_yaml.py | 25 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 2bd1e13e9c38..32758dec649f 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -86,6 +86,14 @@ def validate_config(config: Dict[str, Any]) -> None: raise ValueError( "`head_node_type` must be one of `available_node_types`.") + sum_min_workers = sum( + config["available_node_types"][node_type].get("min_workers", 0) + for node_type in config["available_node_types"]) + if sum_min_workers > config["max_workers"]: + raise ValueError( + "The specified global `max_workers` is smaller than the " + "sum of `min_workers` of all the available node types.") + def prepare_config(config): with_defaults = fillout_defaults(config) diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index b712c8955e97..e5220771f389 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -45,6 +45,31 @@ def testValidateDefaultConfig(self): except Exception: self.fail("Config did not pass validation test!") + def testValidateDefaultConfigMinMaxWorkers(self): + aws_config_path = os.path.join( + RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") + with open(aws_config_path) as f: + config = yaml.safe_load(f) + config = prepare_config(config) + for node_type in config["available_node_types"]: + config["available_node_types"][node_type]["resources"] = config[ + "available_node_types"][node_type].get("resources", {}) + try: + validate_config(config) + except Exception: + self.fail("Config did not pass validation test!") + + config["max_workers"] = 0 # 
the sum of min_workers is 1. + with pytest.raises(ValueError): + validate_config(config) + + # make sure edge case of exactly 1 passes too. + config["max_workers"] = 1 + try: + validate_config(config) + except Exception: + self.fail("Config did not pass validation test!") + @pytest.mark.skipif( sys.platform.startswith("win"), reason="TODO(ameer): fails on Windows.") From b20a38febb41cb2eafad6f1882a5cd3b929c8f6f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 29 Jan 2021 09:50:28 -0800 Subject: [PATCH 099/245] [autoscaler] Avoid launching GPU nodes when the workload only has CPU tasks. (#13776) * wip * avoid gpus * update * update --- python/ray/autoscaler/_private/constants.py | 3 ++ .../_private/resource_demand_scheduler.py | 13 ++++++- .../tests/test_resource_demand_scheduler.py | 37 +++++++++++++++++-- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/python/ray/autoscaler/_private/constants.py b/python/ray/autoscaler/_private/constants.py index 3fd3ec65e095..2fbf6ec325e4 100644 --- a/python/ray/autoscaler/_private/constants.py +++ b/python/ray/autoscaler/_private/constants.py @@ -15,6 +15,9 @@ def env_integer(key, default): # Whether event logging to driver is enabled. Set to 0 to disable. AUTOSCALER_EVENTS = env_integer("AUTOSCALER_EVENTS", 1) +# Whether to avoid launching GPU nodes for CPU only tasks. 
+AUTOSCALER_CONSERVE_GPU_NODES = env_integer("AUTOSCALER_CONSERVE_GPU_NODES", 1) + # How long to wait for a node to start, in seconds NODE_START_WAIT_S = env_integer("AUTOSCALER_NODE_START_WAIT_S", 900) diff --git a/python/ray/autoscaler/_private/resource_demand_scheduler.py b/python/ray/autoscaler/_private/resource_demand_scheduler.py index 523fd7d2f028..0a08e0579b2e 100644 --- a/python/ray/autoscaler/_private/resource_demand_scheduler.py +++ b/python/ray/autoscaler/_private/resource_demand_scheduler.py @@ -17,6 +17,7 @@ from ray.autoscaler.node_provider import NodeProvider from ray.gcs_utils import PlacementGroupTableData from ray.core.generated.common_pb2 import PlacementStrategy +from ray.autoscaler._private.constants import AUTOSCALER_CONSERVE_GPU_NODES from ray.autoscaler.tags import ( TAG_RAY_USER_NODE_TYPE, NODE_KIND_UNMANAGED, NODE_TYPE_LEGACY_WORKER, NODE_KIND_WORKER, NODE_TYPE_LEGACY_HEAD, TAG_RAY_NODE_KIND, NODE_KIND_HEAD) @@ -639,7 +640,7 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], # resources. This will behave properly with the current utilization # score heuristic, but it's a little dangerous and misleading. logger.warning( - f"The autoscaler could not find a node type to satisfy the" + f"The autoscaler could not find a node type to satisfy the " f"request: {resources}. If this request is related to " f"placement groups the resource request will resolve itself, " f"otherwise please specify a node type with the necessary " @@ -664,8 +665,16 @@ def get_nodes_for(node_types: Dict[NodeType, NodeTypeConfigDict], def _utilization_score(node_resources: ResourceDict, - resources: ResourceDict) -> float: + resources: List[ResourceDict]) -> float: remaining = copy.deepcopy(node_resources) + is_gpu_node = "GPU" in node_resources + any_gpu_task = any("GPU" in r for r in resources) + + # Avoid launching GPU nodes if there aren't any GPU tasks at all. Note that + # if there *is* a GPU task, then CPU tasks can be scheduled as well. 
+ if AUTOSCALER_CONSERVE_GPU_NODES: + if is_gpu_node and not any_gpu_task: + return None fittable = [] for r in resources: diff --git a/python/ray/tests/test_resource_demand_scheduler.py b/python/ray/tests/test_resource_demand_scheduler.py index 536cbe18bc5a..977c2f2b8148 100644 --- a/python/ray/tests/test_resource_demand_scheduler.py +++ b/python/ray/tests/test_resource_demand_scheduler.py @@ -105,6 +105,14 @@ def test_util_score(): (8, 8) +def test_gpu_node_util_score(): + # Avoid scheduling CPU tasks on GPU node. + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1}]) is None + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"CPU": 1, "GPU": 1}]) \ + == (1.0, 1.0) + assert _utilization_score({"GPU": 1, "CPU": 1}, [{"GPU": 1}]) == (0.0, 0.5) + + def test_bin_pack(): assert get_bin_pack_residual([], [{"GPU": 2}, {"GPU": 2}])[0] == \ [{"GPU": 2}, {"GPU": 2}] @@ -247,6 +255,32 @@ def test_get_nodes_packing_heuristic(): } +def test_gpu_node_avoid_cpu_task(): + types = { + "cpu": { + "resources": { + "CPU": 1 + }, + "max_workers": 10, + }, + "gpu": { + "resources": { + "GPU": 1, + "CPU": 100, + }, + "max_workers": 10, + }, + } + r1 = [{"CPU": 1}] * 100 + assert get_nodes_for(types, {}, "empty_node", 100, r1) == {"cpu": 10} + r2 = [{"GPU": 1}] + [{"CPU": 1}] * 100 + assert get_nodes_for(types, {}, "empty_node", 100, r2) == \ + {"gpu": 1} + r3 = [{"GPU": 1}] * 4 + [{"CPU": 1}] * 404 + assert get_nodes_for(types, {}, "empty_node", 100, r3) == \ + {"gpu": 4, "cpu": 4} + + def test_get_nodes_respects_max_limit(): types = { "m4.large": { @@ -2029,7 +2063,6 @@ def testRequestResourcesIdleTimeout(self): "node_config": {}, "resources": { "CPU": 2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3 @@ -2146,7 +2179,6 @@ def testRequestResourcesRaceConditionsLong(self): "node_config": {}, "resources": { "CPU": 2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3, @@ -2260,7 +2292,6 @@ def testRequestResourcesRaceConditionWithMinWorker(self): "node_config": {}, "resources": { "CPU": 
2, - "GPU": 1, "WORKER": 1 }, "max_workers": 3, From 0b598c0f05d14cc2dfce12d275423e233de5e0bc Mon Sep 17 00:00:00 2001 From: "Siyuan (Ryans) Zhuang" Date: Fri, 29 Jan 2021 10:27:05 -0800 Subject: [PATCH 100/245] [Serialization] API for deregistering serializers; code & doc cleanup (#13471) * make methods private, remove confusion brackets and usages * unregister serializer; fix doc * Cleanup doc * rename unregister -> deregister --- doc/source/serialization.rst | 99 ++++++++++++++------------ python/ray/actor.py | 2 +- python/ray/serialization.py | 17 ++--- python/ray/tests/test_serialization.py | 7 ++ python/ray/util/__init__.py | 3 +- python/ray/util/serialization.py | 11 +++ 6 files changed, 84 insertions(+), 55 deletions(-) diff --git a/doc/source/serialization.rst b/doc/source/serialization.rst index a5e58a339f6f..b36d48627e8f 100644 --- a/doc/source/serialization.rst +++ b/doc/source/serialization.rst @@ -5,24 +5,24 @@ Serialization Since Ray processes do not share memory space, data transferred between workers and nodes will need to **serialized** and **deserialized**. Ray uses the `Plasma object store `_ to efficiently transfer objects across different processes and different nodes. Numpy arrays in the object store are shared between workers on the same node (zero-copy deserialization). +Overview +-------- + +Ray has decided to use a customized `Pickle protocol version 5 `_ backport to replace the original PyArrow serializer. This gets rid of several previous limitations (e.g. cannot serialize recursive objects). + +Ray is currently compatible with Pickle protocol version 5, while Ray supports serialization of a wider range of objects (e.g. lambda & nested functions, dynamic classes) with the help of cloudpickle. + .. _plasma-store: Plasma Object Store -------------------- +~~~~~~~~~~~~~~~~~~~ Plasma is an in-memory object store that is being developed as part of Apache Arrow. 
Ray uses Plasma to efficiently transfer objects across different processes and different nodes. All objects in Plasma object store are **immutable** and held in shared memory. This is so that they can be accessed efficiently by many workers on the same node. Each node has its own object store. When data is put into the object store, it does not get automatically broadcasted to other nodes. Data remains local to the writer until requested by another task or actor on another node. -Overview --------- - -Ray has decided to use a customized `Pickle protocol version 5 `_ backport to replace the original PyArrow serializer. This gets rid of several previous limitations (e.g. cannot serialize recursive objects). - -Ray is currently compatible with Pickle protocol version 5, while Ray supports serialization of a wider range of objects (e.g. lambda & nested functions, dynamic classes) with the help of cloudpickle. - Numpy Arrays ------------- +~~~~~~~~~~~~ Ray optimizes for numpy arrays by using Pickle protocol 5 with out-of-band data. The numpy array is stored as a read-only object, and all Ray workers on the same node can read the numpy array in the object store without copying (zero-copy reads). Each numpy array object in the worker process holds a pointer to the relevant array held in shared memory. Any writes to the read-only object will require the user to first copy it into the local process memory. @@ -48,7 +48,7 @@ Serialization notes - Lock objects are mostly unserializable, because copying a lock is meaningless and could cause serious concurrency problems. You may have to come up with a workaround if your object contains a lock. 
Customized Serialization -________________________ +------------------------ Sometimes you may want to customize your serialization process because the default serializer used by Ray (pickle5 + cloudpickle) does @@ -61,29 +61,29 @@ There are at least 3 ways to define your custom serialization process: function inside the corresponding class. This is commonly done by most Python libraries. Example code: -.. code-block:: python + .. code-block:: python - import ray - import sqlite3 + import ray + import sqlite3 - ray.init() + ray.init() - class DBConnection: - def __init__(self, path): - self.path = path - self.conn = sqlite3.connect(path) + class DBConnection: + def __init__(self, path): + self.path = path + self.conn = sqlite3.connect(path) - # without '__reduce__', the instance is unserializable. - def __reduce__(self): - deserializer = DBConnection - serialized_data = (self.path,) - return deserializer, serialized_data + # without '__reduce__', the instance is unserializable. + def __reduce__(self): + deserializer = DBConnection + serialized_data = (self.path,) + return deserializer, serialized_data - original = DBConnection("/tmp/db") - print(original.conn) + original = DBConnection("/tmp/db") + print(original.conn) - copied = ray.get(ray.put(original)) - print(copied.conn) + copied = ray.get(ray.put(original)) + print(copied.conn) 2. If you want to customize the serialization of a type of objects, but you cannot access or modify the corresponding class, you can @@ -112,8 +112,17 @@ There are at least 3 ways to define your custom serialization process: A, serializer=custom_serializer, deserializer=custom_deserializer) ray.get(ray.put(A(1))) # success! + # You can deregister the serializer at any time. + ray.util.deregister_serializer(A) + ray.get(ray.put(A(1))) # fail! + + # Nothing happens when deregister an unavailable serializer. + ray.util.deregister_serializer(A) + NOTE: Serializers are managed locally for each Ray worker. 
So for every Ray worker, - if you want to use the serializer, you need to register the serializer. + if you want to use the serializer, you need to register the serializer. Deregister + a serializer also only applies locally. + If you register a new serializer for a class, the new serializer would replace the old serializer immediately in the worker. This API is also idempotent, there are no side effects caused by re-registering the same serializer. @@ -121,29 +130,29 @@ There are at least 3 ways to define your custom serialization process: 3. We also provide you an example, if you want to customize the serialization of a specific object: -.. code-block:: python + .. code-block:: python - import threading + import threading - class A: - def __init__(self, x): - self.x = x - self.lock = threading.Lock() # could not serialize! + class A: + def __init__(self, x): + self.x = x + self.lock = threading.Lock() # could not serialize! - ray.get(ray.put(A(1))) # fail! + ray.get(ray.put(A(1))) # fail! - class SerializationHelperForA: - """A helper class for serialization.""" - def __init__(self, a): - self.a = a + class SerializationHelperForA: + """A helper class for serialization.""" + def __init__(self, a): + self.a = a - def __reduce__(self): - return A, (self.a.x,) + def __reduce__(self): + return A, (self.a.x,) - ray.get(ray.put(SerializationHelperForA(A(1)))) # success! - # the serializer only works for a specific object, not all A - # instances, so we still expect failure here. - ray.get(ray.put(A(1))) # still fail! + ray.get(ray.put(SerializationHelperForA(A(1)))) # success! + # the serializer only works for a specific object, not all A + # instances, so we still expect failure here. + ray.get(ray.put(A(1))) # still fail! 
Troubleshooting diff --git a/python/ray/actor.py b/python/ray/actor.py index 547a2929db15..7ff9f1f33e04 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -937,7 +937,7 @@ def _deserialization_helper(cls, state, outer_object_ref=None): def __reduce__(self): """This code path is used by pickling but not by Ray forking.""" state = self._serialization_helper() - return ActorHandle._deserialization_helper, (state) + return ActorHandle._deserialization_helper, state def modify_class(cls): diff --git a/python/ray/serialization.py b/python/ray/serialization.py index 724cf477ef61..a2009e4fd453 100644 --- a/python/ray/serialization.py +++ b/python/ray/serialization.py @@ -31,7 +31,7 @@ class DeserializationError(Exception): pass -def object_ref_deserializer(reduced_obj_ref, owner_address): +def _object_ref_deserializer(binary, owner_address): # NOTE(suquark): This function should be a global function so # cloudpickle can access it directly. Otherwise couldpickle # has to dump the whole function definition, which is inefficient. @@ -40,9 +40,7 @@ def object_ref_deserializer(reduced_obj_ref, owner_address): # the core worker to resolve the value. This is to make sure # that the ref count for the ObjectRef is greater than 0 by the # time the core worker resolves the value of the object. - - # UniqueIDs are serialized as (class name, (unique bytes,)). - obj_ref = reduced_obj_ref[0](*reduced_obj_ref[1]) + obj_ref = ray.ObjectRef(binary) # TODO(edoakes): we should be able to just capture a reference # to 'self' here instead, but this function is itself pickled @@ -61,7 +59,7 @@ def object_ref_deserializer(reduced_obj_ref, owner_address): return obj_ref -def actor_handle_deserializer(serialized_obj): +def _actor_handle_deserializer(serialized_obj): # If this actor handle was stored in another object, then tell the # core worker. 
context = ray.worker.global_worker.get_serialization_context() @@ -85,7 +83,7 @@ def actor_handle_reducer(obj): serialized, actor_handle_id = obj._serialization_helper() # Update ref counting for the actor handle self.add_contained_object_ref(actor_handle_id) - return actor_handle_deserializer, (serialized, ) + return _actor_handle_deserializer, (serialized, ) self._register_cloudpickle_reducer(ray.actor.ActorHandle, actor_handle_reducer) @@ -96,13 +94,16 @@ def object_ref_reducer(obj): worker.check_connected() obj, owner_address = ( worker.core_worker.serialize_and_promote_object_ref(obj)) - return object_ref_deserializer, (obj.__reduce__(), owner_address) + return _object_ref_deserializer, (obj.binary(), owner_address) self._register_cloudpickle_reducer(ray.ObjectRef, object_ref_reducer) def _register_cloudpickle_reducer(self, cls, reducer): pickle.CloudPickler.dispatch[cls] = reducer + def _unregister_cloudpickle_reducer(self, cls): + pickle.CloudPickler.dispatch.pop(cls, None) + def _register_cloudpickle_serializer(self, cls, custom_serializer, custom_deserializer): def _CloudPicklerReducer(obj): @@ -198,7 +199,7 @@ def _deserialize_object(self, data, metadata, object_ref): elif metadata_fields[ 0] == ray_constants.OBJECT_METADATA_TYPE_ACTOR_HANDLE: obj = self._deserialize_msgpack_data(data, metadata_fields) - return actor_handle_deserializer(obj) + return _actor_handle_deserializer(obj) # Otherwise, return an exception object based on # the error type. 
try: diff --git a/python/ray/tests/test_serialization.py b/python/ray/tests/test_serialization.py index 8c72ba209420..7b5f32f96a70 100644 --- a/python/ray/tests/test_serialization.py +++ b/python/ray/tests/test_serialization.py @@ -616,6 +616,13 @@ def custom_deserializer(x): A, serializer=custom_serializer, deserializer=custom_deserializer) ray.get(ray.put(A(1))) + ray.util.deregister_serializer(A) + with pytest.raises(Exception): + ray.get(ray.put(A(1))) + + # deregister again takes no effects + ray.util.deregister_serializer(A) + if __name__ == "__main__": import pytest diff --git a/python/ray/util/__init__.py b/python/ray/util/__init__.py index b2dc97bbd41a..b682f15dc878 100644 --- a/python/ray/util/__init__.py +++ b/python/ray/util/__init__.py @@ -6,7 +6,7 @@ from ray.util.placement_group import (placement_group, placement_group_table, remove_placement_group) from ray.util import rpdb as pdb -from ray.util.serialization import register_serializer +from ray.util.serialization import register_serializer, deregister_serializer from ray.util.client_connect import connect, disconnect @@ -25,4 +25,5 @@ "connect", "disconnect", "register_serializer", + "deregister_serializer", ] diff --git a/python/ray/util/serialization.py b/python/ray/util/serialization.py index a93bbab55acb..cb9e2b1b9dac 100644 --- a/python/ray/util/serialization.py +++ b/python/ray/util/serialization.py @@ -16,3 +16,14 @@ def register_serializer(cls, *, serializer, deserializer): """ context = ray.worker.global_worker.get_serialization_context() context._register_cloudpickle_serializer(cls, serializer, deserializer) + + +def deregister_serializer(cls): + """Deregister the serializer associated with the type ``cls``. + There is no effect if the serializer is unavailable. + + Args: + cls: A Python class/type. 
+ """ + context = ray.worker.global_worker.get_serialization_context() + context._unregister_cloudpickle_reducer(cls) From 1a9a0024d56cbf09bf87d9651e0a8ac8a7f22e63 Mon Sep 17 00:00:00 2001 From: Ian Rodney Date: Fri, 29 Jan 2021 12:28:40 -0800 Subject: [PATCH 101/245] [Wheel] Build Py36 & Py38 in separate deploy (#13797) --- .travis.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8cff56d419d2..6ee68c003d94 100644 --- a/.travis.yml +++ b/.travis.yml @@ -197,6 +197,7 @@ matrix: env: # - PYTHON=3.6 - LINUX_WHEELS=1 LINUX_JARS=1 + - DOCKER_BUILD_PY37=1 - PYTHONWARNINGS=ignore - RAY_INSTALL_JAVA=1 language: java @@ -493,7 +494,7 @@ deploy: on: repo: ray-project/ray all_branches: true - condition: $LINUX_WHEELS = 1 || $MAC_WHEELS = 1 + condition: ($LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37=1) || $MAC_WHEELS = 1 - provider: s3 edge: true # This supposedly opts in to deploy v2. @@ -509,7 +510,7 @@ deploy: on: branch: master repo: ray-project/ray - condition: $LINUX_WHEELS = 1 || $MAC_WHEELS = 1 + condition: ($LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37=1) || $MAC_WHEELS = 1 - provider: script edge: true # This supposedly opts in to deploy v2. 
@@ -518,7 +519,7 @@ deploy: on: repo: ray-project/ray all_branches: true - condition: $LINUX_WHEELS = 1 + condition: $LINUX_WHEELS = 1 && $DOCKER_BUILD_PY37 = 1 # Upload jars so that we can debug locally for every commit - provider: s3 @@ -560,4 +561,4 @@ deploy: on: repo: ray-project/ray all_branches: true - condition: $LINUX_WHEELS = 1 \ No newline at end of file + condition: $LINUX_WHEELS = 1 && $DOCKER_BUILD_PY36_38 = 1 From c21a79ae6e35bdc01b4b40d27f67502489d73390 Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Fri, 29 Jan 2021 12:38:06 -0800 Subject: [PATCH 102/245] [Object Spilling] 100GB shuffle release test (#13729) --- release/RELEASE_PROCESS.rst | 16 ++ release/data_processing_tests/README.rst | 9 + release/data_processing_tests/cluster.yaml | 128 +++++++++++++ .../workloads/streaming_shuffle.py | 177 ++++++++++++++++++ 4 files changed, 330 insertions(+) create mode 100644 release/data_processing_tests/README.rst create mode 100644 release/data_processing_tests/cluster.yaml create mode 100644 release/data_processing_tests/workloads/streaming_shuffle.py diff --git a/release/RELEASE_PROCESS.rst b/release/RELEASE_PROCESS.rst index 018f56bdf941..80afb3589316 100644 --- a/release/RELEASE_PROCESS.rst +++ b/release/RELEASE_PROCESS.rst @@ -148,6 +148,22 @@ is generally the easiest way to run release tests. Run the ``python/ray/tests/test_k8s_*`` to make sure K8s cluster launcher and operator works. Make sure the docker image is the released version. +6. **Data processing tests** + + .. code-block:: bash + + data_processing_tests/README.rst + + Follow the instructions to kick off the tests and check the status of the workloads. + Data processing tests make sure all the data processing features are reliable and performant. + The following tests should be run. + + - ``data_processing_tests/workloads/streaming_shuffle.py`` run the 100GB streaming shuffle in a single node & fake 4 nodes cluster. + + **IMPORTANT** Check if the workload scripts has terminated. 
If so, please record the result (both read/write bandwidth and the shuffle result) to the ``release_logs/data_processing_tests/[test_name]``. + Both shuffling runtime and read/write bandwidth shouldn't be decreasing more than 15% compared to the previous release. + + Identify and Resolve Release Blockers ------------------------------------- If a release blocking issue arises in the course of testing, you should diff --git a/release/data_processing_tests/README.rst b/release/data_processing_tests/README.rst new file mode 100644 index 000000000000..3db8eeb9ce67 --- /dev/null +++ b/release/data_processing_tests/README.rst @@ -0,0 +1,9 @@ +Running script +-------------- + +Run `unset RAY_ADDRESS; python workloads/streaming_shuffle.py` + +Cluster configurations +---------------------- + +Make sure the test runs in i3.8xl (IO optimized instance). \ No newline at end of file diff --git a/release/data_processing_tests/cluster.yaml b/release/data_processing_tests/cluster.yaml new file mode 100644 index 000000000000..903dd2564def --- /dev/null +++ b/release/data_processing_tests/cluster.yaml @@ -0,0 +1,128 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: native-shuffle-tests + +# The minimum number of workers nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 0 + +# The autoscaler will scale up the cluster faster with higher upscaling speed. +# E.g., if the task requires adding more nodes then autoscaler will gradually +# scale up the cluster in chunks of upscaling_speed*currently_running_nodes. +# This number should be > 0. +upscaling_speed: 1.0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. 
+docker: + image: "" # You can change this to latest-cpu if you don't need GPU support and want a faster startup + # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull + container_name: "" + # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image + # if no cached version is present. + pull_before_run: True + run_options: [] # Extra options to pass into "docker run" + + # Example of running a GPU head with CPU workers + # head_image: "rayproject/ray-ml:latest-gpu" + # Allow Ray to automatically detect GPUs + + # worker_image: "rayproject/ray-ml:latest-cpu" + # worker_run_options: [] + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. + # Nodes are currently spread between zones by a round-robin approach, + # however this implementation detail should not be relied upon. + availability_zone: us-west-2a,us-west-2b + # Whether to allow node reuse. If set to False, nodes will be terminated + # instead of stopped. + cache_stopped_nodes: True # If not present, the default is True. + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. 
+# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +head_node: + InstanceType: i3.8xlarge + ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 + + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 1000 + + # Additional options in the boto docs. + +# Provider-specific config for worker nodes, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +worker_nodes: + InstanceType: i3.8xlarge + ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30 + + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 1000 + +# Patterns for files to exclude when running rsync up or rsync down +rsync_exclude: + - "**/.git" + - "**/.git/**" + +# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for +# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided +# as a value, the behavior will match git's behavior for finding and using .gitignore files. +rsync_filter: + - ".gitignore" + +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + +# List of shell commands to run to set up nodes. +setup_commands: + - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl + # Not necessary. 
+ - sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 65535" >> /etc/security/limits.conf; echo "* hard nofile 65535" >> /etc/security/limits.conf;' + - pip install tqdm + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + # - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --system-config='{"automatic_object_spilling_enabled":true,"max_io_workers":1,"object_spilling_config":"{\"type\":\"filesystem\",\"params\":{\"directory_path\":\"/tmp/spill\"}}"}' + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + # - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/release/data_processing_tests/workloads/streaming_shuffle.py b/release/data_processing_tests/workloads/streaming_shuffle.py new file mode 100644 index 000000000000..903042bb9956 --- /dev/null +++ b/release/data_processing_tests/workloads/streaming_shuffle.py @@ -0,0 +1,177 @@ +import time +import json +import ray +import numpy as np +from typing import List +from tqdm import tqdm + +from ray.cluster_utils import Cluster + +num_nodes = 4 +num_cpus = 4 +partition_size = int(500e6) # 500MB +# Number of map & reduce tasks == num_partitions. +# Number of objects == num_partitions ^ 2. +num_partitions = 200 +# There are two int64 per row, so we divide by 8 * 2 bytes. 
+rows_per_partition = partition_size // (8 * 2) +object_store_size = 20 * 1024 * 1024 * 1024 # 20G + +system_config = { + "automatic_object_spilling_enabled": True, + "max_io_workers": 1, + "object_spilling_config": json.dumps( + { + "type": "filesystem", + "params": { + "directory_path": "/tmp/spill" + } + }, + separators=(",", ":")) +} + + +def display_spilling_info(address): + state = ray.state.GlobalState() + state._initialize_global_state(address, + ray.ray_constants.REDIS_DEFAULT_PASSWORD) + raylet = state.node_table()[0] + memory_summary = ray.internal.internal_api.memory_summary( + raylet["NodeManagerAddress"], raylet["NodeManagerPort"]) + for line in memory_summary.split("\n"): + if "Spilled" in line: + print(line) + if "Restored" in line: + print(line) + print("\n\n") + + +@ray.remote +class Counter: + def __init__(self): + self.num_map = 0 + self.num_reduce = 0 + + def inc(self): + self.num_map += 1 + # print("Num map tasks finished", self.num_map) + + def inc2(self): + self.num_reduce += 1 + # print("Num reduce tasks finished", self.num_reduce) + + def finish(self): + pass + + +# object store peak memory: O(partition size / num partitions) +# heap memory: O(partition size / num partitions) +@ray.remote(num_returns=num_partitions) +def shuffle_map_streaming( + i, counter_handle=None) -> List["ObjectRef[np.ndarray]"]: + outputs = [ + ray.put( + np.ones((rows_per_partition // num_partitions, 2), dtype=np.int64)) + for _ in range(num_partitions) + ] + counter_handle.inc.remote() + return outputs + + +# object store peak memory: O(partition size / num partitions) +# heap memory: O(partition size) -- TODO can be reduced too +@ray.remote +def shuffle_reduce_streaming(*inputs, counter_handle=None) -> np.ndarray: + out = None + for chunk in inputs: + if out is None: + out = ray.get(chunk) + else: + out = np.concatenate([out, ray.get(chunk)]) + counter_handle.inc2.remote() + return out + + +shuffle_map = shuffle_map_streaming +shuffle_reduce = 
shuffle_reduce_streaming + + +def run_shuffle(): + counter = Counter.remote() + start = time.time() + print("start map") + shuffle_map_out = [ + shuffle_map.remote(i, counter_handle=counter) + for i in range(num_partitions) + ] + # wait until all map is done before reduce phase. + for out in tqdm(shuffle_map_out): + ray.get(out) + + # Start reducing + shuffle_reduce_out = [ + shuffle_reduce.remote( + *[shuffle_map_out[i][j] for i in range(num_partitions)], + counter_handle=counter) for j in range(num_partitions) + ] + + print("start shuffle.") + pbar = tqdm(total=num_partitions) + total_rows = 0 + ready, unready = ray.wait(shuffle_reduce_out) + while unready: + ready, unready = ray.wait(unready) + for output in ready: + pbar.update(1) + total_rows += ray.get(output).shape[0] + delta = time.time() - start + + ray.get(counter.finish.remote()) + print("Shuffled", total_rows * 8 * 2, "bytes in", delta, + "seconds in a single node.\n") + + +def run_single_node(): + address = ray.init( + num_cpus=num_cpus * num_nodes, + object_store_memory=object_store_size, + _system_config=system_config) + + # Run shuffle. + print( + "\n\nTest streaming shuffle with a single node.\n" + f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}" + "GB") + run_shuffle() + time.sleep(5) + display_spilling_info(address["redis_address"]) + ray.shutdown() + time.sleep(5) + + +def run_multi_nodes(): + c = Cluster() + c.add_node( + num_cpus=4, + object_store_memory=object_store_size, + _system_config=system_config) + ray.init(address=c.address) + for _ in range(num_nodes - 1): # subtract a head node. + c.add_node(num_cpus=4, object_store_memory=object_store_size) + c.wait_for_nodes() + + # Run shuffle. 
+ print( + f"\n\nTest streaming shuffle with {num_nodes} nodes.\n" + f"Shuffle size: {partition_size * num_partitions / 1024 / 1024 / 1024}" + "GB") + run_shuffle() + time.sleep(5) + display_spilling_info(c.address) + ray.shutdown() + c.shutdown() + time.sleep(5) + + +run_single_node() +run_multi_nodes() From 9441f85e1aac0201345b2c936db22382e1555c68 Mon Sep 17 00:00:00 2001 From: Barak Michener Date: Fri, 29 Jan 2021 12:58:41 -0800 Subject: [PATCH 103/245] [client] Hook runtime context (#13750) Change-Id: I701d21e53900b5f3fb0e23e09f59e8316c7ba623 --- python/ray/runtime_context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/ray/runtime_context.py b/python/ray/runtime_context.py index fa922cfa0267..fed3ab132ae0 100644 --- a/python/ray/runtime_context.py +++ b/python/ray/runtime_context.py @@ -1,5 +1,6 @@ import ray.worker import logging +from ray._private.client_mode_hook import client_mode_hook logger = logging.getLogger(__name__) @@ -149,6 +150,7 @@ def should_capture_child_tasks_in_placement_group(self): _runtime_context = None +@client_mode_hook def get_runtime_context(): global _runtime_context if _runtime_context is None: From 50808024eb1a93da879a7f84f2d89c3bb6328348 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 29 Jan 2021 15:43:01 -0800 Subject: [PATCH 104/245] Revert "[autoscaler] Better validation for min_workers and max_workers (#13779)" (#13807) This reverts commit 4d6817c6832f64ae7340fb62989eb28b7c1ff3d1. 
--- python/ray/autoscaler/_private/util.py | 8 -------- python/ray/tests/test_autoscaler_yaml.py | 25 ------------------------ 2 files changed, 33 deletions(-) diff --git a/python/ray/autoscaler/_private/util.py b/python/ray/autoscaler/_private/util.py index 32758dec649f..2bd1e13e9c38 100644 --- a/python/ray/autoscaler/_private/util.py +++ b/python/ray/autoscaler/_private/util.py @@ -86,14 +86,6 @@ def validate_config(config: Dict[str, Any]) -> None: raise ValueError( "`head_node_type` must be one of `available_node_types`.") - sum_min_workers = sum( - config["available_node_types"][node_type].get("min_workers", 0) - for node_type in config["available_node_types"]) - if sum_min_workers > config["max_workers"]: - raise ValueError( - "The specified global `max_workers` is smaller than the " - "sum of `min_workers` of all the available node types.") - def prepare_config(config): with_defaults = fillout_defaults(config) diff --git a/python/ray/tests/test_autoscaler_yaml.py b/python/ray/tests/test_autoscaler_yaml.py index e5220771f389..b712c8955e97 100644 --- a/python/ray/tests/test_autoscaler_yaml.py +++ b/python/ray/tests/test_autoscaler_yaml.py @@ -45,31 +45,6 @@ def testValidateDefaultConfig(self): except Exception: self.fail("Config did not pass validation test!") - def testValidateDefaultConfigMinMaxWorkers(self): - aws_config_path = os.path.join( - RAY_PATH, "autoscaler/aws/example-multi-node-type.yaml") - with open(aws_config_path) as f: - config = yaml.safe_load(f) - config = prepare_config(config) - for node_type in config["available_node_types"]: - config["available_node_types"][node_type]["resources"] = config[ - "available_node_types"][node_type].get("resources", {}) - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") - - config["max_workers"] = 0 # the sum of min_workers is 1. - with pytest.raises(ValueError): - validate_config(config) - - # make sure edge case of exactly 1 passes too. 
- config["max_workers"] = 1 - try: - validate_config(config) - except Exception: - self.fail("Config did not pass validation test!") - @pytest.mark.skipif( sys.platform.startswith("win"), reason="TODO(ameer): fails on Windows.") From 194656731dea7c22deeadf12ebd4a21bffefac26 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 29 Jan 2021 15:47:21 -0800 Subject: [PATCH 105/245] [CI] Deflake test_basics and skip test_component_failures_3 (#13801) --- ci/travis/ci.sh | 3 + python/ray/tests/BUILD | 4 +- python/ray/tests/test_basic.py | 126 +-------------------------- python/ray/tests/test_basic_3.py | 142 +++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 126 deletions(-) create mode 100644 python/ray/tests/test_basic_3.py diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 82286c8c211c..2527a4c5b1cb 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -145,8 +145,11 @@ test_python() { -python/ray/tests:test_advanced_3 # test_invalid_unicode_in_worker_log() fails on Windows -python/ray/tests:test_autoscaler_aws -python/ray/tests:test_component_failures + -python/ray/tests:test_component_failures_3 # timeout -python/ray/tests:test_basic_2 # hangs on shared cluster tests -python/ray/tests:test_basic_2_client_mode + -python/ray/tests:test_basic_3 # timeout + -python/ray/tests:test_basic_3_client_mode -python/ray/tests:test_cli -python/ray/tests:test_failure -python/ray/tests:test_global_gc diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 97980a641a4a..6bb68b8543cb 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -23,6 +23,7 @@ py_test_module_list( "test_autoscaling_policy.py", "test_basic.py", "test_basic_2.py", + "test_basic_3.py", "test_cancel.py", "test_cli.py", "test_component_failures_2.py", @@ -174,11 +175,12 @@ py_test_module_list( "test_advanced.py", "test_basic.py", "test_basic_2.py", + "test_basic_3.py", ], size = "medium", extra_srcs = SRCS, name_suffix = "_client_mode", - # TODO(barakmich): py_test will 
support env in Bazel 4.0.0... + # TODO(barakmich): py_test will support env in Bazel 4.0.0... # Until then, we can use tags. #env = {"RAY_CLIENT_MODE": "1"}, tags = ["exclusive", "client_tests"], diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 4475bb6ea464..e33af42deb46 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -9,11 +9,7 @@ import pytest import ray.cluster_utils -from ray.test_utils import ( - client_test_enabled, - dicts_equal, - wait_for_pid_to_exit, -) +from ray.test_utils import (client_test_enabled) import ray @@ -170,126 +166,6 @@ class A2: x = 1 -def test_many_fractional_resources(shutdown_only): - ray.init(num_cpus=2, num_gpus=2, resources={"Custom": 2}) - - @ray.remote - def g(): - return 1 - - @ray.remote - def f(block, accepted_resources): - true_resources = { - resource: value[0][1] - for resource, value in ray.get_resource_ids().items() - } - if block: - ray.get(g.remote()) - return dicts_equal(true_resources, accepted_resources) - - # Check that the resource are assigned correctly. 
- result_ids = [] - for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)): - resource_set = {"CPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_cpus=rand1)) - - resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000} - result_ids.append(f._remote([False, resource_set], num_gpus=rand1)) - - resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000} - result_ids.append( - f._remote([False, resource_set], resources={"Custom": rand1})) - - resource_set = { - "CPU": int(rand1 * 10000) / 10000, - "GPU": int(rand2 * 10000) / 10000, - "Custom": int(rand3 * 10000) / 10000 - } - result_ids.append( - f._remote( - [False, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3})) - result_ids.append( - f._remote( - [True, resource_set], - num_cpus=rand1, - num_gpus=rand2, - resources={"Custom": rand3})) - assert all(ray.get(result_ids)) - - # Check that the available resources at the end are the same as the - # beginning. - stop_time = time.time() + 10 - correct_available_resources = False - while time.time() < stop_time: - available_resources = ray.available_resources() - if ("CPU" in available_resources - and ray.available_resources()["CPU"] == 2.0 - and "GPU" in available_resources - and ray.available_resources()["GPU"] == 2.0 - and "Custom" in available_resources - and ray.available_resources()["Custom"] == 2.0): - correct_available_resources = True - break - if not correct_available_resources: - assert False, "Did not get correct available resources." - - -def test_background_tasks_with_max_calls(shutdown_only): - ray.init(num_cpus=2) - - @ray.remote - def g(): - time.sleep(.1) - return 0 - - @ray.remote(max_calls=1, max_retries=0) - def f(): - return [g.remote()] - - nested = ray.get([f.remote() for _ in range(10)]) - - # Should still be able to retrieve these objects, since f's workers will - # wait for g to finish before exiting. 
- ray.get([x[0] for x in nested]) - - @ray.remote(max_calls=1, max_retries=0) - def f(): - return os.getpid(), g.remote() - - nested = ray.get([f.remote() for _ in range(10)]) - while nested: - pid, g_id = nested.pop(0) - ray.get(g_id) - del g_id - wait_for_pid_to_exit(pid) - - -@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") -def test_fair_queueing(shutdown_only): - ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1}) - - @ray.remote - def h(): - return 0 - - @ray.remote - def g(): - return ray.get(h.remote()) - - @ray.remote - def f(): - return ray.get(g.remote()) - - # This will never finish without fair queueing of {f, g, h}: - # https://github.com/ray-project/ray/issues/3644 - ready, _ = ray.wait( - [f.remote() for _ in range(1000)], timeout=60.0, num_returns=1000) - assert len(ready) == 1000, len(ready) - - def test_put_get(shutdown_only): ray.init(num_cpus=0) diff --git a/python/ray/tests/test_basic_3.py b/python/ray/tests/test_basic_3.py new file mode 100644 index 000000000000..3b4b7ac9493a --- /dev/null +++ b/python/ray/tests/test_basic_3.py @@ -0,0 +1,142 @@ +# coding: utf-8 +import logging +import os +import sys +import time + +import numpy as np +import pytest + +import ray.cluster_utils +from ray.test_utils import ( + dicts_equal, + wait_for_pid_to_exit, +) + +import ray + +logger = logging.getLogger(__name__) + + +def test_many_fractional_resources(shutdown_only): + ray.init(num_cpus=2, num_gpus=2, resources={"Custom": 2}) + + @ray.remote + def g(): + return 1 + + @ray.remote + def f(block, accepted_resources): + true_resources = { + resource: value[0][1] + for resource, value in ray.get_resource_ids().items() + } + if block: + ray.get(g.remote()) + return dicts_equal(true_resources, accepted_resources) + + # Check that the resource are assigned correctly. 
+ result_ids = [] + for rand1, rand2, rand3 in np.random.uniform(size=(100, 3)): + resource_set = {"CPU": int(rand1 * 10000) / 10000} + result_ids.append(f._remote([False, resource_set], num_cpus=rand1)) + + resource_set = {"CPU": 1, "GPU": int(rand1 * 10000) / 10000} + result_ids.append(f._remote([False, resource_set], num_gpus=rand1)) + + resource_set = {"CPU": 1, "Custom": int(rand1 * 10000) / 10000} + result_ids.append( + f._remote([False, resource_set], resources={"Custom": rand1})) + + resource_set = { + "CPU": int(rand1 * 10000) / 10000, + "GPU": int(rand2 * 10000) / 10000, + "Custom": int(rand3 * 10000) / 10000 + } + result_ids.append( + f._remote( + [False, resource_set], + num_cpus=rand1, + num_gpus=rand2, + resources={"Custom": rand3})) + result_ids.append( + f._remote( + [True, resource_set], + num_cpus=rand1, + num_gpus=rand2, + resources={"Custom": rand3})) + assert all(ray.get(result_ids)) + + # Check that the available resources at the end are the same as the + # beginning. + stop_time = time.time() + 10 + correct_available_resources = False + while time.time() < stop_time: + available_resources = ray.available_resources() + if ("CPU" in available_resources + and ray.available_resources()["CPU"] == 2.0 + and "GPU" in available_resources + and ray.available_resources()["GPU"] == 2.0 + and "Custom" in available_resources + and ray.available_resources()["Custom"] == 2.0): + correct_available_resources = True + break + if not correct_available_resources: + assert False, "Did not get correct available resources." + + +def test_background_tasks_with_max_calls(shutdown_only): + ray.init(num_cpus=2) + + @ray.remote + def g(): + time.sleep(.1) + return 0 + + @ray.remote(max_calls=1, max_retries=0) + def f(): + return [g.remote()] + + nested = ray.get([f.remote() for _ in range(10)]) + + # Should still be able to retrieve these objects, since f's workers will + # wait for g to finish before exiting. 
+ ray.get([x[0] for x in nested]) + + @ray.remote(max_calls=1, max_retries=0) + def f(): + return os.getpid(), g.remote() + + nested = ray.get([f.remote() for _ in range(10)]) + while nested: + pid, g_id = nested.pop(0) + ray.get(g_id) + del g_id + wait_for_pid_to_exit(pid) + + +@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") +def test_fair_queueing(shutdown_only): + ray.init(num_cpus=1, _system_config={"fair_queueing_enabled": 1}) + + @ray.remote + def h(): + return 0 + + @ray.remote + def g(): + return ray.get(h.remote()) + + @ray.remote + def f(): + return ray.get(g.remote()) + + # This will never finish without fair queueing of {f, g, h}: + # https://github.com/ray-project/ray/issues/3644 + ready, _ = ray.wait( + [f.remote() for _ in range(1000)], timeout=60.0, num_returns=1000) + assert len(ready) == 1000, len(ready) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) From a3796b3ed536194a6226a0a844a1249d067f7dd5 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 29 Jan 2021 15:48:02 -0800 Subject: [PATCH 106/245] [CI] Add other Travis Linux builds to buildkite (#13769) --- .buildkite/Dockerfile | 9 ++++ .buildkite/pipeline.yml | 27 +++++++++++ ci/travis/build-docker-images.py | 26 +++++------ ci/travis/ci.sh | 8 +++- ci/travis/determine_tests_to_run.py | 69 ++++++++++++++++++++++------- java/test.sh | 10 +++++ 6 files changed, 117 insertions(+), 32 deletions(-) diff --git a/.buildkite/Dockerfile b/.buildkite/Dockerfile index 86bd28148985..d20a9170f31d 100644 --- a/.buildkite/Dockerfile +++ b/.buildkite/Dockerfile @@ -2,6 +2,8 @@ FROM ubuntu:focal ARG REMOTE_CACHE_URL ARG BUILDKITE_PULL_REQUEST +ARG BUILDKITE_COMMIT +ARG BUILDKITE_PULL_REQUEST_BASE_BRANCH ENV DEBIAN_FRONTEND=noninteractive ENV TZ=America/Los_Angeles @@ -11,6 +13,9 @@ ENV CI=true ENV PYTHON=3.6 ENV RAY_USE_RANDOM_PORTS=1 ENV RAY_DEFAULT_BUILD=1 +ENV BUILDKITE_PULL_REQUEST=${BUILDKITE_PULL_REQUEST} +ENV BUILDKITE_COMMIT=${BUILDKITE_COMMIT} +ENV 
BUILDKITE_PULL_REQUEST_BASE_BRANCH=${BUILDKITE_PULL_REQUEST_BASE_BRANCH} RUN apt-get update -qq RUN apt-get install -y -qq \ @@ -37,3 +42,7 @@ WORKDIR /ray COPY . . RUN ./ci/travis/ci.sh init RUN bash --login -i ./ci/travis/ci.sh build + +# Run determine test to run +RUN bash --login -i -c "python ./ci/travis/determine_tests_to_run.py --output=json > affected_set.json" +RUN cat affected_set.json \ No newline at end of file diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ebfd96322ecf..00931f9ddd54 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,30 @@ +- label: ":book: Lint" + commands: + - export LINT=1 + - ./ci/travis/install-dependencies.sh + - ./ci/travis/ci.sh lint + - ./ci/travis/ci.sh build + +- label: ":java: Java" + commands: + - apt-get install -y openjdk-8-jdk maven clang-format + # Compile Java again so bazel will compile Java as a language. + - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build + - ./java/test.sh + +- label: ":java: Streaming" + commands: + - apt-get install -y openjdk-8-jdk maven + # Compile Java again so bazel will compile Java as a language. 
+ - RAY_INSTALL_JAVA=1 ./ci/travis/ci.sh build + - bazel test --config=ci $(./scripts/bazel_export_options) + //streaming:all + - bash streaming/src/test/run_streaming_queue_test.sh + +- label: ":cpp: Worker" + commands: + - ./ci/travis/ci.sh test_cpp + - label: ":cpp: Tests" commands: - bazel test --config=ci $(./scripts/bazel_export_options) diff --git a/ci/travis/build-docker-images.py b/ci/travis/build-docker-images.py index ad69a15dbcaa..8283f5c8fb0f 100644 --- a/ci/travis/build-docker-images.py +++ b/ci/travis/build-docker-images.py @@ -1,13 +1,12 @@ import datetime +import json import functools import glob import os import re -import runpy import shutil +import subprocess import sys -from contextlib import redirect_stdout -from io import StringIO from typing import List, Tuple import docker @@ -69,18 +68,15 @@ def _get_wheel_name(minor_version_number): def _docker_affected(): - result = StringIO() - with redirect_stdout(result): - runpy.run_path( - f"{_get_curr_dir()}/determine_tests_to_run.py", - run_name="__main__") - variable_definitions = result.getvalue().split() - env_var_dict = { - x.split("=")[0]: x.split("=")[1] - for x in variable_definitions - } - affected = env_var_dict["RAY_CI_DOCKER_AFFECTED"] == "1" or \ - env_var_dict["RAY_CI_PYTHON_DEPENDENCIES_AFFECTED"] == "1" + proc = subprocess.run( + [ + sys.executable, f"{_get_curr_dir()}/determine_tests_to_run.py", + "--output=json" + ], + capture_output=True) + affected_env_var_list = json.loads(proc.stdout) + affected = ("RAY_CI_DOCKER_AFFECTED" in affected_env_var_list or + "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED" in affected_env_var_list) print(f"Docker affected: {affected}") return affected diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 2527a4c5b1cb..e72380bdb8c6 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -357,9 +357,13 @@ lint_web() { ( cd "${WORKSPACE_DIR}"/python/ray/new_dashboard/client set +x # suppress set -x since it'll get very noisy here - . 
"${HOME}/.nvm/nvm.sh" + + if [ -z "${BUILDKITE-}" ]; then + . "${HOME}/.nvm/nvm.sh" + nvm use --silent node + fi + install_npm_project - nvm use --silent node local filenames # shellcheck disable=SC2207 filenames=($(find src -name "*.ts" -or -name "*.tsx")) diff --git a/ci/travis/determine_tests_to_run.py b/ci/travis/determine_tests_to_run.py index cba016fcf610..be37a29469cc 100644 --- a/ci/travis/determine_tests_to_run.py +++ b/ci/travis/determine_tests_to_run.py @@ -9,6 +9,7 @@ import subprocess import sys from pprint import pformat +import argparse def list_changed_files(commit_range): @@ -30,7 +31,44 @@ def list_changed_files(commit_range): return [s.strip() for s in out.decode().splitlines() if s is not None] +def is_pull_request(): + event_type = None + + for key in ["GITHUB_EVENT_NAME", "TRAVIS_EVENT_TYPE"]: + event_type = os.getenv(key, event_type) + + if (os.environ.get("BUILDKITE") + and os.environ.get("BUILDKITE_PULL_REQUEST") != "false"): + event_type = "pull_request" + + return event_type == "pull_request" + + +def get_commit_range(): + commit_range = None + + if os.environ.get("TRAVIS"): + commit_range = os.environ["TRAVIS_COMMIT_RANGE"] + elif os.environ.get("GITHUB_EVENT_PATH"): + with open(os.environ["GITHUB_EVENT_PATH"], "rb") as f: + event = json.loads(f.read()) + base = event["pull_request"]["base"]["sha"] + commit_range = "{}...{}".format(base, event.get("after", "")) + elif os.environ.get("BUILDKITE"): + commit_range = "{}...{}".format( + os.environ["BUILDKITE_PULL_REQUEST_BASE_BRANCH"], + os.environ["BUILDKITE_COMMIT"], + ) + + assert commit_range is not None + return commit_range + + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--output", type=str, help="json or envvars", default="envvars") + args = parser.parse_args() RAY_CI_TUNE_AFFECTED = 0 RAY_CI_SGD_AFFECTED = 0 @@ -50,20 +88,10 @@ def list_changed_files(commit_range): RAY_CI_DOC_AFFECTED = 0 RAY_CI_PYTHON_DEPENDENCIES_AFFECTED = 0 - 
event_type = None - for key in ["GITHUB_EVENT_NAME", "TRAVIS_EVENT_TYPE"]: - event_type = os.getenv(key, event_type) - - if event_type == "pull_request": - - commit_range = os.getenv("TRAVIS_COMMIT_RANGE") - if commit_range is None: - with open(os.environ["GITHUB_EVENT_PATH"], "rb") as f: - event = json.loads(f.read()) - base = event["pull_request"]["base"]["sha"] - commit_range = "{}...{}".format(base, event.get("after", "")) + if is_pull_request(): + commit_range = get_commit_range() files = list_changed_files(commit_range) - + print(pformat(commit_range), file=sys.stderr) print(pformat(files), file=sys.stderr) skip_prefix_list = [ @@ -187,7 +215,7 @@ def list_changed_files(commit_range): RAY_CI_ONLY_RLLIB_AFFECTED = 1 # Log the modified environment variables visible in console. - print(" ".join([ + output_string = " ".join([ "RAY_CI_TUNE_AFFECTED={}".format(RAY_CI_TUNE_AFFECTED), "RAY_CI_SGD_AFFECTED={}".format(RAY_CI_SGD_AFFECTED), "RAY_CI_ONLY_RLLIB_AFFECTED={}".format(RAY_CI_ONLY_RLLIB_AFFECTED), @@ -209,4 +237,15 @@ def list_changed_files(commit_range): "RAY_CI_DOCKER_AFFECTED={}".format(RAY_CI_DOCKER_AFFECTED), "RAY_CI_PYTHON_DEPENDENCIES_AFFECTED={}".format( RAY_CI_PYTHON_DEPENDENCIES_AFFECTED), - ])) + ]) + + # Debug purpose + print(output_string, file=sys.stderr) + + # Used by buildkite log format + if args.output.lower() == "json": + pairs = [item.split("=") for item in output_string.split(" ")] + affected_vars = [key for key, affected in pairs if affected == "1"] + print(json.dumps(affected_vars)) + else: + print(output_string) diff --git a/java/test.sh b/java/test.sh index 86afc719b5b0..a842194e67fb 100755 --- a/java/test.sh +++ b/java/test.sh @@ -16,6 +16,16 @@ pushd "$ROOT_DIR" mvn -T16 checkstyle:check popd +on_exit() { + exit_code=$? 
+ if [ $exit_code -ne 0 ]; then + echo "Exit trap, printing ray logs" + cat /tmp/ray/session_latest/logs/* + fi +} + +trap on_exit EXIT + run_testng() { local exit_code if "$@"; then From 30f82329e39b5cfae84589231eacbb8f84dcd2d9 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 29 Jan 2021 17:55:46 -0800 Subject: [PATCH 107/245] [core] Add debug information for the PullManager and LocalObjectManager (#13782) * Add debug info * Formatting. Co-authored-by: SangBin Cho --- src/ray/object_manager/object_manager.cc | 1 + src/ray/object_manager/pull_manager.cc | 12 ++++++++++++ src/ray/object_manager/pull_manager.h | 2 ++ src/ray/raylet/local_object_manager.cc | 18 +++++++++++++++++- src/ray/raylet/local_object_manager.h | 5 +++++ src/ray/raylet/node_manager.cc | 1 + src/ray/raylet/worker_pool.cc | 4 ++++ 7 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index ddd71c7665ab..448245e012ee 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -818,6 +818,7 @@ std::string ObjectManager::DebugString() const { result << "\n" << object_directory_->DebugString(); result << "\n" << store_notification_->DebugString(); result << "\n" << buffer_pool_.DebugString(); + result << "\n" << pull_manager_->DebugString(); return result.str(); } diff --git a/src/ray/object_manager/pull_manager.cc b/src/ray/object_manager/pull_manager.cc index f4920a8def92..9be63c7e1d64 100644 --- a/src/ray/object_manager/pull_manager.cc +++ b/src/ray/object_manager/pull_manager.cc @@ -424,4 +424,16 @@ void PullManager::Tick() { int PullManager::NumActiveRequests() const { return object_pull_requests_.size(); } +std::string PullManager::DebugString() const { + std::stringstream result; + result << "PullManager:"; + result << "\n- num bytes available for pulled objects: " << num_bytes_available_; + result << "\n- num bytes being pulled: " << 
num_bytes_being_pulled_; + result << "\n- num pull request bundles: " << pull_request_bundles_.size(); + result << "\n- num objects requested pull: " << object_pull_requests_.size(); + result << "\n- num objects actively being pulled: " + << active_object_pull_requests_.size(); + return result.str(); +} + } // namespace ray diff --git a/src/ray/object_manager/pull_manager.h b/src/ray/object_manager/pull_manager.h index 3a542fef7af2..b0c80e338597 100644 --- a/src/ray/object_manager/pull_manager.h +++ b/src/ray/object_manager/pull_manager.h @@ -100,6 +100,8 @@ class PullManager { /// The number of ongoing object pulls. int NumActiveRequests() const; + std::string DebugString() const; + private: /// A helper structure for tracking information about each ongoing object pull. struct ObjectPullRequest { diff --git a/src/ray/raylet/local_object_manager.cc b/src/ray/raylet/local_object_manager.cc index 9909beb76e55..9ebaf75a8088 100644 --- a/src/ray/raylet/local_object_manager.cc +++ b/src/ray/raylet/local_object_manager.cc @@ -32,6 +32,7 @@ void LocalObjectManager::PinObjects(const std::vector &object_ids, continue; } RAY_LOG(DEBUG) << "Pinning object " << object_id; + pinned_objects_size_ += object->GetSize(); pinned_objects_.emplace(object_id, std::move(object)); } } @@ -69,7 +70,10 @@ void LocalObjectManager::ReleaseFreedObject(const ObjectID &object_id) { if (automatic_object_deletion_enabled_) { spilled_object_pending_delete_.push(object_id); } - pinned_objects_.erase(object_id); + if (pinned_objects_.count(object_id)) { + pinned_objects_size_ -= pinned_objects_[object_id]->GetSize(); + pinned_objects_.erase(object_id); + } } // Try to evict all copies of the object from the cluster. 
@@ -237,6 +241,7 @@ void LocalObjectManager::SpillObjectsInternal( for (const auto &object_id : objects_to_spill) { auto it = objects_pending_spill_.find(object_id); RAY_CHECK(it != objects_pending_spill_.end()); + pinned_objects_size_ += it->second->GetSize(); pinned_objects_.emplace(object_id, std::move(it->second)); objects_pending_spill_.erase(it); } @@ -454,6 +459,17 @@ void LocalObjectManager::FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) stats->set_restored_objects_total(restored_objects_total_); } +std::string LocalObjectManager::DebugString() const { + std::stringstream result; + result << "LocalObjectManager:\n"; + result << "- num pinned objects: " << pinned_objects_.size() << "\n"; + result << "- pinned objects size: " << pinned_objects_size_ << "\n"; + result << "- num objects pending restore: " << objects_pending_restore_.size() << "\n"; + result << "- num objects pending spill: " << objects_pending_spill_.size() << "\n"; + result << "- num bytes pending spill: " << num_bytes_pending_spill_ << "\n"; + return result.str(); +} + }; // namespace raylet }; // namespace ray diff --git a/src/ray/raylet/local_object_manager.h b/src/ray/raylet/local_object_manager.h index c4f157d58019..57ef8d3a1673 100644 --- a/src/ray/raylet/local_object_manager.h +++ b/src/ray/raylet/local_object_manager.h @@ -136,6 +136,8 @@ class LocalObjectManager { /// \param Output parameter. void FillObjectSpillingStats(rpc::GetNodeStatsReply *reply) const; + std::string DebugString() const; + private: FRIEND_TEST(LocalObjectManagerTest, TestSpillObjectsOfSize); FRIEND_TEST(LocalObjectManagerTest, @@ -203,6 +205,9 @@ class LocalObjectManager { // Objects that are pinned on this node. absl::flat_hash_map> pinned_objects_; + // Total size of objects pinned on this node. + size_t pinned_objects_size_ = 0; + // Objects that were pinned on this node but that are being spilled. 
// These objects will be released once spilling is complete and the URL is // written to the object directory. diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 251e28e26aed..cbe287ef721d 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -2334,6 +2334,7 @@ std::string NodeManager::DebugString() const { for (auto &pair : cluster_resource_map_) { result << "\n" << pair.first.Hex() << ": " << pair.second.DebugString(); } + result << "\n" << local_object_manager_.DebugString(); result << "\n" << object_manager_.DebugString(); result << "\n" << gcs_client_->DebugString(); result << "\n" << worker_pool_.DebugString(); diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 4ed257f4602e..ff6083199d0a 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -1037,6 +1037,10 @@ std::string WorkerPool::DebugString() const { << " workers: " << entry.second.registered_workers.size(); result << "\n- num " << Language_Name(entry.first) << " drivers: " << entry.second.registered_drivers.size(); + result << "\n- num object spill callbacks queued: " + << entry.second.spill_io_worker_state.pending_io_tasks.size(); + result << "\n- num object restore queued: " + << entry.second.restore_io_worker_state.pending_io_tasks.size(); } result << "\n- num idle workers: " << idle_of_all_languages_.size(); return result.str(); From 4b60c388efb861bff31867583eeb6c41e882dddf Mon Sep 17 00:00:00 2001 From: Dominic Ming Date: Sat, 30 Jan 2021 10:42:16 +0800 Subject: [PATCH 108/245] [Dashboard] fix new dashboard entrance and some table problem (#13790) --- .../client/src/pages/dashboard/Dashboard.tsx | 11 ++++- dashboard/client/src/pages/job/index.tsx | 3 -- dashboard/client/src/pages/layout/index.tsx | 12 +----- dashboard/client/src/pages/node/index.tsx | 43 +------------------ 4 files changed, 11 insertions(+), 58 deletions(-) diff --git 
a/dashboard/client/src/pages/dashboard/Dashboard.tsx b/dashboard/client/src/pages/dashboard/Dashboard.tsx index d7eeaf936b45..07f266961451 100644 --- a/dashboard/client/src/pages/dashboard/Dashboard.tsx +++ b/dashboard/client/src/pages/dashboard/Dashboard.tsx @@ -35,6 +35,7 @@ const useDashboardStyles = makeStyles((theme: Theme) => "& > :not(:first-child)": { marginTop: theme.spacing(4), }, + position: "relative", }, tabs: { borderBottomColor: theme.palette.divider, @@ -106,8 +107,14 @@ const Dashboard: React.FC = () => { return (
Ray Dashboard - { driverIpAddress, isDead, driverPid, - state, timestamp, - namespaceId, }) => ( @@ -114,7 +112,6 @@ const JobList = () => { {dayjs(timestamp * 1000).format("YYYY/MM/DD HH:mm:ss")} - {namespaceId} ), )} diff --git a/dashboard/client/src/pages/layout/index.tsx b/dashboard/client/src/pages/layout/index.tsx index b484a29db646..bcaffafce6ec 100644 --- a/dashboard/client/src/pages/layout/index.tsx +++ b/dashboard/client/src/pages/layout/index.tsx @@ -77,16 +77,6 @@ const BasicLayout = ( Ray
Ray Dashboard - history.push("/summary")} - > - SUMMARY - history.push("/")} > - BACK TO LEGACY + BACK TO EXISTING DASHBOARD { } const { raylet, hostname, ip, cpu, mem, net, disk, logUrl } = node; - const { nodeId, state, brpcPort } = raylet; + const { nodeId, state } = raylet; return ( @@ -126,15 +123,6 @@ export const NodeCard = (props: { node: NodeDetail }) => { )} - - -